In [1]:
# Install spaCy with the %pip magic so the package lands in the environment of
# the running kernel (a plain `!pip` may resolve to a different interpreter).
%pip install spacy



In [12]:
# Download the en_core_web_sm pipeline package that spacy.load("en_core_web_sm") needs.
# NOTE(review): `!python` runs whichever interpreter is first on PATH — confirm it is
# the same environment as this kernel.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
import spacy
from spacy import displacy
from spacy.tokens import DocBin

# Load the en_core_web_sm pipeline; `nlp.vocab` is later passed to
# DocBin.get_docs() when the converted annotations are read back.
nlp = spacy.load("en_core_web_sm")

In [40]:
# Convert ./formatted_annotations.json into a binary .spacy file in the current directory
# (the run recorded below produced formatted_annotations.spacy).
# NOTE(review): the input file is written by the annotation-formatting cell that appears
# further down in this notebook, so on a fresh "Run All" this cell executes before its
# input exists — confirm the intended cell order.
!python -m spacy convert ./formatted_annotations.json .

[38;5;2m✔ Generated output file (1 documents): formatted_annotations.spacy[0m


In [35]:
# Correct json format returned by NER Text Annotator
import json

# Path to the input JSON file
input_json_file_path = './annotations.json'

# Path to the output JSON file
output_json_file_path = './formatted_annotations.json'

# Load the input JSON file. The encoding is given explicitly so the result does
# not depend on the platform's default text encoding.
with open(input_json_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract classes and annotations
classes = data['classes']
annotations = data['annotations']

# Transform the data into the correct format.
# Each annotation is expected to look like [text, {"entities": [[start, end, label], ...]}].
# The first annotation is skipped and the remaining ones are numbered from 0;
# slicing (rather than building the first record and popping it afterwards)
# also keeps an empty annotation list from raising.
# NOTE(review): confirm the first entry really is a placeholder that should be
# dropped; if it is real data, iterate over all of `annotations` instead.
# NOTE(review): "sentences" is left empty, and the last cell of this notebook
# prints an empty document text after conversion — verify that `spacy convert`
# can rebuild documents from this layout.
formatted_data = []
for i, annotation in enumerate(annotations[1:]):
    text = annotation[0]
    ents = [
        {"start": entity[0], "end": entity[1], "label": entity[2]}
        for entity in annotation[1]['entities']
    ]
    formatted_data.append({
        "id": i,
        "paragraphs": [
            {
                "raw": text,
                "sentences": [],
                "entities": ents
            }
        ]
    })

# Save the formatted data to the output JSON file
with open(output_json_file_path, 'w', encoding='utf-8') as file:
    json.dump(formatted_data, file, indent=2)

print(f"Formatted JSON data saved to '{output_json_file_path}'")

Formatted JSON data saved to './formatted_annotations.json'


In [4]:
import pandas as pd

# CSV file that holds the dataset
dataset = "Internship Tracker - ML_Dataset.csv"

# Read the CSV into a DataFrame
data = pd.read_csv(dataset)

In [5]:
# Function to ensure each subject ends with punctuation
def ensure_punctuation(subject):
    """Return ``subject`` terminated by '.', '!' or '?'.

    A '.' is appended when the subject does not already end in one of those
    characters. Empty strings and non-string values (for example the float
    NaN that pandas uses for a missing CSV cell) have no last character to
    inspect, so they are returned as an empty string instead of raising.
    """
    if not isinstance(subject, str) or not subject:
        return ''
    if subject.endswith(('.', '!', '?')):
        return subject
    return subject + '.'

# Punctuate every email subject and glue the results into one space-separated string
text = " ".join(map(ensure_punctuation, data['email subject']))
print(text)

Jane Street Application Received. Thanks for applying to Stripe! Aaryaman, Thanks for Applying for Salesforce's Summer 2025 Intern - Software Engineer. Aaryaman, your application was sent to American Institute of Insurance Research. Your application was viewed by [P1] Games. Aaryaman, your application was sent to [P1] Games. Aaryaman, your application was sent to Vantiva. Aaryaman, your application was sent to Hangtight. Your application was viewed by TuneTunnel. Aaryaman, your application was sent to TuneTunnel. Your application was viewed by Rayca Precision. Your application to Software Engineer Intern at Boostability. Aaryaman, your application was sent to Rayca Precision. Aaryaman, your application was sent to Collabera. Aaryaman, your application was sent to Boostability. Your application to Software Engineer Intern at Scoper Inc. Your application to AI/ML Engineer Intern - Fall 2024 at Cranium. Aaryaman, your application was sent to Scoper. Aaryaman, your application was sent to 

In [6]:
# Persist the combined subjects. UTF-8 is set explicitly so non-ASCII characters
# are written the same way regardless of the platform's default encoding.
with open('email_subjects.txt', 'w', encoding='utf-8') as file:
    file.write(text)

In [38]:
# Read the converted annotations back from disk and materialise them as Doc objects.
annotations_file = "./formatted_annotations.spacy"
try:
    doc_bin = DocBin().from_disk(annotations_file)
except Exception as e:
    print(f"Error loading file: {e}")
    # Re-raise: continuing would either fail below with a NameError or silently
    # reuse a `doc_bin` left over from an earlier run of this cell.
    raise
else:
    print("File loaded successfully.")
docs = list(doc_bin.get_docs(nlp.vocab))
print(f"Number of documents: {len(docs)}")

File loaded successfully.
Number of documents: 1


In [39]:
# Show the text of every loaded document, followed by one line per entity span
for doc in docs:
    print(f"Document text: {doc.text}")
    for entity_line in (f"Entity: {ent.text}, Label: {ent.label_}" for ent in doc.ents):
        print(entity_line)

Document text: 
