In [None]:
import spacy
import pandas as pd
import random
from spacy.tokens import DocBin
from spacy.training import Example

# Load the annotated dataset: one row per entity span, using the columns
# "Text", "Entity", "Start" and "End" that are read below.
df = pd.read_csv("robust_ner_dataset.csv")

# Group every span under the sentence it annotates, so each distinct sentence
# becomes exactly one training example carrying all of its entities.
train_data = {}
for _, row in df.iterrows():
    # Cast the offsets to plain ints so a NaN/float offset fails right here
    # instead of somewhere deep inside spaCy.
    span = (int(row["Start"]), int(row["End"]), row["Entity"])
    spans = train_data.setdefault(row["Text"], [])
    # A repeated CSV row would otherwise add the same span twice, and spaCy
    # refuses to build an Example from overlapping/conflicting entities.
    if span not in spans:
        spans.append(span)

# Blank English pipeline (no pretrained weights) with a fresh NER component.
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner", last=True)

# Build one Example per sentence and register every label that occurs.
train_examples = []
for text, annotations in train_data.items():
    for _, _, label in annotations:
        ner.add_label(label)
    train_examples.append(
        Example.from_dict(nlp.make_doc(text), {"entities": annotations})
    )

# Fix the random seeds so weight initialisation, shuffling and dropout are
# repeatable between runs of this cell.
spacy.util.fix_random_seed(0)

# nlp.initialize() is the spaCy v3 replacement for the deprecated
# nlp.begin_training(); handing it the real examples lets the component
# set itself up from actual data rather than from a dummy document.
optimizer = nlp.initialize(lambda: train_examples)

# Training loop: 20 passes over the shuffled data in minibatches of 8.
for i in range(20):
    random.shuffle(train_examples)
    losses = {}  # reset per epoch so the printed value is that epoch's loss
    for batch in spacy.util.minibatch(train_examples, size=8):
        # Pass the optimizer explicitly so it is visibly the one being used.
        nlp.update(batch, sgd=optimizer, drop=0.5, losses=losses)
    print(f"Iteration {i + 1}, Loss: {losses}")

# Persist the trained pipeline to the "ner_model" directory.
nlp.to_disk("ner_model")
print("✅ Model trained and saved successfully to 'ner_model'")




Iteration 1, Loss: {'ner': 5358.079948465119}
Iteration 2, Loss: {'ner': 1681.4269673948427}
Iteration 3, Loss: {'ner': 460.15500259844}
Iteration 4, Loss: {'ner': 152.0473360897811}
Iteration 5, Loss: {'ner': 64.19709463435512}
Iteration 6, Loss: {'ner': 93.60510775640687}
Iteration 7, Loss: {'ner': 105.86006617743428}
Iteration 8, Loss: {'ner': 46.13107698003203}
Iteration 9, Loss: {'ner': 35.8642476377315}
Iteration 10, Loss: {'ner': 53.28397012907602}
Iteration 11, Loss: {'ner': 31.547786752176865}
Iteration 12, Loss: {'ner': 15.209717312421876}
Iteration 13, Loss: {'ner': 36.30806120420189}
Iteration 14, Loss: {'ner': 27.23342182632846}
Iteration 15, Loss: {'ner': 45.39037896548134}
Iteration 16, Loss: {'ner': 13.840635664313242}
Iteration 17, Loss: {'ner': 33.85730126477086}
Iteration 18, Loss: {'ner': 6.0111034149698055}
Iteration 19, Loss: {'ner': 19.099090215364843}
Iteration 20, Loss: {'ner': 40.43003746434827}
✅ Model trained and saved successfully to 'ner_model'


In [None]:
import spacy

# Restore the pipeline that the training cell wrote to disk.
nlp = spacy.load("ner_model")

# Run it over a sentence that was not part of the training data.
text = "Hey, my name is John and you can reach me at johndoe@gmail.com or +1-234-567-8901."
doc = nlp(text)

# Emit one "<surface text> (<label>)" line per predicted entity span.
for span in doc.ents:
    print("{} ({})".format(span.text, span.label_))


johndoe@gmail.com (EMAIL)
+1-234-567-8901 (PHONE)


In [None]:
import spacy
import pandas as pd
import random
from spacy.tokens import DocBin
from spacy.training import Example

# Load the dataset (v2): one row per entity span, using the columns
# "Text", "Entity", "Start" and "End" that are read below.
df = pd.read_csv("robust_ner_dataset_v2.csv")

# Blank English pipeline (no pretrained weights) with a fresh NER component.
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner", last=True)

# Register every distinct label found in the dataset.
labels = df['Entity'].unique()
for label in labels:
    ner.add_label(label)

# Collect all spans of a sentence together before building Examples.
# Building one Example per CSV row (a single span each) means that a sentence
# occurring in several rows is presented several times, each time with its
# other entities marked as "not an entity" -- contradictory supervision that
# keeps the loss from converging.
spans_by_text = {}
for _, row in df.iterrows():
    # Plain-int offsets: a NaN/float offset fails here rather than inside spaCy.
    span = (int(row["Start"]), int(row["End"]), row["Entity"])
    spans = spans_by_text.setdefault(row["Text"], [])
    # Skip exact duplicates; spaCy rejects overlapping/conflicting entities.
    if span not in spans:
        spans.append(span)

# One Example per distinct sentence, carrying all of its entity spans.
train_data = [
    Example.from_dict(nlp.make_doc(text), {"entities": spans})
    for text, spans in spans_by_text.items()
]

# Fix the random seeds so weight initialisation, shuffling and dropout are
# repeatable between runs of this cell.
spacy.util.fix_random_seed(0)

# nlp.initialize() is the spaCy v3 replacement for the deprecated
# nlp.begin_training(); it returns the optimizer used below.
optimizer = nlp.initialize(lambda: train_data)

# Training loop: 30 passes over the shuffled data in minibatches of 8.
for epoch in range(30):
    random.shuffle(train_data)
    losses = {}  # reset per epoch so the printed value is that epoch's loss
    for batch in spacy.util.minibatch(train_data, size=8):
        # Pass the optimizer explicitly so it is visibly the one being used.
        nlp.update(batch, sgd=optimizer, drop=0.5, losses=losses)
    print(f"Epoch {epoch + 1} - Losses: {losses}")

# Persist the trained pipeline to the "ner_model_v2" directory.
nlp.to_disk("ner_model_v2")
print("✅ Model trained and saved successfully as 'ner_model_v2'.")





Epoch 1 - Losses: {'ner': 7821.935389175606}
Epoch 2 - Losses: {'ner': 5808.348688117778}
Epoch 3 - Losses: {'ner': 5271.890501320382}
Epoch 4 - Losses: {'ner': 5150.9685091234205}
Epoch 5 - Losses: {'ner': 5026.79392125362}
Epoch 6 - Losses: {'ner': 4958.728948377951}
Epoch 7 - Losses: {'ner': 4912.868275149066}
Epoch 8 - Losses: {'ner': 4904.407866673563}
Epoch 9 - Losses: {'ner': 4854.817922480866}
Epoch 10 - Losses: {'ner': 4793.760577115431}
Epoch 11 - Losses: {'ner': 4819.236708221638}
Epoch 12 - Losses: {'ner': 4744.068016212728}
Epoch 13 - Losses: {'ner': 4733.538328401323}
Epoch 14 - Losses: {'ner': 4713.112160743171}
Epoch 15 - Losses: {'ner': 4738.225603871748}
Epoch 16 - Losses: {'ner': 4657.985158780498}
Epoch 17 - Losses: {'ner': 4672.909248579281}
Epoch 18 - Losses: {'ner': 4645.969814691186}
Epoch 19 - Losses: {'ner': 4643.908553548666}
Epoch 20 - Losses: {'ner': 4616.808977696144}
Epoch 21 - Losses: {'ner': 4598.307102296213}
Epoch 22 - Losses: {'ner': 4568.43502907007

In [None]:
import spacy

# Restore the v2 pipeline saved by the training cell.
nlp = spacy.load("ner_model_v2")

# Sample sentence used to exercise the model.
text = "John Doe works at OpenAI and can be reached at john.doe@example.com. His office is at 1234 Market St."
doc = nlp(text)

# Emit one "<surface text> (<label>)" line per predicted entity span.
for found in doc.ents:
    surface, tag = found.text, found.label_
    print(f"{surface} ({tag})")


In [None]:
import spacy
from spacy.training import Example
import random
import pandas as pd

# Load the dataset (v3): one row per entity span, using the columns
# "Text", "Entity", "Start" and "End" that are read below.
df = pd.read_csv("robust_ner_dataset_v3.csv")

# Start from the pretrained small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Reuse the pipeline's NER component, or add one if it is missing.
# In spaCy v3 add_pipe() takes the factory *name*; the v2 pattern of
# create_pipe() followed by add_pipe(component_object) raises an error.
ner_is_new = "ner" not in nlp.pipe_names
if ner_is_new:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

# Register every distinct label found in the dataset.
labels = df['Entity'].unique()
for label in labels:
    ner.add_label(label)

# Collect all spans of a sentence together before building Examples.
# One Example per CSV row (a single span each) would present a sentence that
# occurs in several rows multiple times, each time with its other entities
# marked as "not an entity" -- contradictory supervision.
spans_by_text = {}
for _, row in df.iterrows():
    # Plain-int offsets: a NaN/float offset fails here rather than inside spaCy.
    span = (int(row["Start"]), int(row["End"]), row["Entity"])
    spans = spans_by_text.setdefault(row["Text"], [])
    # Skip exact duplicates; spaCy rejects overlapping/conflicting entities.
    if span not in spans:
        spans.append(span)

# One Example per distinct sentence, carrying all of its entity spans.
# NOTE(review): tokens outside the listed spans count as non-entities, so
# pretrained labels that never appear in the CSV may be unlearned during
# fine-tuning -- confirm whether that is acceptable for this model.
train_data = [
    Example.from_dict(nlp.make_doc(text), {"entities": spans})
    for text, spans in spans_by_text.items()
]

# A component that was only just added has no weights yet; initialise it
# alone so the pretrained components keep theirs.
if ner_is_new:
    ner.initialize(lambda: train_data, nlp=nlp)

# Fix the random seeds so shuffling and dropout are repeatable between runs.
spacy.util.fix_random_seed(0)

# Optimizer for fine-tuning; it is passed to nlp.update() below so that this
# object is the one actually applied (previously it was created but unused).
optimizer = nlp.create_optimizer()

# Training loop: 50 passes over the shuffled data in minibatches of 8.
for epoch in range(50):
    random.shuffle(train_data)
    losses = {}  # reset per epoch so the printed value is that epoch's loss
    for batch in spacy.util.minibatch(train_data, size=8):
        nlp.update(batch, sgd=optimizer, drop=0.5, losses=losses)
    print(f"Epoch {epoch + 1} - Losses: {losses}")

# Persist the fine-tuned pipeline to the "fine_tuned_ner_model" directory.
nlp.to_disk("fine_tuned_ner_model")
print("✅ Fine-tuned model saved as 'fine_tuned_ner_model'.")




Epoch 1 - Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 6387.164383188184}
Epoch 2 - Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 5172.807850872055}
Epoch 3 - Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 4991.59218290751}
Epoch 4 - Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 4862.097522056457}
Epoch 5 - Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 4769.0139771610575}
Epoch 6 - Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 4780.2820279898615}
Epoch 7 - Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 4747.716757016137}
Epoch 8 - Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 4731.366480456195}
Epoch 9 - Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 4714.714872575713}
Epoch 10 - Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 4657.313722714297}
Epoch 11 - Losses: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 4670.7839210713

In [None]:
import spacy

# Restore the fine-tuned pipeline saved by the training cell.
nlp = spacy.load("fine_tuned_ner_model")

# Sample sentence used to exercise the model.
text = "John Doe works at OpenAI and can be reached at john.doe@example.com. His phone number is +1-234-567-8901."
doc = nlp(text)

# Emit one "<surface text> (<label>)" line per predicted entity span.
for hit in doc.ents:
    print("%s (%s)" % (hit.text, hit.label_))


john.doe@example.com (EMAIL)
