# Step 1: Install necessary libraries

In [1]:
!pip install spacy tqdm

Collecting spacy
  Downloading spacy-3.7.5-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting tqdm
  Using cached tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.0.10-cp312-cp312-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Using cached cymem-2.0.8-cp312-cp312-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Using cached preshed-3.0.9-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Downloading thinc-8.2.5-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Using cached wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collectin

# Step 2: Import necessary libraries

In [None]:
import spacy
from spacy.training.example import Example
from tqdm import tqdm
import json
import re

# Step 3: Prepare training data
# Collect and annotate your own data in the same format: each entity is a
# (start_char, end_char, label) tuple of character offsets into the text.

In [None]:
# Each example is (text, {"entities": [(start_char, end_char, label), ...]}).
# Offsets are 0-based character positions into the text with an exclusive end,
# so text[start_char:end_char] must be exactly the entity string (no leading
# space, no trailing comma). Spans that do not line up with token boundaries
# are ignored by spaCy during training.
TRAIN_DATA = [
    ("John Doe, Software Engineer at XYZ Corporation, phone: +254 711 123456, email: john.doe@example.com, website: www.johndoe.com, address: 123 Main St, Nairobi, Kenya",
     {"entities": [
         (0, 8, "PERSON"), (31, 46, "ORG"), (55, 70, "PHONE"), (79, 99, "EMAIL"), (110, 125, "WEBSITE"), (136, 163, "ADDRESS")
     ]}),
    ("Jane Smith, CEO of ABC Inc., phone: +254 722 987654, email: jane.smith@abc.com, website: www.abc.com, address: 456 Elm St, Mombasa, Kenya",
     {"entities": [
         (0, 10, "PERSON"), (19, 27, "ORG"), (36, 51, "PHONE"), (60, 78, "EMAIL"), (89, 100, "WEBSITE"), (111, 137, "ADDRESS")
     ]}),
    # Add more annotated examples here
]

# Save training data to a file for future use

In [None]:
# Persist the annotated examples as pretty-printed UTF-8 JSON.
# Note: JSON has no tuple type, so the entity tuples are stored as arrays.
with open("kenya_business_cards_train.json", mode="w", encoding="utf-8") as train_file:
    train_file.write(json.dumps(TRAIN_DATA, ensure_ascii=False, indent=4))

# Step 4: Create and train the custom NER model

In [None]:
def train_custom_ner(training_data, output_dir, n_iter=20, dropout=0.5):
    """Train a blank English spaCy NER pipeline and save it to disk.

    The whole function lives in a single cell: splitting the body across
    cells leaves `nlp`, `ner` and `output_dir` undefined at module level.

    Args:
        training_data: Sequence of (text, annotations) pairs, where
            annotations is a dict with an "entities" list of
            (start_char, end_char, label) items (tuples or lists).
        output_dir: Directory the trained pipeline is written to.
        n_iter: Number of passes over the training data.
        dropout: Dropout rate passed to `nlp.update`.

    Raises:
        ValueError: If `training_data` is empty.
    """
    import random  # local import so this cell does not depend on the imports cell

    # Start from a blank English pipeline.
    nlp = spacy.blank("en")

    # Make sure an NER component exists and that `ner` is bound either way.
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")
    else:
        ner = nlp.get_pipe("ner")

    # Register every label that appears in the annotations.
    for _, annotations in training_data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    # Build the Example objects once; they are reused on every iteration.
    examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in training_data
    ]
    if not examples:
        raise ValueError("training_data must contain at least one example")

    # initialize() replaces the deprecated begin_training() and returns the
    # optimizer; passing the examples lets components set themselves up
    # from real data.
    optimizer = nlp.initialize(lambda: examples)

    # Training loop
    for i in range(n_iter):
        # Shuffle so the model does not see the examples in a fixed order.
        random.shuffle(examples)
        losses = {}
        for example in tqdm(examples):
            nlp.update([example], drop=dropout, sgd=optimizer, losses=losses)
        print(f"Iteration {i+1}, Losses: {losses}")

    # Save the trained model
    nlp.to_disk(output_dir)

# Load training data from the saved file

In [None]:
# Read the annotated examples back from the JSON file written earlier.
# After the round trip the entity tuples come back as lists.
with open("kenya_business_cards_train.json", mode="r", encoding="utf-8") as train_file:
    TRAIN_DATA = json.loads(train_file.read())

# Train and save the custom NER model

In [None]:
# Train on the loaded examples and write the pipeline to ./custom_ner_model.
train_custom_ner(
    training_data=TRAIN_DATA,
    output_dir="./custom_ner_model",
)

# Step 5: Load the trained model and use it for entity extraction

In [None]:
# Load the pipeline from ./custom_ner_model into the module-level `nlp`,
# which extract_entities() below reads.
nlp = spacy.load("./custom_ner_model")

# Patterns are compiled once here instead of on every call.
# Kenyan numbers in international format: +254, 3 digits, 6 digits.
_PHONE_RE = re.compile(r'\+254\s?\d{3}\s?\d{6}')
# The TLD class is letters only (a "|" inside [...] is a literal pipe) and
# has no upper length limit, so long TLDs are accepted.
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
# The lookarounds stop a match from starting or ending in the middle of a
# word or next to an "@", so pieces of e-mail addresses are not reported.
_WEBSITE_RE = re.compile(
    r'(?<![\w@.-])(?:https?://)?(?:www\.)?'
    r'[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*\.[A-Za-z]{2,}(?![\w@-])'
)


def extract_entities(text, model=None):
    """Extract business-card entities from `text`.

    The NER pipeline's predictions are collected first; PHONE, EMAIL and
    WEBSITE are then overwritten with regex matches whenever the
    corresponding pattern finds at least one match in the text.

    Args:
        text: Raw text to analyse.
        model: Callable returning an object with an `ents` iterable whose
            items expose `label_` and `text` (e.g. a loaded spaCy pipeline).
            Defaults to the module-level `nlp`.

    Returns:
        dict mapping label -> value. Model-predicted labels map to a single
        string (if a label occurs several times, the last occurrence wins);
        regex-validated PHONE / EMAIL / WEBSITE map to a list of strings.
        If a pattern finds nothing, any model prediction for that label is
        kept as-is.
    """
    if model is None:
        model = nlp  # fall back to the pipeline loaded at module level

    doc = model(text)
    entities = {ent.label_: ent.text for ent in doc.ents}

    # Validate and clean phone numbers
    phone_matches = _PHONE_RE.findall(text)
    if phone_matches:
        entities["PHONE"] = phone_matches

    # Validate and clean email addresses
    email_matches = _EMAIL_RE.findall(text)
    if email_matches:
        entities["EMAIL"] = email_matches

    # Validate and clean website URLs. E-mail addresses are blanked out
    # first so their local part / domain are never mistaken for a website.
    website_matches = _WEBSITE_RE.findall(_EMAIL_RE.sub(" ", text))
    if website_matches:
        entities["WEBSITE"] = website_matches

    return entities

# Example usage

In [None]:
# Sample business-card text run through the extractor; the resulting
# dictionary is printed.
text = (
    "Alice Johnson, CTO at Tech Solutions, "
    "phone: +254 711 123456, "
    "email: alice.johnson@techsolutions.com, "
    "website: www.techsolutions.co.ke, "
    "address: 789 Maple Ave, Nairobi, Kenya"
)
entities = extract_entities(text)
print(entities)