# Step 1: Install necessary libraries

In [1]:
!pip install spacy tqdm doccano-client pytesseract pillow

Collecting doccano-client
  Downloading doccano_client-1.2.8-py3-none-any.whl.metadata (2.8 kB)
Collecting pytesseract
  Using cached pytesseract-0.3.10-py3-none-any.whl.metadata (11 kB)
Collecting dataclasses-json<0.6.0,>=0.5.7 (from doccano-client)
  Downloading dataclasses_json-0.5.14-py3-none-any.whl.metadata (22 kB)
Collecting pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 (from spacy)
  Downloading pydantic-1.10.17-cp312-cp312-win_amd64.whl.metadata (153 kB)
     ---------------------------------------- 0.0/153.0 kB ? eta -:--:--
     --------- --------------------------- 41.0/153.0 kB 991.0 kB/s eta 0:00:01
     --------- --------------------------- 41.0/153.0 kB 991.0 kB/s eta 0:00:01
     -------------- ---------------------- 61.4/153.0 kB 409.6 kB/s eta 0:00:01
     -------------------------- --------- 112.6/153.0 kB 595.3 kB/s eta 0:00:01
     ------------------------------- ---- 133.1/153.0 kB 605.3 kB/s eta 0:00:01
     --------------------------------- -- 143.4/153.0 kB 500.5 kB/s 

# Step 2: Import necessary libraries

In [2]:
import json
import os
import random
import re

import pytesseract
import spacy
from PIL import Image
from spacy.training.example import Example
from tqdm import tqdm

# Set the path to the Tesseract executable

In [3]:
# Location of the Tesseract binary. Set the TESSERACT_CMD environment variable to override it
# (e.g. '/usr/local/bin/tesseract' on macOS/Linux); the fallback is the Windows path this
# notebook was originally written with.
TESSERACT_CMD = os.environ.get('TESSERACT_CMD', r'C:\Program Files\Tesseract-OCR\tesseract.exe')
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD


# Step 3: Extract text from business card images

In [4]:
def extract_text_from_image(image_path):
    """Run Tesseract OCR on a single image file and return the recognised text.

    Args:
        image_path: Path of the image file to open with Pillow.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for that image.
    """
    # The context manager releases the underlying file handle as soon as OCR is done,
    # instead of leaving it open until the image object is garbage-collected.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)

image_folder = 'images'
# Leading dots so that a name such as 'somepng' is not mistaken for an image file.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')
extracted_texts = []

# os.listdir returns entries in arbitrary order; sorting keeps the output reproducible.
for image_file in sorted(os.listdir(image_folder)):
    # Compare in lower case so 'CARD.JPG' is picked up as well as 'card.jpg'.
    if image_file.lower().endswith(IMAGE_EXTENSIONS):
        image_path = os.path.join(image_folder, image_file)
        text = extract_text_from_image(image_path)
        extracted_texts.append((image_file, text))


# Save extracted texts to a file

In [5]:
# Write one "File: <name>" header followed by the OCR text per image, separated by a blank line
with open('extracted_texts.txt', mode='w', encoding='utf-8') as f:
    for image_file, text in extracted_texts:
        record = f'File: {image_file}\n{text}\n\n'
        f.write(record)

# Step 4: Annotate the extracted text using Doccano
# Follow the Doccano setup steps (create a sequence-labeling project, import `extracted_texts.txt`, annotate the entities), then export the annotations as JSONL for Step 5


# Step 5: Convert Doccano annotations to spaCy format
# Load the Doccano JSONL export (replace the placeholder path below with the location of your exported file)

In [7]:
# Placeholder: point this at the JSONL file exported from Doccano before running the cell.
DOCCANO_EXPORT_PATH = 'path/to/doccano_export.jsonl'

with open(DOCCANO_EXPORT_PATH, 'r', encoding='utf-8') as f:
    # JSONL holds one JSON document per line; skip blank lines (e.g. a trailing newline)
    # because json.loads raises on an empty string.
    data = [json.loads(line) for line in f if line.strip()]

FileNotFoundError: [Errno 2] No such file or directory: 'path/to/doccano_export.jsonl'

# Convert to spaCy training format

In [None]:
# Build spaCy-style training tuples: (text, {"entities": [(start, end, label), ...]})
TRAIN_DATA = []
for entry in data:
    text = entry['text']
    # Depending on the Doccano version, the span list is exported under "labels" or "label";
    # accept either and treat a record without annotations as having no entities.
    spans = entry.get('labels') or entry.get('label') or []
    entities = [(span[0], span[1], span[2]) for span in spans]
    TRAIN_DATA.append((text, {"entities": entities}))

# Save TRAIN_DATA to a file for future use

In [None]:
# Persist the converted annotations as pretty-printed, non-ASCII-escaped JSON
with open("kenya_business_cards_train.json", mode="w", encoding="utf-8") as f:
    json.dump(TRAIN_DATA, fp=f, indent=4, ensure_ascii=False)

# Step 6: Train the custom NER model

In [None]:
def train_custom_ner(training_data, output_dir, n_iter=20, drop=0.5):
    """Train a blank English spaCy pipeline containing only an NER component and save it.

    Args:
        training_data: Iterable of ``(text, annotations)`` pairs, where ``annotations``
            is a dict whose ``"entities"`` value is a list of ``(start, end, label)`` spans.
        output_dir: Directory the trained pipeline is written to via ``nlp.to_disk``.
        n_iter: Number of passes over the training data (defaults to the original 20).
        drop: Dropout rate passed to ``nlp.update`` (defaults to the original 0.5).
    """
    # Create a blank Language class
    nlp = spacy.blank("en")

    # Bind `ner` in both branches; the original left it undefined when the pipe already existed.
    if "ner" in nlp.pipe_names:
        ner = nlp.get_pipe("ner")
    else:
        ner = nlp.add_pipe("ner")

    # Register every label that occurs in the annotations with the NER component
    for _, annotations in training_data:
        for ent in annotations.get("entities") or []:
            ner.add_label(ent[2])

    # `initialize` is the spaCy v3 replacement for the deprecated `begin_training`
    optimizer = nlp.initialize()

    # Shuffle a copy each epoch so the caller's list keeps its order
    examples = list(training_data)

    # Training loop
    for i in range(n_iter):
        random.shuffle(examples)
        losses = {}
        for text, annotations in tqdm(examples):
            example = Example.from_dict(nlp.make_doc(text), annotations)
            # Pass the optimizer explicitly so the one created above is the one being used
            nlp.update([example], sgd=optimizer, drop=drop, losses=losses)
        print(f"Iteration {i+1}, Losses: {losses}")

    # Save the trained model
    nlp.to_disk(output_dir)

# Load training data from the saved file

In [None]:
# Read the saved training set back; JSON turns the original tuples into lists
with open("kenya_business_cards_train.json", mode="r", encoding="utf-8") as f:
    TRAIN_DATA = json.loads(f.read())

# Train and save the custom NER model

In [None]:
# Train on the reloaded data and write the pipeline to ./custom_ner_model
train_custom_ner(training_data=TRAIN_DATA, output_dir="./custom_ner_model")

# Step 7: Load the trained model and use it for entity extraction

In [None]:
# Load the pipeline from the directory the training cell saved to; extract_entities reads this module-level `nlp`
nlp = spacy.load("./custom_ner_model")

# Patterns are compiled once when the cell runs rather than on every call.
_PHONE_RE = re.compile(r'(\+254\s?\d{3}\s?\d{6})')
# TLD class is `[A-Za-z]`; the earlier `[A-Z|a-z]` also accepted a literal "|".
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b')
_WEBSITE_RE = re.compile(r'\b(?:http[s]?://)?(?:www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,7}\b')


def extract_entities(text):
    """Extract entities from ``text`` with the loaded NER model plus regex rules.

    Args:
        text: Raw text (e.g. OCR output of a business card).

    Returns:
        A dict mapping each NER label to the text of an entity with that label
        (when a label occurs more than once, the last entity wins). If the
        regex rules find any matches, the ``"PHONE"``, ``"EMAIL"`` and
        ``"WEBSITE"`` keys are set to lists of the matched strings, replacing
        whatever the model produced under those labels.
    """
    doc = nlp(text)
    entities = {ent.label_: ent.text for ent in doc.ents}

    # Phone numbers in the "+254 ddd dddddd" form (spaces optional)
    phone_matches = _PHONE_RE.findall(text)
    if phone_matches:
        entities["PHONE"] = phone_matches

    # Email addresses
    email_matches = _EMAIL_RE.findall(text)
    if email_matches:
        entities["EMAIL"] = email_matches

    # Website URLs. Blank out email addresses first: otherwise the URL pattern also
    # matches pieces of them (e.g. "alice.johnson" and "techsolutions.com").
    text_without_emails = _EMAIL_RE.sub(" ", text)
    website_matches = _WEBSITE_RE.findall(text_without_emails)
    if website_matches:
        entities["WEBSITE"] = website_matches

    return entities

# Example usage

In [None]:
# Sample business-card text covering a phone number, an email address and a website
text = (
    "Alice Johnson, CTO at Tech Solutions, "
    "phone: +254 711 123456, "
    "email: alice.johnson@techsolutions.com, "
    "website: www.techsolutions.co.ke, "
    "address: 789 Maple Ave, Nairobi, Kenya"
)
entities = extract_entities(text)
print(entities)