In [1]:
import pytesseract as pyt
import cv2 as cv
import re 
from pathlib import Path
import os
from dotenv import load_dotenv
import datetime as dt 


In [2]:
timestamp = dt.datetime.now().strftime("%Y%m%d%H%M%S")

In [3]:
def config():
    load_dotenv()

In [4]:
def read_business_card(input_path):
    """
    Read text information from a business card image using OCR (Optical Character Recognition).

    Args:
    - input_path (str): Path to the input business card image file.

    Returns:
    - str: Text extracted from the business card.
    """

    # Set the path to the Tesseract executable (replace 'tesseract_path' with the actual path)
    tesseract_path = os.getenv("TESSERACT_PATH")
    pyt.pytesseract.tesseract_cmd = tesseract_path

    img = cv.imread(cv.samples.findFile(input_path))
    img_rgb = cv.cvtColor(img, cv.COLOR_BGR2RGB)

    # Uncomment and customize the configuration if needed
    # custom_config = r'--oem 3 --psm 6'

    # Perform OCR on the input image to extract text
    card_text = pyt.image_to_string(img_rgb )  #, config=custom_config)

    return card_text

In [5]:
def extract_data(card_text):
    """
    Extract contact information from the text of a business card.

    Args:
    - card_text (str): Text extracted from a business card.

    Returns:
    - tuple: A tuple containing extracted information - phone numbers, emails, name, title, website.
    """

    # Define a regular expression pattern for websites
    url_pattern = re.compile(r'\b(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?\b')

    # Define a regular expression pattern for international phone numbers with seven or more digits
    phone_pattern = re.compile(r'\b(?:\+[\d\s-]+|\(\+\d+\)|\d{1,4}[-.\s]?)\d{1,9}[-.\s]?\d{1,9}[-.\s]?\d{1,9}[-.\s]?\d{1,9}\b')

    # Define a regular expression pattern for email addresses
    email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')

    # Extract websites, phone numbers, and emails from the card text
    websites = [site for site in re.findall(url_pattern, card_text) if 'www' in site]
    phone_numbers = [number for number in re.findall(phone_pattern, card_text) if len(number) > 7]
    emails = re.findall(email_pattern, card_text)

    # Extract name and title using a custom pattern
    name_parsing = r"^[\w'\-.][^0-9-!¡?+?¿/\\+=@#$%^&*(){}|~<>;:[\]]{2,}"
    name_title = re.findall(name_parsing, card_text)
    text_list = [word for line in name_title for word in line.split("\n") if len(word) > 3]

    # Assign extracted name and title
    name = text_list[0] if text_list else None
    title = text_list[1] if len(text_list) > 1 else None

    return phone_numbers, emails, name, title, websites

In [6]:
def create_vcf_file(name, phone_numbers, email, organization, filename=f'contact.vcf'):
    """
    Create a VCF (Virtual Contact File) with the provided information.

    Args:
    - first_name (str): First name of the contact.
    - last_name (str): Last name of the contact.
    - phone_number (str): Phone number of the contact.
    - email (str): Email address of the contact.
    - organization (str): Organization of the contact.
    - filename (str, optional): Name of the output VCF file. Defaults to 'contact.vcf'.
    """

    vcf_content = f"""BEGIN:VCARD
VERSION:3.0
FN:{name}
ORG:{organization}
TEL:{phone_numbers[0]}
EMAIL:{email[0]}
END:VCARD
"""

    print(vcf_content)
    with open(filename, 'r+') as vcf_file:
        vcf_file.write(vcf_content)

In [7]:
file = "cards/download-3.jpg"


In [8]:
card_text = read_business_card(file)

TesseractNotFoundError: r"C:\Users\katle\AppData\Local\Programs\Tesseract-OCR\tesseract.exe" is not installed or it's not in your PATH. See README file for more information.

In [None]:
print(card_text)

In [None]:
# Define a regular expression pattern for email addresses
email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')

# Find all matches in the text
email_addresses = email_pattern.findall(card_text)

# Print the extracted email addresses
print("Email Addresses:", email_addresses)

In [None]:
website_pattern = re.compile(r'\b(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?\b')

# Find all matches in the text
websites = website_pattern.findall(card_text)

# Print the extracted websites
print("Websites:", websites)

phone_pattern = re.compile(r'\b(?:\+[\d\s-]+|\(\+\d+\)|\d{1,4}[-.\s]?)\d{1,9}[-.\s]?\d{1,9}[-.\s]?\d{1,9}[-.\s]?\d{1,9}\b')
# Find all matches in the text
phone_numbers = phone_pattern.findall(card_text)

# Print the extracted phone numbers
print("Phone Numbers:", phone_numbers)

In [None]:
phone_numbers, emails, name, title, website = extract_data(card_text)

In [None]:
print(f"""
{phone_numbers}
{emails}
{name}
{title}
{website}
""")

In [None]:
create_vcf_file(name, phone_numbers, emails, organization=title)

In [None]:
import spacy

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

# Sample text from a business card
business_card_text = """
John Doe
CEO
Acme Corporation
"""

# Process the text with spaCy
doc = nlp(card_text)

# Extract named entities and categorize them
entities = {ent.text:ent.label_ for ent in doc.ents}

# Print the categorized entities
print(entities)