# Named Entity Recognition (NER) Script

In [None]:

import os
import sys
import pandas as pd
import pdfplumber
import spacy
from pathlib import Path


### Function to extract text from CSV

In [None]:

def extract_text_from_csv(file_path, text_columns=None):
    """
    Extract text from specified columns in a CSV file.

    The cells of each row are joined with single spaces, and the rows are
    then joined with single spaces into one string. Missing cells are
    skipped so they do not leak into the text as the literal word "nan".

    Args:
        file_path: Path of the CSV file, passed to ``pd.read_csv``.
        text_columns: A column name or a list of column names to read.
            If None (or empty), all object (string) columns are used.

    Returns:
        The combined text; an empty string when no text was found.

    On any error, prints a message and exits the process with status 1.
    """
    try:
        df = pd.read_csv(file_path)
        if text_columns:
            # Accept a bare column name as well as a list of names.
            if isinstance(text_columns, str):
                columns = [text_columns]
            else:
                columns = list(text_columns)
        else:
            # Extract from all object (string) columns
            columns = list(df.select_dtypes(include=['object']).columns)
        # Build each row's text from its non-missing cells only.
        row_texts = (
            ' '.join(str(value) for value in row if not pd.isna(value))
            for row in df[columns].itertuples(index=False, name=None)
        )
        # Rows with no usable cells contribute nothing (no stray spaces).
        return ' '.join(text for text in row_texts if text)
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        sys.exit(1)


### Function to extract text from Excel

In [None]:

def extract_text_from_excel(file_path, text_columns=None, sheet_name=0):
    """
    Extract text from specified columns in an Excel file.

    The cells of each row are joined with single spaces, and the rows are
    then joined with single spaces into one string. Missing cells are
    skipped so they do not leak into the text as the literal word "nan".

    Args:
        file_path: Path of the Excel file, passed to ``pd.read_excel``.
        text_columns: A column name or a list of column names to read.
            If None (or empty), all object (string) columns are used.
        sheet_name: Sheet to read, passed to ``pd.read_excel``.

    Returns:
        The combined text; an empty string when no text was found.

    On any error, prints a message and exits the process with status 1.
    """
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name)
        if text_columns:
            # Accept a bare column name as well as a list of names.
            if isinstance(text_columns, str):
                columns = [text_columns]
            else:
                columns = list(text_columns)
        else:
            # Extract from all object (string) columns
            columns = list(df.select_dtypes(include=['object']).columns)
        # Build each row's text from its non-missing cells only.
        row_texts = (
            ' '.join(str(value) for value in row if not pd.isna(value))
            for row in df[columns].itertuples(index=False, name=None)
        )
        # Rows with no usable cells contribute nothing (no stray spaces).
        return ' '.join(text for text in row_texts if text)
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        sys.exit(1)


### Function to extract text from PDF

In [None]:

def extract_text_from_pdf(file_path):
    """
    Read a PDF with pdfplumber and return the text of all its pages.

    Each page that yields text contributes that text followed by a newline;
    pages for which ``extract_text()`` returns nothing are skipped.

    On any error, prints a message and exits the process with status 1.
    """
    try:
        with pdfplumber.open(file_path) as document:
            # Pull the text of every page while the document is still open.
            extracted = [sheet.extract_text() for sheet in document.pages]
            return ''.join(content + '\n' for content in extracted if content)
    except Exception as err:
        print(f"Error reading PDF file: {err}")
        sys.exit(1)


### Perform Named Entity Recognition (NER) using spaCy

In [None]:

def perform_ner(text, nlp):
    """
    Run the given spaCy pipeline over ``text`` and collect its entities.

    Returns a list with one dict per entity in ``doc.ents``, holding the
    entity text ('Entity'), its label ('Label'), and its character offsets
    within ``text`` ('Start_char', 'End_char').
    """
    return [
        {
            'Entity': span.text,
            'Label': span.label_,
            'Start_char': span.start_char,
            'End_char': span.end_char,
        }
        for span in nlp(text).ents
    ]


### Save entities to a CSV file

In [None]:

def save_entities_to_csv(entities, output_file):
    """
    Write the entity records to ``output_file`` as CSV, without an index column.

    Prints a confirmation on success. On any error, prints a message and
    exits the process with status 1.
    """
    try:
        # Build the table and write it in one step.
        pd.DataFrame(entities).to_csv(output_file, index=False)
        print(f"Entities successfully saved to {output_file}")
    except Exception as err:
        print(f"Error saving entities to CSV: {err}")
        sys.exit(1)


### Main function to handle input and output

In [None]:

def main(input_file, output_file, text_columns=None, sheet_name=0):
    """
    Extract text from an input file, run spaCy NER on it, and save the result.

    Args:
        input_file: Path of a CSV, Excel (.xls/.xlsx), or PDF file.
        output_file: Path of the CSV file the entities are written to.
        text_columns: Columns to read from CSV/Excel input; passed through
            to the extraction function.
        sheet_name: Sheet to read from Excel input.

    Exits with status 1 when the input is missing, has an unsupported
    extension, yields no text, or the spaCy model cannot be downloaded.
    Exits with status 0, without writing a file, when no entities are found.
    """
    # Only needed for the model-download fallback, so imported locally.
    import subprocess

    input_path = Path(input_file)
    output_path = Path(output_file)

    if not input_path.exists():
        print(f"Input file {input_path} does not exist.")
        sys.exit(1)

    file_extension = input_path.suffix.lower()

    # Extract text based on file type
    if file_extension == '.csv':
        text = extract_text_from_csv(input_path, text_columns=text_columns)
    elif file_extension in ['.xls', '.xlsx']:
        text = extract_text_from_excel(input_path, text_columns=text_columns, sheet_name=sheet_name)
    elif file_extension == '.pdf':
        text = extract_text_from_pdf(input_path)
    else:
        print("Unsupported file type. Please provide a CSV, Excel, or PDF file.")
        sys.exit(1)

    if not text:
        print("No text extracted from the input file.")
        sys.exit(1)

    print("Text extraction complete. Performing Named Entity Recognition...")

    # Load spaCy model
    try:
        nlp = spacy.load('en_core_web_sm')
    except OSError:
        print("spaCy model 'en_core_web_sm' not found. Downloading now...")
        # Use the interpreter running this code, so the model is installed
        # into the same environment that spacy.load() searches. A bare
        # "python" could resolve to a different installation on PATH.
        download = subprocess.run(
            [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
            check=False,
        )
        if download.returncode != 0:
            print("Failed to download spaCy model 'en_core_web_sm'.")
            sys.exit(1)
        nlp = spacy.load('en_core_web_sm')

    # Perform NER
    entities = perform_ner(text, nlp)

    if not entities:
        print("No entities found in the text.")
        sys.exit(0)

    # Save entities to CSV
    save_entities_to_csv(entities, output_path)
