In [3]:
#libraries
import os
import pandas as pd
import PyPDF2
import kagglehub
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Download latest version
path = kagglehub.dataset_download("ayoubcherguelaine/company-documents-dataset")

print("Path to dataset files:", path)

Path to dataset files: /Users/triniroca/.cache/kagglehub/datasets/ayoubcherguelaine/company-documents-dataset/versions/1


In [5]:
#seeinG What files are within the dataset
files = os.listdir(path)
print("Files in the dataset:")
for file in files:
    print(file)
# Reading the CSV file


Files in the dataset:
CompanyDocuments
company-document-text.csv


In [6]:
df = pd.read_csv(os.path.join(path, "company-document-text.csv"))

print("Dataframe shape:", df.shape)
print("dataframe columns:", df.columns)
print("First few rows of the dataframe:", df.head())

Dataframe shape: (2676, 3)
dataframe columns: Index(['text', 'label', 'word_count'], dtype='object')
First few rows of the dataframe:                                                 text          label  \
0  order id  10718 shipping details  ship name  k...  ShippingOrder   
1  invoice order id  10707 customer id  arout ord...        invoice   
2  order id  10448 shipping details  ship name  r...  ShippingOrder   
3  invoice order id  11068 customer id  queen ord...        invoice   
4  order id  10656 shipping details  ship name  g...  ShippingOrder   

   word_count  
0         120  
1          66  
2          96  
3          68  
4         109  


In [7]:
# src/preprocessing.py
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

def preprocess_text(text):
    """Preprocess text by converting to lowercase, removing special characters, and normalizing spaces."""
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize spaces
    return text

def load_and_preprocess_data(dataset_path):
    """Load the dataset, preprocess the text, and split into train/test sets."""
    df = pd.read_csv(dataset_path)
    df['cleaned_text'] = df['text'].apply(preprocess_text)
    X = df['cleaned_text'].tolist()
    y = df['label'].tolist()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test, df

def vectorize_text(X_train, X_test, max_features=5000):
    """Convert text to TF-IDF features."""
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english', ngram_range=(1, 2))
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    return X_train_tfidf, X_test_tfidf, vectorizer

In [8]:
# Preprocessing function for classification
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize spaces
    return text

# Apply preprocessing
df['cleaned_text'] = df['text'].apply(preprocess_text)
print("Sample cleaned text:\n", df['cleaned_text'].iloc[0])

Sample cleaned text:
 order id shipping details ship name kniglich essen ship address maubelstr ship city brandenburg ship region western europe ship postal code ship country germany customer details customer id koene customer name kniglich essen employee details employee name nancy davolio shipper details shipper id shipper name federal shipping order details order date shipped date products product queso manchego la pastora quantity unit price total product pavlova quantity unit price total product inlagd sill quantity unit price total product tarte au sucre quantity unit price total total price total price


In [9]:

# Prepare data
X = df['cleaned_text'].tolist()
y = df['label'].tolist()

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train SVM classifier
classifier = SVC(kernel='linear')
classifier.fit(X_train_tfidf, y_train)

# Evaluate
y_pred = classifier.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
                 precision    recall  f1-score   support

 ShippingOrder       1.00      1.00      1.00       174
       invoice       1.00      1.00      1.00       171
purchase Order       1.00      1.00      1.00       151
        report       1.00      1.00      1.00        40

      accuracy                           1.00       536
     macro avg       1.00      1.00      1.00       536
  weighted avg       1.00      1.00      1.00       536



In [10]:
# Step 3: Extraction
def extract_invoice_info(text):
    info = {}
    info['Invoice Number'] = re.search(r'order id\s+(\d+)', text, re.IGNORECASE).group(1) if re.search(r'order id\s+(\d+)', text) else "Not found"
    info['Invoice Date'] = re.search(r'order date\s+(\d{4}-\d{2}-\d{2})', text, re.IGNORECASE).group(1) if re.search(r'order date\s+(\d{4}-\d{2}-\d{2})', text) else "Not found"
    info['Due Date'] = re.search(r'due date\s+(\d{4}-\d{2}-\d{2})', text, re.IGNORECASE).group(1) if re.search(r'due date\s+(\d{4}-\d{2}-\d{2})', text) else "Not found"
    info['Issuer Name'] = re.search(r'from\s+([A-Za-z\s]+)', text, re.IGNORECASE).group(1) if re.search(r'from\s+([A-Za-z\s]+)', text) else "Northwind Traders"
    info['Recipient Name'] = re.search(r'customer id\s+(\w+)', text, re.IGNORECASE).group(1) if re.search(r'customer id\s+(\w+)', text) else "Not found"
    info['Total Amount'] = re.search(r'TotalPrice\s+\$?(\d+\.?\d*)', text, re.IGNORECASE).group(1) if re.search(r'TotalPrice\s+\$?(\d+\.?\d*)', text) else "Not found"
    return info

In [11]:
# Step 4: End-to-End
def process_document(text):
    cleaned_text = preprocess_text(text)
    text_tfidf = vectorizer.transform([cleaned_text])
    category = classifier.predict(text_tfidf)[0]
    if category.lower() == 'invoice':
        return extract_invoice_info(text)
    return None

# Test
sample_text = df['text'].iloc[1]
result = process_document(sample_text)
print("Predicted Category:", classifier.predict(vectorizer.transform([preprocess_text(sample_text)]))[0])
print("Result:", result)

Predicted Category: invoice
Result: {'Invoice Number': '10707', 'Invoice Date': '2017-10-16', 'Due Date': 'Not found', 'Issuer Name': 'Northwind Traders', 'Recipient Name': 'arout', 'Total Amount': 'Not found'}
