# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import PyPDF2
import textract
import nltk
import re
import time
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from autocorrect import Speller
from PIL import Image
import string
from collections import Counter
from wordcloud import WordCloud
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from mlxtend.plotting import plot_confusion_matrix
np.bool = np.bool_

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Load Data

In [None]:
data_dir = "/mnt/hdd/Datasets/pdfs/RawData/"

In [None]:
folders = os.listdir(data_dir)
folders

In [None]:
# Read every PDF under each category folder and collect one text sample per
# file; the folder name is used as that sample's label.
data = []
labels = []

# Build the spell checker once: constructing it per file is loop-invariant work.
spell = Speller(lang="en")

for folder in folders:
    folder_path = os.path.join(data_dir, folder)
    # Skip stray files that sit next to the category folders; os.listdir on
    # them would raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        continue
    for file in os.listdir(folder_path):
        if not file.endswith(".pdf"):
            continue
        filename = os.path.join(folder_path, file)
        # The context manager closes the handle as soon as the pages are read,
        # so the loop no longer leaks one open file per document.
        with open(filename, "rb") as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # Only the first three pages (or fewer) are used.
            total_pages = min(len(pdf_reader.pages), 3)
            text = "".join(
                pdf_reader.pages[page_index].extract_text()
                for page_index in range(total_pages)
            )

        texts = spell(text)
        text_file = " ".join([word.lower() for word in word_tokenize(texts)])
        data.append(text_file)
        labels.append(folder)

In [None]:
df = pd.DataFrame({"text": data, "category": labels})
df.head()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df.category.value_counts()

In [None]:
# Horizontal bar chart of the number of documents per category.
plt.figure()
df.category.value_counts().plot(kind="barh")
# With kind="barh" the counts run along the x-axis and the categories along
# the y-axis, so the axis labels follow that orientation.
plt.xlabel("Count")
plt.ylabel("Category")
plt.title("Category Count")
plt.show()

In [None]:
def word_freq(category, column, top_n=15):
    """Plot and return the most frequent tokens of one document category.

    Reads the module-level ``df``, keeps the rows whose ``"category"`` equals
    *category*, lower-cases and tokenizes the strings in *column*, drops
    punctuation tokens and shows a bar chart of the most common tokens.

    Args:
        category: Value of ``df["category"]`` to select.
        column: Name of the text column to analyse.
        top_n: How many of the most common tokens to plot and return.

    Returns:
        A list of ``(token, count)`` tuples, most frequent first.
    """
    punctuation = set(string.punctuation)
    documents = df.loc[df["category"] == category, column]

    token_counts = Counter()
    for document in documents:
        token_counts.update(
            token
            for token in word_tokenize(document.lower())
            # Drop tokens made up solely of punctuation characters. A plain
            # `token not in string.punctuation` is a substring test and lets
            # multi-character tokens such as "``", "''" or "--" through.
            if not all(char in punctuation for char in token)
        )

    top_tokens = token_counts.most_common(top_n)
    words = [word for word, _ in top_tokens]
    counts = [count for _, count in top_tokens]

    plt.figure(figsize=(15, 5))
    plt.bar(words, counts)
    plt.title(f"TOP {top_n} WORDS in {category}")
    plt.xlabel("Word")
    plt.ylabel("Frequency")
    plt.show()

    return top_tokens

In [None]:
agreements_top = word_freq("Agreements", "text")

In [None]:
deeds_top = word_freq("Deeds", "text")

In [None]:
valuations_top = word_freq("Valuations", "text")

In [None]:
taxes_top = word_freq("Taxes", "text")

In [None]:
human_resources_top = word_freq("Human Resources", "text")

In [None]:
def print_wordcloud(freq_top, category):
    """Show a word cloud built from word frequencies for one category.

    Args:
        freq_top: Word/count data accepted by ``dict()``, e.g. the list of
            ``(word, count)`` tuples returned by ``word_freq``.
        category: Category name shown in the figure title.
    """
    frequencies = dict(freq_top)
    cloud = WordCloud(
        width=350,
        height=350,
        background_color="white",
        min_font_size=5,
    )
    cloud_image = cloud.generate_from_frequencies(frequencies)

    plt.figure()
    plt.title(f"TOP 15 Words in {category}")
    plt.imshow(cloud_image)
    # plt.axis("off") is left disabled, so the axes stay visible.
    plt.tight_layout(pad=0)
    plt.show()

In [None]:
print_wordcloud(agreements_top, "Agreements")

In [None]:
print_wordcloud(deeds_top, "Deeds")

In [None]:
print_wordcloud(valuations_top, "Valuations")

In [None]:
print_wordcloud(taxes_top, "Taxes")

In [None]:
print_wordcloud(human_resources_top, "Human Resources")

# Preprocess

In [None]:
stop_words = set(stopwords.words("english"))

In [None]:
def clean_text(text):
    """Normalize a document string for vectorization.

    Steps, in order: remove every character that is neither a word character
    nor whitespace, lower-case, remove digit runs, drop words found in the
    module-level ``stop_words`` set, drop words of two characters or fewer,
    and lemmatize what remains with WordNet.

    Args:
        text: Raw document text.

    Returns:
        The surviving lemmatized words joined by single spaces.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text)
    without_digits = re.sub(r"\d+", "", without_punctuation.lower())

    lemmatizer = WordNetLemmatizer()
    kept_words = []
    for word in without_digits.split():
        # Stop words are checked on the un-lemmatized form, as is the length.
        if word in stop_words or len(word) <= 2:
            continue
        kept_words.append(lemmatizer.lemmatize(word))

    return " ".join(kept_words).strip()

In [None]:
df["clean"] = df["text"].apply(clean_text)

In [None]:
df.head()

In [None]:
pre_agreements_top = word_freq("Agreements", "clean")

In [None]:
pre_deeds_top = word_freq("Deeds", "clean")

In [None]:
pre_valuations_top = word_freq("Valuations", "clean")

In [None]:
pre_taxes_top = word_freq("Taxes", "clean")

In [None]:
pre_human_resources_top = word_freq("Human Resources", "clean")

In [None]:
df.head()

In [None]:
df.shape

In [None]:
def label_encoder(column):
    """Encode a categorical column as integer labels.

    Fits a fresh ``LabelEncoder`` on *column*, prints the column's name
    together with the classes the encoder learned, and returns the encoded
    values.

    Args:
        column: Column to encode; must expose a ``name`` attribute.

    Returns:
        The integer-encoded values of *column*.
    """
    encoder = LabelEncoder()
    encoder.fit(column)
    print(column.name, encoder.classes_)
    encoded = encoder.transform(column)
    return encoded

In [None]:
df["label"] = label_encoder(df["category"])

In [None]:
# Derive the display names from the data instead of hard-coding them.
# LabelEncoder numbers the classes in sorted order of the unique values, so
# the sorted unique categories line up with the integer labels even when a
# category folder is added, renamed or empty.
class_names = sorted(df["category"].unique())

# Model

In [None]:
# Shuffle the rows. The seed makes the order reproducible across runs,
# consistent with the random_state=42 used for resampling and splitting.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
df.head()

In [None]:
X = df["clean"]
y = df["label"]

In [None]:
# Turn the cleaned documents into TF-IDF features, keeping at most 1000 terms
# and ignoring NLTK's English stop words.
english_stop_words = stopwords.words("english")
tfidf = TfidfVectorizer(
    stop_words=english_stop_words,
    max_features=1000,
)
X_tfidf = tfidf.fit_transform(X)

In [None]:
# Persist the fitted vectorizer. The context manager flushes and closes the
# file instead of leaving the handle to the garbage collector.
with open("tfidf.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

In [None]:
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_tfidf, y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Logistic Regression

In [None]:
# Fit a logistic-regression classifier and report how long fitting took.
logreg = LogisticRegression()
# perf_counter is the clock intended for measuring elapsed time; time.time()
# can jump when the system clock is adjusted.
start = time.perf_counter()
logreg.fit(X_train, y_train)
end = time.perf_counter()
logreg_time = end - start
print("Logistic Regression Train Time:", logreg_time)

In [None]:
# Persist the fitted model; the context manager closes the file handle.
with open("logreg.pkl", "wb") as logreg_file:
    pickle.dump(logreg, logreg_file)

In [None]:
# 3-fold cross-validated accuracy of a fresh logistic regression on the
# training split.
logreg_scores = cross_val_score(
    estimator=LogisticRegression(),
    X=X_train,
    y=y_train,
    cv=3,
)
print("Logistic Regression Cross-Validation Accuracy Scores:", logreg_scores)

In [None]:
# Accuracy of the fitted logistic regression on the train and test splits.
logreg_pred_train = logreg.predict(X_train)
logreg_pred_test = logreg.predict(X_test)

# accuracy_score takes (y_true, y_pred). The value is the same either way,
# but this order matches the classification_report and confusion_matrix calls.
logreg_train_score = accuracy_score(y_train, logreg_pred_train)
logreg_test_score = accuracy_score(y_test, logreg_pred_test)
print("Logistic Regression Train Score:", logreg_train_score)
print("Logistic Regression Test Score:", logreg_test_score)

In [None]:
print(classification_report(y_test, logreg_pred_test))

In [None]:
# Confusion matrix of the logistic regression on the test split, annotated
# with both absolute counts and normalized values.
logreg_cm = confusion_matrix(y_test, logreg_pred_test)
fig, ax = plot_confusion_matrix(
    conf_mat=logreg_cm,
    class_names=class_names,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
plt.title("Logistic Regression Confusion Matrix")
plt.show()

# Random Forest

In [None]:
# Fit a random forest and report how long fitting took.
# The seed makes the fitted forest (and the pickled model) reproducible,
# consistent with the random_state=42 used for resampling and splitting.
rf = RandomForestClassifier(random_state=42)
# perf_counter is the clock intended for measuring elapsed time; time.time()
# can jump when the system clock is adjusted.
start = time.perf_counter()
rf.fit(X_train, y_train)
end = time.perf_counter()
rf_time = end - start
print("Random Forest Train Time:", rf_time)

In [None]:
# Persist the fitted model; the context manager closes the file handle.
with open("rf.pkl", "wb") as rf_file:
    pickle.dump(rf, rf_file)

In [None]:
# 3-fold cross-validated accuracy of a fresh random forest on the training
# split. The estimator is seeded so the reported scores are reproducible.
rf_scores = cross_val_score(
    RandomForestClassifier(random_state=42), X_train, y_train, cv=3
)
print("Random Forest Cross-Validation Accuracy Scores:", rf_scores)

In [None]:
# Accuracy of the fitted random forest on the train and test splits.
rf_pred_train = rf.predict(X_train)
rf_pred_test = rf.predict(X_test)

# accuracy_score takes (y_true, y_pred). The value is the same either way,
# but this order matches the classification_report and confusion_matrix calls.
rf_train_score = accuracy_score(y_train, rf_pred_train)
rf_test_score = accuracy_score(y_test, rf_pred_test)
print("Random Forest Train Score:", rf_train_score)
print("Random Forest Test Score:", rf_test_score)

In [None]:
print(classification_report(y_test, rf_pred_test))

In [None]:
# Confusion matrix of the random forest on the test split, annotated with
# both absolute counts and normalized values.
rf_cm = confusion_matrix(y_test, rf_pred_test)
fig, ax = plot_confusion_matrix(
    conf_mat=rf_cm,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
    class_names=class_names,
)
# Title capitalized like the other models' confusion-matrix titles.
plt.title("Random Forest Confusion Matrix")
plt.show()

# Decision Tree

In [None]:
# Fit a decision tree and report how long fitting took.
# The seed makes the fitted tree (and the pickled model) reproducible,
# consistent with the random_state=42 used for resampling and splitting.
dt = DecisionTreeClassifier(random_state=42)
# perf_counter is the clock intended for measuring elapsed time; time.time()
# can jump when the system clock is adjusted.
start = time.perf_counter()
dt.fit(X_train, y_train)
end = time.perf_counter()
dt_time = end - start
print("Decision Tree Train Time:", dt_time)

In [None]:
# Persist the fitted model; the context manager closes the file handle.
with open("dt.pkl", "wb") as dt_file:
    pickle.dump(dt, dt_file)

In [None]:
# 3-fold cross-validated accuracy of a fresh decision tree on the training
# split. The estimator is seeded so the reported scores are reproducible.
dt_scores = cross_val_score(
    DecisionTreeClassifier(random_state=42), X_train, y_train, cv=3
)
print("Decision Tree Cross-Validation Accuracy Scores:", dt_scores)

In [None]:
# Accuracy of the fitted decision tree on the train and test splits.
dt_pred_train = dt.predict(X_train)
dt_pred_test = dt.predict(X_test)

# accuracy_score takes (y_true, y_pred). The value is the same either way,
# but this order matches the classification_report and confusion_matrix calls.
dt_train_score = accuracy_score(y_train, dt_pred_train)
dt_test_score = accuracy_score(y_test, dt_pred_test)
print("Decision Tree Train Score:", dt_train_score)
print("Decision Tree Test Score:", dt_test_score)

In [None]:
print(classification_report(y_test, dt_pred_test))

In [None]:
# Confusion matrix of the decision tree on the test split, annotated with
# both absolute counts and normalized values.
dt_cm = confusion_matrix(y_test, dt_pred_test)
fig, ax = plot_confusion_matrix(
    conf_mat=dt_cm,
    class_names=class_names,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
plt.title("Decision Tree Confusion Matrix")
plt.show()

# MultinomialNB

In [None]:
# Fit a multinomial naive Bayes classifier and report how long fitting took.
mnb = MultinomialNB()
# perf_counter is the clock intended for measuring elapsed time; time.time()
# can jump when the system clock is adjusted.
start = time.perf_counter()
mnb.fit(X_train, y_train)
end = time.perf_counter()
mnb_time = end - start
print("MultinomialNB Train Time:", mnb_time)

In [None]:
# Persist the fitted model; the context manager closes the file handle.
with open("mnb.pkl", "wb") as mnb_file:
    pickle.dump(mnb, mnb_file)

In [None]:
# 3-fold cross-validated accuracy of a fresh multinomial naive Bayes model
# on the training split.
mnb_scores = cross_val_score(
    estimator=MultinomialNB(),
    X=X_train,
    y=y_train,
    cv=3,
)
print("MultinomialNB Cross-Validation Accuracy Scores:", mnb_scores)

In [None]:
# Accuracy of the fitted naive Bayes model on the train and test splits.
mnb_pred_train = mnb.predict(X_train)
mnb_pred_test = mnb.predict(X_test)

# accuracy_score takes (y_true, y_pred). The value is the same either way,
# but this order matches the classification_report and confusion_matrix calls.
mnb_train_score = accuracy_score(y_train, mnb_pred_train)
mnb_test_score = accuracy_score(y_test, mnb_pred_test)
print("MultinomialNB Train Score:", mnb_train_score)
print("MultinomialNB Test Score:", mnb_test_score)

In [None]:
print(classification_report(y_test, mnb_pred_test))

In [None]:
# Confusion matrix of the naive Bayes model on the test split, annotated
# with both absolute counts and normalized values.
mnb_cm = confusion_matrix(y_test, mnb_pred_test)
fig, ax = plot_confusion_matrix(
    conf_mat=mnb_cm,
    class_names=class_names,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
plt.title("MultinomialNB Confusion Matrix")
plt.show()

# XGB

In [None]:
# Fit an XGBoost classifier and report how long fitting took.
xgb = XGBClassifier()
# perf_counter is the clock intended for measuring elapsed time; time.time()
# can jump when the system clock is adjusted.
start = time.perf_counter()
xgb.fit(X_train, y_train)
end = time.perf_counter()
xgb_time = end - start
print("XGB Train Time:", xgb_time)

In [None]:
# Persist the fitted model; the context manager closes the file handle.
with open("xgb.pkl", "wb") as xgb_file:
    pickle.dump(xgb, xgb_file)

In [None]:
# 3-fold cross-validated accuracy of a fresh XGBoost classifier on the
# training split.
xgb_scores = cross_val_score(
    estimator=XGBClassifier(),
    X=X_train,
    y=y_train,
    cv=3,
)
print("XGB Cross-Validation Accuracy Scores:", xgb_scores)

In [None]:
# Accuracy of the fitted XGBoost classifier on the train and test splits.
xgb_pred_train = xgb.predict(X_train)
xgb_pred_test = xgb.predict(X_test)

# accuracy_score takes (y_true, y_pred). The value is the same either way,
# but this order matches the classification_report and confusion_matrix calls.
xgb_train_score = accuracy_score(y_train, xgb_pred_train)
xgb_test_score = accuracy_score(y_test, xgb_pred_test)
print("XGB Train Score:", xgb_train_score)
print("XGB Test Score:", xgb_test_score)

In [None]:
print(classification_report(y_test, xgb_pred_test))

In [None]:
# Confusion matrix of the XGBoost classifier on the test split, annotated
# with both absolute counts and normalized values.
xgb_cm = confusion_matrix(y_test, xgb_pred_test)
fig, ax = plot_confusion_matrix(
    conf_mat=xgb_cm,
    class_names=class_names,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
plt.title("XGB Confusion Matrix")
plt.show()

In [None]:
labels = ["LR", "DT", "RF", "MNB", "XGB"]
scores = [logreg_test_score, dt_test_score, rf_test_score, mnb_test_score, xgb_test_score]

In [None]:
plt.figure()
ax = sns.barplot(x=labels, y=scores)
ax.bar_label(ax.containers[0])
plt.title("Trained Models Accuracy")
plt.show()

# Test

In [None]:
# Classify one unseen PDF using the same extraction, spell-correction,
# cleaning and vectorization steps that were applied to the training data.
test_agreement = "/mnt/hdd/Datasets/pdfs/agree.pdf"
# The context manager closes the PDF once its pages have been read.
with open(test_agreement, "rb") as test_agreement_file:
    pdf_reader = PyPDF2.PdfReader(test_agreement_file)
    # Only the first three pages (or fewer) are used, as in training.
    total_pages = min(len(pdf_reader.pages), 3)
    text = "".join(
        pdf_reader.pages[page_index].extract_text()
        for page_index in range(total_pages)
    )

spell = Speller(lang="en")
texts = spell(text)
text_file = " ".join([word.lower() for word in word_tokenize(texts)])
text_cleaned = clean_text(text_file)
# Reuse the fitted vectorizer so the features line up with the trained model.
text_tfidf = tfidf.transform([text_cleaned])
res_idx = xgb.predict(text_tfidf)[0]
print(class_names[res_idx])