# Import Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import cv2
import pytesseract
import string
import spacy
import nltk
import en_core_web_sm
import pickle
import seaborn as sns
import time
import warnings
from sklearn.exceptions import ConvergenceWarning
# Silence noisy library warnings so the cell output stays readable.
# NOTE(review): ignoring ConvergenceWarning also hides solvers that stop before
# converging (e.g. LogisticRegression at its default max_iter) — confirm this
# is intended rather than raising max_iter.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

from PIL import Image
from sklearn.model_selection import train_test_split, cross_val_score
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

from mlxtend.plotting import plot_confusion_matrix
from scikitplot.metrics import plot_cumulative_gain, plot_precision_recall, plot_roc_curve
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, f1_score
from sklearn.metrics import precision_recall_curve, recall_score, auc, roc_auc_score, roc_curve

# Re-create the `np.bool` attribute as an alias of NumPy's boolean scalar type.
# NOTE(review): presumably one of the imported libraries still references the
# alias that newer NumPy releases removed — confirm which one and drop this
# monkey-patch once it is no longer needed.
np.bool = np.bool_

# Load Data

In [None]:
# Root of the document-image dataset; it holds one sub-folder per entry below.
root_dir = "/mnt/hdd/Datasets/docs-sm"
# Sub-folder names; each name is also used as the class label of its files.
folders = [
    "form",
    "invoice",
    "letter",
    "questionnaire",
    "resume",
]

In [None]:
# Open one sample form with PIL and display it as the cell output.
form_image_path = os.path.join(root_dir, "form", "00043194.jpg")
form_image = Image.open(form_image_path)
form_image

In [None]:
# OCR a single sample image and print the recognised words as one string.
file_path = os.path.join(root_dir, "form", "00043194.jpg")
img = cv2.imread(file_path)
if img is None:
    # cv2.imread signals a missing/unreadable file by returning None, not by
    # raising, so fail here with a message that names the file.
    raise FileNotFoundError(f"Could not read image: {file_path}")
text = pytesseract.image_to_data(img, output_type="data.frame")
# Drop every row that contains a NaN (presumably the non-word rows whose
# "text" cell is empty — TODO confirm against the pytesseract output).
text = text.dropna()
# str() guards against pandas having parsed the column as numbers (possible
# when every recognised token is numeric), which have no .strip().
words = [str(t).strip() for t in text["text"]]
texts = " ".join(words)
print(texts)

In [None]:
# Build the corpus: OCR every image in each class folder and record the
# recognised text together with the folder name as its label.
data = []
labels = []

for folder in folders:
    # Sort the listing so the row order is reproducible; os.listdir returns
    # entries in arbitrary order.
    files = sorted(os.listdir(os.path.join(root_dir, folder)))
    for file in files:
        file_path = os.path.join(root_dir, folder, file)
        img = cv2.imread(file_path)
        if img is None:
            # cv2.imread returns None for unreadable or non-image files; skip
            # them (and say so) instead of failing inside pytesseract.
            print("Skipping unreadable file:", file_path)
            continue
        text = pytesseract.image_to_data(img, output_type="data.frame")
        # Drop every row that contains a NaN (presumably the non-word rows
        # whose "text" cell is empty — TODO confirm).
        text = text.dropna()
        # str() guards against pandas having parsed an all-numeric "text"
        # column as numbers, which have no .strip().
        words = [str(t).strip() for t in text["text"]]

        texts = " ".join(words)
        data.append(texts)
        labels.append(folder)

In [None]:
# Assemble the OCR output into a two-column frame (recognised text and its
# document-type label), then preview the first rows.
df = pd.DataFrame(
    data={
        "text": data,
        "doc_type": labels,
    }
)
df.head()

In [None]:
# (rows, columns) of the corpus frame: one row per OCR'd document.
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of documents per document type, to check the class balance.
df["doc_type"].value_counts()

In [None]:
# Save the OCR corpus to disk (presumably so the OCR pass need not be repeated).
# `index` and `header` are boolean flags, so pass False explicitly instead of
# relying on None being falsy.
# NOTE(review): the file has no header row, so it must be read back with
# header=None and explicit column names — confirm that is intended.
df.to_csv("df.csv", encoding="utf-8", index=False, header=False)

# Preprocess

In [None]:
def label_encoder(column):
    """Encode the labels in *column* as integer codes.

    A fresh ``LabelEncoder`` is fitted on the column, the column name and the
    learned classes are printed, and the encoded values are returned. The
    encoder itself is not kept.
    """
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(column)
    print(column.name, encoder.classes_)
    return encoded

In [None]:
# Remember the label names before they are overwritten by integer codes: the
# confusion-matrix plots and the final prediction index into `class_names`.
# LabelEncoder numbers the classes in sorted order of the unique labels, so
# the sorted names line up with the codes.
# NOTE: run this cell once per freshly built `df`; re-running it after the
# column has been encoded would capture the integer codes instead.
class_names = sorted(df["doc_type"].unique())
df["doc_type"] = label_encoder(df["doc_type"])

In [None]:
# Preview the frame now that `doc_type` holds the encoded labels.
df.head()

In [None]:
# Load the spaCy pipeline "en_core_web_sm"; `sentence_tokens` calls it on
# every document.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Keep the stop words in a set: `sentence_tokens` tests membership for every
# token, which is a hash lookup on a set but a linear scan on a list.
stopwords = set(STOP_WORDS)
print(len(stopwords))

In [None]:
# ASCII punctuation characters as one string. Because this is a str, the
# `token in punctuations` check in `sentence_tokens` is a substring test: it
# also matches the empty string and multi-character runs such as "-.".
punctuations = string.punctuation

In [None]:
def sentence_tokens(text):
    """Tokenise *text* with spaCy and return its normalised tokens.

    Each token becomes its lower-cased, stripped lemma, or its lower-cased
    surface form when the lemma is the ``-PRON-`` placeholder. Tokens found in
    ``punctuations`` or ``stopwords`` are left out of the returned list.
    """
    kept = []
    for tok in nlp(text):
        if tok.lemma_ == "-PRON-":
            normalised = tok.lower_
        else:
            normalised = tok.lemma_.lower().strip()
        # `punctuations` is a str, so the first test is a substring check; it
        # also discards tokens that became empty after stripping.
        if normalised in punctuations or normalised in stopwords:
            continue
        kept.append(normalised)
    return kept

In [None]:
# TF-IDF vectoriser that delegates tokenisation to the spaCy-based
# `sentence_tokens` function instead of its built-in token pattern.
tfidf = TfidfVectorizer(tokenizer=sentence_tokens)

# Model

In [None]:
# Shuffle the rows (they were collected folder by folder) and renumber the
# index. The seed makes the shuffle repeatable; without it the seeded
# train/test split would still select different documents on every run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Preview the first rows after shuffling.
df.head()

In [None]:
# Features are the raw OCR text; targets are the encoded document types.
X, y = df["text"], df["doc_type"]

In [None]:
# Hold out 20% of the documents for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the vectoriser on the training texts only, then reuse the fitted
# vectoriser unchanged to transform the test texts.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# Persist the fitted vectoriser. The context manager flushes and closes the
# file deterministically instead of leaving the handle to garbage collection.
# NOTE: pickle stores the custom tokenizer by reference, so `sentence_tokens`
# must be importable/defined wherever this file is loaded.
with open("tfidf.pkl", "wb") as f:
    pickle.dump(tfidf, f)

# Logistic Regression

In [None]:
# Fit a logistic-regression classifier on the TF-IDF features and time the fit.
logreg = LogisticRegression()
# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
logreg.fit(X_train_tfidf, y_train)
end = time.perf_counter()
logreg_time = end - start
print("Logistic Regression Train Time:", logreg_time)

In [None]:
# Persist the fitted model; the context manager guarantees the file is
# flushed and closed.
with open("logreg.pkl", "wb") as f:
    pickle.dump(logreg, f)

In [None]:
# 3-fold cross-validation of a fresh, unfitted logistic regression on the
# training features.
# NOTE(review): the TF-IDF features were fitted on the whole training split,
# so every validation fold has influenced them — confirm this is acceptable.
logreg_scores = cross_val_score(
    estimator=LogisticRegression(),
    X=X_train_tfidf,
    y=y_train,
    cv=3,
)
print("Logistic Regression Cross-Validation Accuracy Scores:", logreg_scores)

In [None]:
# Predict on the training data and on the held-out test split, then report
# the accuracy of each.
logreg_pred_train = logreg.predict(X_train_tfidf)
logreg_pred_test = logreg.predict(X_test_tfidf)

# accuracy_score is documented as (y_true, y_pred); use that order so these
# calls agree with classification_report / confusion_matrix in this notebook.
logreg_train_score = accuracy_score(y_train, logreg_pred_train)
logreg_test_score = accuracy_score(y_test, logreg_pred_test)
print("Logistic Regression Train Score:", logreg_train_score)
print("Logistic Regression Test Score:", logreg_test_score)

In [None]:
# Classification report for the logistic-regression predictions on the test split.
print(classification_report(y_true=y_test, y_pred=logreg_pred_test))

In [None]:
# Confusion matrix of the logistic-regression test predictions, drawn with
# both absolute counts and normalised values.
logreg_cm = confusion_matrix(y_test, logreg_pred_test)
fig, ax = plot_confusion_matrix(
    conf_mat=logreg_cm,
    class_names=class_names,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
plt.title("Logistic Regression Confusion Matrix")
plt.show()

# Random Forest

In [None]:
# Fit a random forest on the TF-IDF features and time the fit.
# The forest is randomised; seed it (same seed as the train/test split) so the
# reported scores and the pickled model are reproducible.
rf = RandomForestClassifier(random_state=42)
# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
rf.fit(X_train_tfidf, y_train)
end = time.perf_counter()
rf_time = end - start
print("Random Forest Train Time:", rf_time)

In [None]:
# Persist the fitted model; the context manager guarantees the file is
# flushed and closed.
with open("rf.pkl", "wb") as f:
    pickle.dump(rf, f)

In [None]:
# 3-fold cross-validation of a fresh random forest on the training features.
# The estimator is seeded so the fold scores are repeatable between runs.
# NOTE(review): the TF-IDF features were fitted on the whole training split,
# so every validation fold has influenced them — confirm this is acceptable.
rf_scores = cross_val_score(RandomForestClassifier(random_state=42), X_train_tfidf, y_train, cv=3)
# Message spelling fixed ("Acuracy" -> "Accuracy") to match the other models.
print("Random Forest Cross-Validation Accuracy Scores:", rf_scores)

In [None]:
# Predict on the training data and on the held-out test split, then report
# the accuracy of each.
rf_pred_train = rf.predict(X_train_tfidf)
rf_pred_test = rf.predict(X_test_tfidf)

# accuracy_score is documented as (y_true, y_pred); use that order so these
# calls agree with classification_report / confusion_matrix in this notebook.
rf_train_score = accuracy_score(y_train, rf_pred_train)
rf_test_score = accuracy_score(y_test, rf_pred_test)
print("Random Forest Train Score:", rf_train_score)
print("Random Forest Test Score:", rf_test_score)

In [None]:
# Classification report for the random-forest predictions on the test split.
print(classification_report(y_true=y_test, y_pred=rf_pred_test))

In [None]:
# Confusion matrix of the random-forest test predictions, drawn with both
# absolute counts and normalised values.
rf_cm = confusion_matrix(y_test, rf_pred_test)
fig, ax = plot_confusion_matrix(
    conf_mat=rf_cm,
    class_names=class_names,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
plt.title("Random Forest Confusion Matrix")
plt.show()

# Decision Tree

In [None]:
# Fit a decision tree on the TF-IDF features and time the fit.
# Tree construction involves randomness; seed it (same seed as the train/test
# split) so the reported scores and the pickled model are reproducible.
dt = DecisionTreeClassifier(random_state=42)
# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
dt.fit(X_train_tfidf, y_train)
end = time.perf_counter()
dt_time = end - start
print("Decision Tree Train Time:", dt_time)

In [None]:
# Persist the fitted model; the context manager guarantees the file is
# flushed and closed.
with open("dt.pkl", "wb") as f:
    pickle.dump(dt, f)

In [None]:
# 3-fold cross-validation of a fresh decision tree on the training features.
# The estimator is seeded so the fold scores are repeatable between runs.
# NOTE(review): the TF-IDF features were fitted on the whole training split,
# so every validation fold has influenced them — confirm this is acceptable.
dt_scores = cross_val_score(DecisionTreeClassifier(random_state=42), X_train_tfidf, y_train, cv=3)
print("Decision Tree Cross-Validation Accuracy Scores:", dt_scores)

In [None]:
# Predict on the training data and on the held-out test split, then report
# the accuracy of each.
dt_pred_train = dt.predict(X_train_tfidf)
dt_pred_test = dt.predict(X_test_tfidf)

# accuracy_score is documented as (y_true, y_pred); use that order so these
# calls agree with classification_report / confusion_matrix in this notebook.
dt_train_score = accuracy_score(y_train, dt_pred_train)
dt_test_score = accuracy_score(y_test, dt_pred_test)
print("Decision Tree Train Score:", dt_train_score)
print("Decision Tree Test Score:", dt_test_score)

In [None]:
# Classification report for the decision-tree predictions on the test split.
print(classification_report(y_true=y_test, y_pred=dt_pred_test))

In [None]:
# Confusion matrix of the decision-tree test predictions, drawn with both
# absolute counts and normalised values.
dt_cm = confusion_matrix(y_test, dt_pred_test)
fig, ax = plot_confusion_matrix(
    conf_mat=dt_cm,
    class_names=class_names,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
plt.title("Decision Tree Confusion Matrix")
plt.show()

# MultinomialNB

In [None]:
# Fit a multinomial naive Bayes classifier on the TF-IDF features and time
# the fit.
mnb = MultinomialNB()
# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
mnb.fit(X_train_tfidf, y_train)
end = time.perf_counter()
mnb_time = end - start
print("MultinomialNB Train Time:", mnb_time)

In [None]:
# Persist the fitted model; the context manager guarantees the file is
# flushed and closed.
with open("mnb.pkl", "wb") as f:
    pickle.dump(mnb, f)

In [None]:
# 3-fold cross-validation of a fresh multinomial naive Bayes model on the
# training features.
# NOTE(review): the TF-IDF features were fitted on the whole training split,
# so every validation fold has influenced them — confirm this is acceptable.
mnb_score = cross_val_score(
    estimator=MultinomialNB(),
    X=X_train_tfidf,
    y=y_train,
    cv=3,
)
print("MultinomialNB Cross-Validation Accuracy Scores:", mnb_score)

In [None]:
# Predict on the training data and on the held-out test split, then report
# the accuracy of each.
mnb_pred_train = mnb.predict(X_train_tfidf)
mnb_pred_test = mnb.predict(X_test_tfidf)

# accuracy_score is documented as (y_true, y_pred); use that order so these
# calls agree with classification_report / confusion_matrix in this notebook.
mnb_train_score = accuracy_score(y_train, mnb_pred_train)
mnb_test_score = accuracy_score(y_test, mnb_pred_test)
print("MultinomialNB Train Score:", mnb_train_score)
print("MultinomialNB Test Score:", mnb_test_score)

In [None]:
# Classification report for the naive-Bayes predictions on the test split.
print(classification_report(y_true=y_test, y_pred=mnb_pred_test))

In [None]:
# Confusion matrix of the naive-Bayes test predictions, drawn with both
# absolute counts and normalised values.
mnb_cm = confusion_matrix(y_test, mnb_pred_test)
plot_confusion_matrix(
    conf_mat=mnb_cm,
    class_names=class_names,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
plt.title("MultinomialNB Confusion Matrix")
plt.show()

# XGB

In [None]:
# Fit an XGBoost classifier on the TF-IDF features and time the fit.
xgb = XGBClassifier()
# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
xgb.fit(X_train_tfidf, y_train)
end = time.perf_counter()
xgb_time = end - start
print("XGB Train Time:", xgb_time)

In [None]:
# 3-fold cross-validation of a fresh XGBoost classifier on the training
# features.
# NOTE(review): the TF-IDF features were fitted on the whole training split,
# so every validation fold has influenced them — confirm this is acceptable.
xgb_scores = cross_val_score(
    estimator=XGBClassifier(),
    X=X_train_tfidf,
    y=y_train,
    cv=3,
)
print("XGBClassifier Cross-Validation Accuracy Scores:", xgb_scores)

In [None]:
# Predict on the training data and on the held-out test split, then report
# the accuracy of each.
xgb_pred_train = xgb.predict(X_train_tfidf)
xgb_pred_test = xgb.predict(X_test_tfidf)

# accuracy_score is documented as (y_true, y_pred); use that order so these
# calls agree with classification_report / confusion_matrix in this notebook.
xgb_train_score = accuracy_score(y_train, xgb_pred_train)
xgb_test_score = accuracy_score(y_test, xgb_pred_test)
print("XGB Train Score:", xgb_train_score)
print("XGB Test Score:", xgb_test_score)

In [None]:
# Classification report for the XGBoost predictions on the test split.
print(classification_report(y_true=y_test, y_pred=xgb_pred_test))

In [None]:
# Confusion matrix of the XGBoost test predictions, drawn with both absolute
# counts and normalised values.
xgb_cm = confusion_matrix(y_test, xgb_pred_test)
plot_confusion_matrix(
    conf_mat=xgb_cm,
    class_names=class_names,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
plt.title("XGB Confusion Matrix")
plt.show()

In [None]:
# Short model names paired with their test accuracies; the two lists derived
# below keep the same order.
# NOTE(review): this rebinds `labels`, which earlier held the per-document
# class labels used to build `df`.
_test_score_by_model = {
    "LR": logreg_test_score,
    "DT": dt_test_score,
    "RF": rf_test_score,
    "MNB": mnb_test_score,
    "XGB": xgb_test_score,
}
labels = list(_test_score_by_model)
scores = list(_test_score_by_model.values())

In [None]:
# Bar chart comparing the test accuracy of the trained models.
plt.figure()
ax = sns.barplot(x=labels, y=scores)
ax.set_title("Trained Models Accuracy")
# Write each bar's value on top of it.
for bars in ax.containers:
    ax.bar_label(bars)

plt.show()

# Test

In [None]:
# OCR one sample document and vectorise its text with the fitted TF-IDF model.
folder = "form"
file_path = os.path.join(root_dir, folder, "00043194.jpg")
img = cv2.imread(file_path)
if img is None:
    # cv2.imread signals a missing/unreadable file by returning None, not by
    # raising, so fail here with a message that names the file.
    raise FileNotFoundError(f"Could not read image: {file_path}")
text = pytesseract.image_to_data(img, output_type="data.frame")
# Drop every row that contains a NaN (presumably the non-word rows whose
# "text" cell is empty — TODO confirm).
text = text.dropna()
# str() guards against pandas having parsed an all-numeric "text" column as
# numbers, which have no .strip().
words = [str(t).strip() for t in text["text"]]
texts = " ".join(words)
# transform() expects an iterable of documents, hence the one-element list.
texts_tfidf = tfidf.transform([texts])

In [None]:
# Classify the vectorised sample with the logistic-regression model and look
# up the name that corresponds to the predicted class code.
result_idx = logreg.predict(texts_tfidf)
predicted_code = result_idx[0]
result = class_names[predicted_code]
print(result)

In [None]:
# Show the sample image with the predicted document type as its title.
plt.figure()
# cv2.imread returns colour images in BGR channel order while matplotlib
# expects RGB; convert first so the colours are not swapped.
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.title(f"Predicted: {result}")
plt.show()