## Imports

In [1]:
import os
import pickle
from collections import Counter

import numpy as np
import pandas as pd

import pytesseract
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_fscore_support,
    precision_score,
    recall_score,
)
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from tqdm.notebook import tqdm


## Useful functions

In [2]:
def split_dataset(X, Y, test_size=0.3):
    """Split samples and labels into two stratified, shuffled partitions.

    Args:
        X: Samples to split.
        Y: Labels aligned with ``X``; also used as the stratification target.
        test_size: Forwarded to ``train_test_split`` (default 0.3).

    Returns:
        Whatever ``train_test_split`` returns for two inputs, i.e.
        ``[X_train, X_test, Y_train, Y_test]``.
    """
    split_options = {
        "test_size": test_size,
        "random_state": 42,  # fixed seed: the same input yields the same split
        "shuffle": True,
        "stratify": Y,
    }
    return train_test_split(X, Y, **split_options)


def save_vars(data, name):
    """Pickle ``data`` into the file at path ``name``.

    Args:
        data: Any picklable object.
        name: Destination file path; the file is opened in binary write mode.

    Returns:
        ``name``, unchanged.
    """
    with open(name, mode="wb") as sink:
        pickle.dump(data, sink)
    return name


def get_vars(pickle_file):
    """Load and return the object stored in ``pickle_file``.

    Note:
        Unpickling can execute arbitrary code, so only load files that come
        from a trusted source.

    Args:
        pickle_file: Path of a file previously written with ``pickle``.

    Returns:
        The unpickled object.
    """
    with open(pickle_file, mode="rb") as source:
        return pickle.load(source)


def clean_text(text):
    """Reduce raw text to lowercase, purely alphabetic words.

    The text is split on whitespace; a token is kept only if it consists
    solely of alphabetic characters and is longer than two characters. Kept
    tokens are lowercased and joined with single spaces.

    Args:
        text: Raw document text.

    Returns:
        The cleaned text as one space-separated string (possibly empty).
    """
    kept_words = []
    for token in text.split():
        # str.split() never yields empty tokens, so only these two checks matter.
        if len(token) <= 2 or not token.isalpha():
            continue
        kept_words.append(token.lower())
    return " ".join(kept_words)


def prepare_dataset(texts, classes):
    """Read ``.txt`` files and build cleaned documents with integer labels.

    Paths that do not end in ``.txt`` are skipped. The class name of a file is
    its base name without extension, split on ``"_"``, with the last two
    fields dropped and the rest re-joined. Names not found in ``classes`` are
    mapped to ``"unknown"``. The stored label is the position of the class
    name in ``classes``.

    Args:
        texts: Iterable of file paths.
        classes: Sequence of class names; ``classes.index`` is used to encode
            labels, so it raises ``ValueError`` if the name is absent
            (including the ``"unknown"`` fallback).

    Returns:
        ``(x, y)``: list of cleaned texts and list of integer labels, aligned.
    """
    documents = []
    labels = []
    for path in tqdm(texts):
        if path.endswith(".txt"):
            # NOTE(review): no explicit encoding, so open() uses the platform default.
            with open(path, "r") as handle:
                raw = handle.read()
            cleaned = clean_text(raw)
            stem, _extension = os.path.splitext(os.path.basename(path))
            label_name = "_".join(stem.split("_")[:-2])
            if label_name not in classes:
                label_name = "unknown"
            labels.append(classes.index(label_name))
            documents.append(cleaned)
    return documents, labels


def balance_dataset(X, Y, document_classes, weights=None):
    """Resample ``X``/``Y`` so each class contributes a controlled count.

    For the class at position ``i`` of ``document_classes`` the requested
    sample count is ``min_frequency * weights[i]``, where ``min_frequency`` is
    the count of the least frequent label present in ``Y``. The actual
    sampling is delegated to ``get_samples_of_type``.

    Args:
        X: Documents.
        Y: Integer labels aligned with ``X``.
        document_classes: Sequence of class names; positions are the labels.
        weights: Optional per-class multipliers; defaults to 1 for each class.

    Returns:
        ``(X_balanced, Y_balanced)`` as NumPy arrays, grouped by class in the
        order of ``document_classes``.
    """
    if weights is None:
        weights = [1] * len(document_classes)
    rarest_count = min(Counter(Y).values())
    balanced_docs = []
    balanced_labels = []
    for class_index, _ in enumerate(document_classes):
        class_docs, class_labels = get_samples_of_type(
            X,
            Y,
            class_index,
            document_classes,
            num=rarest_count * weights[class_index],
        )
        balanced_docs.extend(class_docs)
        balanced_labels.extend(class_labels)
    return np.array(balanced_docs), np.array(balanced_labels)


def get_samples_of_type(documents, labels, document_type, classes, num=None):
    """Select the documents (and labels) belonging to one class.

    Prints ``Processing <class name>...`` as a progress message.

    Args:
        documents: Sequence of documents.
        labels: Integer labels aligned with ``documents``.
        document_type: Integer label to select; also indexes ``classes``.
        classes: Sequence of class names (used only for the progress message).
        num: If ``None``, every document of the class is returned in its
            original order. Otherwise ``num`` documents are drawn at random
            with NumPy's global random state.

    Returns:
        ``(documents_filtered, labels_filtered)`` as NumPy arrays.

    Raises:
        ValueError: If ``num`` is positive but the class has no documents.
    """
    print(f"Processing {classes[document_type]}...")
    documents = np.asarray(documents)
    labels = np.asarray(labels)
    mask = np.where(labels == document_type)[0]
    if num is not None:
        if num > 0 and mask.size == 0:
            raise ValueError(
                f"Cannot draw {num} samples: no documents labelled "
                f"{document_type!r} ({classes[document_type]})"
            )
        # Draw distinct documents whenever the class is large enough. Sampling
        # with replacement (np.random.choice's default) would put duplicate
        # documents into the subset, which can then leak across a later
        # train/test split. Only oversample when more documents are requested
        # than the class actually has.
        oversample = num > mask.size
        mask = np.random.choice(mask, size=num, replace=oversample)
    documents_filtered = documents[mask]
    labels_filtered = labels[mask]
    return documents_filtered, labels_filtered


def get_accuracy(document_type, model, type_dict, verbose=False):
    """Compute a model's accuracy on the samples of one document type.

    Args:
        document_type: Key into ``type_dict``.
        model: Object exposing ``predict(x)``.
        type_dict: Mapping from document type to an ``(x, y)`` pair. A missing
            key yields ``None`` from ``.get`` and the unpacking then fails
            with ``TypeError``.
        verbose: If true, print the total, correct count and accuracy.

    Returns:
        Fraction of predictions equal to the true label, rounded to three
        decimals. Raises ``ZeroDivisionError`` when there are no samples.
    """
    x, y = type_dict.get(document_type, None)
    outcomes = [predicted == true for predicted, true in zip(model.predict(x), y)]
    total = len(outcomes)
    correct = sum(1 for hit in outcomes if hit)
    accuracy = round(correct / total, 3)
    if not verbose:
        return accuracy
    print(f"Total: {total}")
    print(f"Correct: {correct}")
    print(f"Accuracy: {accuracy}")
    return accuracy


def evaluate_models(
    models, model_names, document_names, verbose=False, docs_by_type=None
):
    """Build a table of per-document-type accuracies for several models.

    Args:
        models: Fitted models exposing ``predict``.
        model_names: Row labels, one per model, in the same order.
        document_names: Column labels, one per document type. Their order must
            match the iteration order of the keys in ``docs_by_type``.
        verbose: If true, print the model, the document type and the
            per-type counts reported by ``get_accuracy``.
        docs_by_type: Mapping from document type to an ``(x, y)`` pair. When
            omitted, the notebook-level ``type_dict`` is used, which keeps
            existing calls working.

    Returns:
        ``pandas.DataFrame`` of accuracies with models as rows and document
        types as columns.
    """
    if docs_by_type is None:
        # Backward-compatible fallback: earlier versions read this global
        # implicitly, so the cell defining ``type_dict`` must have been run.
        docs_by_type = type_dict
    accuracies = []
    for model in models:
        model_accuracies = []
        if verbose:
            print(f"Model: {model}")
        for key in docs_by_type:
            if verbose:
                print(f"Document type: {key}\n")
            # Forward ``verbose`` so the detailed counts are actually printed.
            accuracy = get_accuracy(key, model, docs_by_type, verbose=verbose)
            model_accuracies.append(accuracy)
        accuracies.append(model_accuracies)
    df = pd.DataFrame(accuracies)
    df.index = model_names
    df.columns = document_names
    return df


def build_confusion_matrix(model, features_test, labels_test, avg_method="micro"):
    """Print evaluation metrics for ``model`` on a test set.

    Prints accuracy, then F1, recall and precision averaged with
    ``avg_method``, followed by the classification report and the confusion
    matrix. Nothing is returned.

    Args:
        model: Fitted estimator exposing ``predict``.
        features_test: Test samples.
        labels_test: True labels for ``features_test``.
        avg_method: ``average`` argument for the F1/recall/precision scores.
    """
    prediction = model.predict(features_test)
    print("Accuracy:", accuracy_score(labels_test, prediction))
    averaged_metrics = (
        ("F1 score:", f1_score),
        ("Recall:", recall_score),
        ("Precision:", precision_score),
    )
    for heading, metric in averaged_metrics:
        print(heading, metric(labels_test, prediction, average=avg_method))
    report = classification_report(labels_test, prediction)
    print("\n clasification report:\n", report)
    matrix = confusion_matrix(labels_test, prediction)
    print("\n confussion matrix:\n", matrix)


def get_clf_results(grid_output, features_test, labels_test):
    """Print test-set metrics for the best estimator of a fitted search.

    Args:
        grid_output: Fitted search object exposing ``best_estimator_``.
        features_test: Test samples.
        labels_test: True labels for ``features_test``.
    """
    build_confusion_matrix(grid_output.best_estimator_, features_test, labels_test)


In [3]:
def test_classifier(
    X_train,
    Y_train,
    max_df_range=(0.5, 1),
    min_df_range=(1, 10),
    k_range=(1000, 10000),
    clf=None,
    ngram_max=2,
    cv=5,
    n_iter=10,
    n_jobs=24,
    random_state=42,
):
    """Run a randomized hyperparameter search over a text-classification pipeline.

    The pipeline is ``TfidfVectorizer -> SelectKBest(f_classif) ->
    CalibratedClassifierCV(clf)``. Prints the best score/parameters and the
    mean and standard deviation of the test score for every sampled candidate.

    Args:
        X_train: Training documents (strings).
        Y_train: Training labels.
        max_df_range: ``(low, high)`` bounds for ``max_df``, sampled in steps
            of 0.1, upper bound included.
        min_df_range: ``(low, high)`` inclusive integer bounds for ``min_df``.
        k_range: ``(low, high)`` bounds for the number of selected features,
            sampled in steps of 2000.
        clf: Base classifier to calibrate. Defaults to a fresh ``LinearSVC()``
            per call (a default instance in the signature would be created
            once at definition time and shared by every call).
        ngram_max: Largest n-gram size; candidates are ``(1, 2) .. (1, ngram_max)``.
        cv: Cross-validation setting passed to ``RandomizedSearchCV``.
        n_iter: Number of sampled parameter settings (10 is the scikit-learn
            default, i.e. the previous behaviour).
        n_jobs: Number of parallel workers for the search.
        random_state: Seed for candidate sampling so the search is repeatable;
            pass ``None`` for unseeded sampling.

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    if clf is None:
        clf = LinearSVC()

    epsilon = 0.0001  # lets np.arange include the upper max_df bound
    vectorizer = TfidfVectorizer()
    # scikit-learn renamed ``base_estimator`` to ``estimator`` (1.2) and later
    # removed the old keyword; try the current name first, fall back for older
    # releases. The nested parameter path below must use the same name.
    try:
        calibrated_clf = CalibratedClassifierCV(estimator=clf)
        estimator_param = "estimator"
    except TypeError:
        calibrated_clf = CalibratedClassifierCV(base_estimator=clf)
        estimator_param = "base_estimator"
    selector = SelectKBest(f_classif, k=10000)  # k is overridden by the search

    # Floating-point drift in np.arange can overshoot the upper bound
    # (e.g. 1.0000000000000002), which is not a valid max_df proportion.
    max_df_values = np.minimum(
        np.arange(max_df_range[0], max_df_range[1] + epsilon, 0.1),
        max_df_range[1],
    )
    # An empty candidate list would make the search fail; with ngram_max < 2
    # fall back to plain unigrams.
    ngram_ranges = [(1, k) for k in range(2, ngram_max + 1)] or [(1, 1)]
    param_grid = {
        "vect__max_df": max_df_values,
        "select__k": list(range(k_range[0], k_range[1] + 1, 2000)),
        "vect__ngram_range": ngram_ranges,
        "vect__min_df": np.arange(min_df_range[0], min_df_range[1] + 1, 1),
    }

    if isinstance(clf, LinearSVC):
        param_grid[f"clf__{estimator_param}__C"] = [0.01, 0.1, 1, 10, 100, 1000]

    steps = [("vect", vectorizer), ("select", selector), ("clf", calibrated_clf)]
    pipeline = Pipeline(steps=steps)
    grid = RandomizedSearchCV(
        pipeline,
        cv=cv,
        param_distributions=param_grid,
        n_iter=n_iter,
        verbose=10,
        n_jobs=n_jobs,
        random_state=random_state,
    )
    grid.fit(X_train, Y_train)
    print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
    means = grid.cv_results_["mean_test_score"]
    stds = grid.cv_results_["std_test_score"]
    params = grid.cv_results_["params"]
    for mean, stdev, param in zip(means, stds, params):
        print(f"Mean {round(mean,3)}+-{round(stdev,3)} and {param}")
    return grid


## Setup

In [4]:
# Class names. A document's integer label is the position of its class name in
# this list (prepare_dataset uses CLASSES.index), so the order is significant.
# "unknown" is the fallback for file names whose prefix is not listed here.
CLASSES = [
    "commercial_invoice",
    "packing_list",
    "delivery_note",
    "customs_summary_declaration_with_commercial_detail_export",
    "despatch_note_model_t",
    "unknown"
]

In [5]:
# Full paths of every entry in ./all_texts (non-.txt entries are skipped later
# by prepare_dataset).
texts_path = os.path.join(os.getcwd(), "all_texts")
# os.listdir returns entries in arbitrary order; sort them so the seeded
# splits downstream operate on the same document order on every run/machine.
texts = sorted(os.path.join(texts_path, item) for item in os.listdir(texts_path))
len(texts)

178513

## Data preparation

### Data | `UNBALANCED`

In [6]:
# Read and clean every .txt file; Y holds each document's index into CLASSES.
X, Y = prepare_dataset(texts, CLASSES)

HBox(children=(FloatProgress(value=0.0, max=178513.0), HTML(value='')))




#### Total data distribution

In [7]:
# Document count per class, keyed by class name instead of integer label.
data_distribution = {CLASSES[key]: value for key, value in Counter(Y).items()}
data_distribution

{'delivery_note': 10877,
 'customs_summary_declaration_with_commercial_detail_export': 38814,
 'commercial_invoice': 92255,
 'unknown': 25473,
 'despatch_note_model_t': 5675,
 'packing_list': 5419}

#### Creating holdout test dataset

In [8]:
# Set aside 30% of the data as a holdout set (stratified by label).
# NOTE: this rebinds X and Y to the remaining 70%, so the cell is not
# idempotent -- re-running it without first re-running the prepare_dataset cell
# splits the already reduced data again.
X, X_holdout, Y, Y_holdout = split_dataset(
    X, Y, test_size=0.3
)

In [9]:
# Class counts in the holdout set (keys are indices into CLASSES).
Counter(Y_holdout)

Counter({0: 27677, 3: 11644, 5: 7642, 1: 1626, 2: 3263, 4: 1702})

### Data | `BALANCED_SAME`

In [10]:
# Same number of samples for every class: the count of the rarest label in Y.
X_balanced_same, Y_balanced_same = balance_dataset(X, Y, CLASSES)

Processing commercial_invoice...
Processing packing_list...
Processing delivery_note...
Processing customs_summary_declaration_with_commercial_detail_export...
Processing despatch_note_model_t...
Processing unknown...


In [11]:
# Sanity check: every class should have the same count.
Counter(Y_balanced_same)

Counter({0: 3793, 1: 3793, 2: 3793, 3: 3793, 4: 3793, 5: 3793})

### Data | `BALANCED_UNKNOWN`

In [12]:
# Like BALANCED_SAME, but the last class ("unknown", index 5) gets twice as
# many samples as each of the others.
X_balanced_unknown, Y_balanced_unknown = balance_dataset(
    X, Y, CLASSES, weights=[1, 1, 1, 1, 1, 2]
)


Processing commercial_invoice...
Processing packing_list...
Processing delivery_note...
Processing customs_summary_declaration_with_commercial_detail_export...
Processing despatch_note_model_t...
Processing unknown...


In [13]:
# Sanity check: class 5 ("unknown") should have double the count of the rest.
Counter(Y_balanced_unknown)

Counter({0: 3793, 1: 3793, 2: 3793, 3: 3793, 4: 3793, 5: 7586})

### Train test split | `UNBALANCED`

In [14]:
# 80/20 stratified split of the non-holdout data (X, Y after the holdout cell).
features_train, features_test, labels_train, labels_test = split_dataset(
    X, Y, test_size=0.2
)

### Train test split | `BALANCED_SAME`

In [15]:
# 80/20 stratified split of the equally balanced dataset.
(
    features_train_balanced_same,
    features_test_balanced_same,
    labels_train_balanced_same,
    labels_test_balanced_same,
) = split_dataset(X_balanced_same, Y_balanced_same, test_size=0.2)

### Train test split | `BALANCED_UNKNOWN`

In [16]:
# 80/20 stratified split of the dataset with the doubled "unknown" class.
(
    features_train_balanced_unknown,
    features_test_balanced_unknown,
    labels_train_balanced_unknown,
    labels_test_balanced_unknown,
) = split_dataset(X_balanced_unknown, Y_balanced_unknown, test_size=0.2)

## Training

### `UNBALANCED`

In [None]:
# Randomized search on the unbalanced training split. LinearSVC is created
# with class_weight="balanced" here, since the class counts differ widely.
grid_output_svc = test_classifier(
    features_train,
    labels_train,
    max_df_range=(0.5, 1),
    min_df_range=(1,10),
    k_range=(5000, 50000),
    ngram_max=3,
    clf=LinearSVC(class_weight="balanced", max_iter=10000),
)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:  1.2min
[Parallel(n_jobs=24)]: Done   9 out of  50 | elapsed:  1.6min remaining:  7.4min
[Parallel(n_jobs=24)]: Done  15 out of  50 | elapsed:  4.1min remaining:  9.7min
[Parallel(n_jobs=24)]: Done  21 out of  50 | elapsed:  5.2min remaining:  7.1min
[Parallel(n_jobs=24)]: Done  27 out of  50 | elapsed:  5.4min remaining:  4.6min
[Parallel(n_jobs=24)]: Done  33 out of  50 | elapsed:  6.0min remaining:  3.1min
[Parallel(n_jobs=24)]: Done  39 out of  50 | elapsed:  6.6min remaining:  1.9min


### `BALANCED_SAME`

In [None]:
# Randomized search on the equally balanced training split; no class_weight
# is passed to LinearSVC in this run.
grid_output_svc_balanced_same = test_classifier(
    features_train_balanced_same,
    labels_train_balanced_same,
    max_df_range=(0.5, 1),
    min_df_range=(1,10),
    k_range=(5000, 50000),
    ngram_max=3,
    clf=LinearSVC(max_iter=10000),
)


### `BALANCED_UNKNOWN`

In [None]:
# Randomized search on the training split with the doubled "unknown" class.
# NOTE(review): class_weight="balanced" is used here but not in the
# BALANCED_SAME run; it presumably re-weights classes by inverse frequency,
# which may offset the deliberate 2x "unknown" oversampling -- confirm intent.
grid_output_svc_balanced_unknown = test_classifier(
    features_train_balanced_unknown,
    labels_train_balanced_unknown,
    max_df_range=(0.5, 1),
    min_df_range=(1,10),
    k_range=(5000, 50000),
    ngram_max=3,
    clf=LinearSVC(class_weight="balanced", max_iter=10000),
)


### Saving models

In [None]:
# Persist the three fitted search objects so results can be inspected later
# without retraining. exist_ok=True replaces the check-then-create pattern,
# which is redundant and can race if the directory appears in between.
os.makedirs("pkls_best", exist_ok=True)
save_vars(grid_output_svc, "./pkls_best/grid_output_svc.pkl")
save_vars(grid_output_svc_balanced_same, "./pkls_best/grid_output_svc_balanced_same.pkl")
save_vars(grid_output_svc_balanced_unknown, "./pkls_best/grid_output_svc_balanced_unknown.pkl")

In [None]:
# To reuse a saved search instead of retraining, load it back, e.g.:
# grid_output_svc = get_vars("./pkls_best/grid_output_svc.pkl")

## Results

In [None]:
# Best pipeline found by each search.
model_unbalanced = grid_output_svc.best_estimator_
model_balanced_same = grid_output_svc_balanced_same.best_estimator_
model_balanced_unknown = grid_output_svc_balanced_unknown.best_estimator_

### `UNBALANCED`

In [None]:
# Metrics of the best UNBALANCED pipeline on its own 20% test split.
get_clf_results(grid_output_svc, features_test, labels_test)

### `BALANCED_SAME`

In [None]:
# Metrics of the best BALANCED_SAME pipeline on its own 20% test split.
get_clf_results(grid_output_svc_balanced_same, features_test_balanced_same, labels_test_balanced_same)

### `BALANCED_UNKNOWN`

In [None]:
# Metrics of the best BALANCED_UNKNOWN pipeline on its own 20% test split.
get_clf_results(grid_output_svc_balanced_unknown, features_test_balanced_unknown, labels_test_balanced_unknown)

### Docs and labels for each document type

In [None]:
# Slice the holdout set by true class. Keys are short names; each value is the
# (documents, labels) pair for the class whose label is the key's position in
# the tuple below (the same order as CLASSES).
type_dict = {
    short_name: get_samples_of_type(X_holdout, Y_holdout, class_index, CLASSES)
    for class_index, short_name in enumerate(
        ("invoice", "packing", "delivery", "customs", "despatch", "unknown")
    )
}

# Individual names for each slice.
invoice_docs, invoice_labels = type_dict["invoice"]
packing_docs, packing_labels = type_dict["packing"]
delivery_docs, delivery_labels = type_dict["delivery"]
customs_docs, customs_labels = type_dict["customs"]
despatch_docs, despatch_labels = type_dict["despatch"]
unknown_docs, unknown_labels = type_dict["unknown"]


### Results for each document type

In [None]:
# Row labels and models for the comparison table (same order in both lists).
model_names = ["model_unbalanced", "model_balanced_same", "model_balanced_unknown"]
models = [model_unbalanced, model_balanced_same, model_balanced_unknown]
# Column labels. evaluate_models iterates over type_dict's keys and assigns
# these names positionally, so the order here must match type_dict's key order.
document_names = [
    "commercial_invoice",
    "packing_list",
    "delivery_note",
    "customs_summary",
    "despatch_note",
    "unknown",
]


In [None]:
# Per-document-type accuracy of each model on the holdout slices in type_dict.
evaluate_models(models, model_names, document_names)