# Modelling & Training

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import re
import json
import joblib
import requests

In [None]:
# Load the preprocessed dataset. If the read fails for any reason, run the
# preprocessing script and retry.
# NOTE(review): presumably `preprocess.py` is what writes `cleanData.csv` -- confirm.
try:
    data = pd.read_csv('cleanData.csv', index_col=0)
except Exception:
    # `%run -i` is an IPython magic: it executes the script inside the
    # notebook's own namespace instead of a fresh one.
    %run -i preprocess.py
    data = pd.read_csv('cleanData.csv', index_col=0)

# Raw dataset indexed by `id`; only its `dialect` column is used here.
datasetIds = pd.read_csv('dialect_dataset.csv', index_col='id')
dialects = sorted(datasetIds.dialect.unique())
# Dialect code -> integer id (its position in the sorted list), and the inverse.
label_map = {code: idx for idx, code in enumerate(dialects)}
inv_label_map = dict(enumerate(dialects))

# Figure 7 in `arxiv:2005.06557`
ordered_dialects = 'IQ YE OM BH KW SA AE QA DZ MA LY TN EG SD JO PL LB SY'.split()
# Keep the string codes in `ordered_dialects` (they are used as axis labels)
# and expose the matching integer ids under their own name, which is the name
# the plotting and evaluation cells look up.
ordered_dialects_ids = [label_map[code] for code in ordered_dialects]

In [None]:
# Two bar charts of the number of samples per dialect.

# Chart 1: dialects in the order `value_counts()` returns them.
dialect_counts = data.dialect.value_counts()
bins_x = [inv_label_map[dialect_id] for dialect_id in dialect_counts.index]
bins_y = dialect_counts  # raw counts; divide by len(data) for proportions

plt.figure(figsize=(10, 5))
plt.bar(bins_x, bins_y)
plt.title('Dialects counts sorted')
plt.tight_layout()

# Chart 2: the same counts, rearranged to match the dialect order used in the
# paper. This is here for illustration only.
# Map each dialect id to its count so the counts can be looked up by id.
count_by_id = dict(zip(dialect_counts.index, dialect_counts))
# Raw counts again; divide by len(data) for proportions.
freq = [count_by_id[dialect_id] for dialect_id in ordered_dialects_ids]

plt.figure(figsize=(10, 5))
plt.bar(ordered_dialects, freq)
plt.title('Dialects counts as ordered in paper')
plt.tight_layout()

In [None]:
from pprint import pprint
from time import time
import logging

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import (train_test_split, cross_val_score,
                                     GridSearchCV, RandomizedSearchCV)
from sklearn.pipeline import Pipeline
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay,
                             balanced_accuracy_score, classification_report)
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Balanced subsample: `limit` rows drawn from every dialect. The draw is
# seeded so the subsample -- and therefore the seeded split below -- contains
# the same rows on every run.
limit = 5000
mini = data.groupby('dialect').sample(n=limit, random_state=42)
feature = mini.text
target = mini.dialect

# # splitting the data into target and feature
# # running out of memory every time -> sol: Out Of Core (partial_fit)
# # Out Of Core + Stratified KFold, maybe coming soon
# feature = data.text
# target = data.label

# Hold out 10% of the subsample for testing.
X_train, X_test, y_train, y_test = train_test_split(feature, target,
                                                    test_size=.1, random_state=42)

# # splitting into train and validation
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
#                                                   test_size =.1, random_state=42)

def fit_model(pipeline, parameters, random_search, itr=0, interactive=1, X_train=X_train,
              y_train=y_train, X_test=X_test, y_test=y_test):
    """Fit a hyper-parameter search, optionally report on it, and save the best model.

    Args:
        pipeline: Pipeline wrapped by ``random_search``. Its step names are
            printed, and the class name of its ``clf`` step is used in the
            output file name.
        parameters: Search-space dict. It is printed, and its keys select
            which of the best estimator's parameters are reported.
        random_search: Search object providing ``fit``, ``predict``,
            ``best_score_`` and ``best_estimator_``.
        itr: Suffix appended to the saved file name.
        interactive: When truthy, print the search configuration before
            fitting and a results report afterwards.
        X_train, y_train: Data the search is fitted on. The defaults are the
            module-level split as it existed when this function was defined.
        X_test, y_test: Held-out data used for the printed report.

    Returns:
        None. The best estimator is written to ``<clf class name>_<itr>.pkl``.
    """
    if interactive:
        print(f'Performing {random_search.__class__.__name__} ...')
        print('pipeline:', [name for name, _ in pipeline.steps])
        print('parameters:')
        pprint(parameters)

    t0 = time()
    random_search.fit(X_train, y_train)
    # The timing line is printed regardless of `interactive`.
    print(f'done in {time() - t0:0.3f}s')
    print()

    if interactive:
        print(f'Best score: {random_search.best_score_:0.3f}')
        print('Best parameters set:')
        best_parameters = random_search.best_estimator_.get_params()
        for param_name in sorted(parameters):
            print(f'\t{param_name}: {best_parameters[param_name]!r}')

        # Predict on the held-out split. The metric is *balanced* accuracy,
        # so the printed label says so.
        prediction = random_search.predict(X_test)
        print('Balanced accuracy score is '
              f'{balanced_accuracy_score(y_test, prediction):.2f}')

        print(classification_report(y_test, prediction, labels=ordered_dialects_ids))

    # Persist the refitted best estimator; `itr` keeps repeated runs from
    # overwriting each other's files.
    joblib.dump(random_search.best_estimator_,
                f'{pipeline.named_steps.clf.__class__.__name__}_{itr}.pkl',
                compress=1)

## SGDClassifier 

In [None]:
def random_search_fit_sgd(random_state=42, n_iter=1, itr=0, interactive=1):
    """Run a randomized search over a bag-of-n-grams + TF-IDF + SGD pipeline.

    Args:
        random_state: Seed passed to the randomized search.
        n_iter: Number of parameter settings the search samples.
        itr: Forwarded to ``fit_model`` (suffix of the saved model file).
        interactive: Forwarded to ``fit_model`` (enables its printed report).

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    text_clf = Pipeline(steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ])

    # Candidate values, keyed by `<step name>__<parameter name>`.
    search_space = {
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 2), (1, 3), (1, 4), (1, 5), (1, 6)),
        'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', 'l2'),
        'clf__max_iter': (20, 50, 80, 100, 150),
        'clf__alpha': (0.00003, 0.0001, 0.0003, 0.001, 0.003, 0.01),
        'clf__penalty': ('l2', 'elasticnet'),
    }

    search = RandomizedSearchCV(
        estimator=text_clf,
        param_distributions=search_space,
        n_iter=n_iter,
        scoring='f1_macro',
        cv=5,
        n_jobs=-1,
        verbose=1,
        random_state=random_state,
    )

    # Fitting, reporting and saving are all delegated to `fit_model`.
    fit_model(text_clf, search_space, search, itr=itr, interactive=interactive)

    return search

# Run one SGD search with the default seed and iteration count. interactive=0
# skips the configuration/result report that `fit_model` prints otherwise.
random_search_sgd = random_search_fit_sgd(interactive=0)

In [None]:
# Evaluate the best SGD pipeline on the held-out split: a per-class report,
# then a confusion matrix with classes listed in `ordered_dialects_ids` order.
preds = random_search_sgd.best_estimator_.predict(X_test)
print(classification_report(y_test, preds, labels=ordered_dialects_ids))

fig, ax = plt.subplots(figsize=(14, 10))
# The trailing semicolon keeps the notebook from echoing the returned object.
ConfusionMatrixDisplay.from_estimator(
    random_search_sgd.best_estimator_,
    X_test,
    y_test,
    labels=ordered_dialects_ids,
    normalize='true',
    values_format='.2f',
    ax=ax,
);

## Random Forest Classifier
It performed quite badly, so it was dropped.

In [None]:
def random_search_fit_rf(random_state=42, n_iter=1, itr=0, interactive=1):
    """Run a randomized search over a bag-of-n-grams + TF-IDF + random-forest pipeline.

    Args:
        random_state: Seed passed to the randomized search.
        n_iter: Number of parameter settings the search samples.
        itr: Forwarded to ``fit_model`` (suffix of the saved model file).
        interactive: Forwarded to ``fit_model`` (enables its printed report).

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    # make pipeline
    pipeline = Pipeline(
        [
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', RandomForestClassifier()),
        ]
    )

    # make param grid
    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 2), (1, 3), (1, 4), (1, 5), (1, 6)),
        'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', 'l2'),
        'clf__criterion': ('gini', 'entropy'),
        'clf__max_depth': list(range(10, 110, 10)) + [None],
        # 'auto' was only an alias of 'sqrt' for classifiers and is rejected by
        # recent scikit-learn releases, so list the explicit options instead.
        'clf__max_features': ['sqrt', 'log2'],
        'clf__min_samples_leaf': [1, 2, 4],
        'clf__min_samples_split': [2, 5, 10],
    }

    # create and fit the model; honour the caller's n_iter / random_state
    # instead of hard-coding them.
    random_search_rf = RandomizedSearchCV(pipeline, parameters, cv=5, verbose=1,
                                          scoring='f1_macro', n_iter=n_iter, n_jobs=-1,
                                          random_state=random_state)

    fit_model(pipeline, parameters, random_search_rf, itr=itr, interactive=interactive)

    return random_search_rf

# Run one random-forest search with the default arguments. interactive=0
# skips the configuration/result report that `fit_model` prints otherwise.
random_search_rf = random_search_fit_rf(interactive=0)

In [None]:
# Evaluate the best random-forest pipeline on the held-out split: a per-class
# report, then a confusion matrix with classes in `ordered_dialects_ids` order.
preds = random_search_rf.best_estimator_.predict(X_test)
print(classification_report(y_test, preds, labels=ordered_dialects_ids))

fig, ax = plt.subplots(figsize=(14, 10))
# The trailing semicolon keeps the notebook from echoing the returned object.
ConfusionMatrixDisplay.from_estimator(
    random_search_rf.best_estimator_,
    X_test,
    y_test,
    labels=ordered_dialects_ids,
    normalize='true',
    values_format='.2f',
    ax=ax,
);

## Voting Classifier
Train 5 models on 5 folds, where each fold is a sample of 5k examples from every dialect.

In [None]:
# Train `num` SGD pipelines to be combined by the voting ensemble.
num = 5
SGDs = []
for i in range(num):
    print('-'*10, f'{i+1} / {num}', '-'*10)
    # Offset the seed per model. With one shared seed every search would draw
    # the same hyper-parameters and the ensemble members would be
    # near-duplicates. The first model still uses the original seed (42).
    sgd = random_search_fit_sgd(random_state=42 + i, itr=i, interactive=0)
    SGDs.append(sgd.best_estimator_)

# One label per trained model. Deriving the range from `num` keeps `zip` from
# silently dropping models when `num` is changed.
SGDLbls = [f'SGDClassifier_{x}' for x in range(1, num + 1)]
for clf, label in zip(SGDs, SGDLbls):
    # 5-fold cross-validated macro-F1 of each pipeline on the held-out split.
    scores = cross_val_score(clf, X_test, y_test, error_score='raise',
                             scoring='f1_macro', cv=5)
    print(f'F1 score: {scores.mean():0.2f} (+/- {scores.std():0.2f}) [{label}]')

#### I've already trained/fit my models and don't want to retrain them, so I collect the trained model objects in a list, fit the ensembler's LabelEncoder, and save the ensembler as a pkl file. Saving it automatically writes all the trained models with it; later, for inference, just load the file and use it.
#### The ensembler expects the predictions of the base estimators to be ints, and we have already encoded the classes as numbers. It calls predict on every base estimator for the given inputs and returns the majority answer for each input.

In [None]:
# VotingClassifier expects `estimators` as (name, estimator) pairs. Passing
# bare estimators breaks get_params()/repr on the ensemble.
eclf = VotingClassifier(
    estimators=[(f'sgd_{idx}', est) for idx, est in enumerate(SGDs)],
    voting='hard',
)

# Mark the ensemble as fitted without re-training: plug in the already-fitted
# pipelines and fit only the label encoder it uses to decode the majority vote.
eclf.estimators_ = SGDs
eclf.le_ = LabelEncoder().fit(y_train)
eclf.classes_ = eclf.le_.classes_

# This will likely be a big file: it contains every fitted model (5 here),
# each ~ 120mb
print(joblib.dump(eclf, f'{eclf.__class__.__name__}.pkl', compress=1))
# Reload from disk to confirm the saved ensemble can be restored.
eclf = joblib.load('VotingClassifier.pkl')