In [None]:
# Install the scikit-learn-intelex package; pip's stdout is appended to a log
# file instead of being shown in the cell output.
!pip install scikit-learn-intelex >> /tmp/pip_sklearnex.log

# Patch scikit-learn via sklearnex, with verbose output turned off.
# NOTE(review): this runs before the cell that imports the sklearn estimators —
# presumably required for the patched implementations to be used; confirm
# against the sklearnex documentation.
from sklearnex import patch_sklearn
patch_sklearn(verbose=False)

In [None]:
import re, regex, os, sys, random, gc, logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Raise the root logger's threshold to WARNING.
logging.getLogger().setLevel(logging.WARNING)

SEED = 42


def seed_everything(seed=SEED):
    """Apply one seed to every global source of randomness used here.

    Seeds Python's ``random`` module and NumPy's global generator, and
    stores the seed (as a string) in the ``PYTHONHASHSEED`` environment
    variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything()

# Kaggle input directory of the datasets.
INPUT_PATH = '/kaggle/input/translit-datasets/'

# Dataset

In [None]:
# Insert appropriate dataset reading code from dataset_readers.py

def read_dataset():
    """Placeholder for the dataset reader; replace with a real implementation.

    The notebook unpacks the result as the 5-tuple
    ``(train_df, test_df, label_names, dataset_name, text_col)``, where both
    frames expose a categorical ``label`` column.

    Raises:
        NotImplementedError: always, until a reader is pasted in. Failing here
            is clearer than returning ``None`` and hitting an unpacking error
            in the next cell.
    """
    raise NotImplementedError(
        "read_dataset() is a placeholder: paste the appropriate reader from "
        "dataset_readers.py; it must return "
        "(train_df, test_df, label_names, dataset_name, text_col)."
    )

In [None]:
# Load both splits and the dataset metadata from the reader defined above.
train_df, test_df, label_names, dataset_name, text_col = read_dataset()

# Swap the categorical labels for their integer codes.
# NOTE(review): the two splits are encoded independently — this assumes they
# share the same category order; confirm in read_dataset.
train_df.label = train_df.label.cat.codes
test_df.label = test_df.label.cat.codes

# compute_class_weight returns one weight per entry of `classes`, in that order,
# so key each weight by its class code rather than by its position: the two only
# coincide when every code 0..k-1 occurs in the training split.
train_classes = np.unique(train_df['label'])
class_weights = {
    int(code): float(weight)
    for code, weight in zip(
        train_classes,
        compute_class_weight(
            class_weight="balanced",
            classes=train_classes,
            y=train_df['label']
        ),
    )
}

pd.set_option('display.max_colwidth', 200)
display(train_df.head())
display(test_df.head())

print(f'{len(train_df)=}, {len(test_df)=}')
print(dataset_name, label_names)

# Class distribution of the training split. minlength keeps one bar per label
# even when the highest codes never occur in train; the trailing ';' hides the
# bar-container repr.
plt.figure(figsize=(6,2))
plt.bar(x=label_names, height=np.bincount(train_df['label'], minlength=len(label_names)));

# Feature Extraction

In [None]:
# Matches runs of characters from the Unicode general categories C (control /
# format), M (marks), P (punctuation), S (symbols) and Z (separators).
# The '|' characters sit inside the brackets, so they are literal pipes rather
# than alternation.
remove_punc = regex.compile(r'[\p{C}|\p{M}|\p{P}|\p{S}|\p{Z}]+', regex.UNICODE)

def cleaner(text):
    """Normalise a raw document before it is tokenised by the vectorizers.

    Steps: lowercase, replace every run matched by ``remove_punc`` with a
    single space, trim both ends, then collapse any remaining whitespace
    runs into single spaces.

    Args:
        text: the raw document string.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = remove_punc.sub(" ", text).strip()
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)
    return text

# Bag-of-words term counts; documents are passed through `cleaner` first and
# min_df=2 is applied as the document-frequency cutoff.
count_vectorizer = CountVectorizer(min_df=2, preprocessor=cleaner)

# TF-IDF features built with the same preprocessing and cutoff.
tfidf_vectorizer = TfidfVectorizer(min_df=2, preprocessor=cleaner)

# Trainer code

In [None]:

def _declares_param(func, name: str, default: bool) -> bool:
    """Tell whether ``func`` lists a parameter called ``name``.

    Returns ``default`` when the signature cannot be introspected.
    """
    import inspect  # local import: not part of the notebook's import cell

    try:
        return name in inspect.signature(func).parameters
    except (TypeError, ValueError):
        return default


def fit_predict_evaluate(
    clf, vectorizer,
    train_df, test_df,
    params: dict,
    seed: int = 42,
    text_col: str = text_col,
    dataset_name: str = dataset_name,
    labels_names: list[str] = label_names,
    class_weights: dict = class_weights,
):
    """Vectorize the text, train a classifier and display its test metrics.

    Args:
        clf: estimator class (not an instance); it is instantiated here with
            ``random_state=seed`` and ``**params``.
        vectorizer: vectorizer instance; it is fitted on the training text and
            then applied to the test text.
        train_df, test_df: frames holding a ``label`` column and ``text_col``.
        params: extra keyword arguments for the estimator constructor.
        seed: value passed as ``random_state``.
        text_col: name of the text column.
        dataset_name: currently unused.
        labels_names: display names of the classes; label code ``i`` is
            reported under ``labels_names[i]``.
        class_weights: mapping from label code to weight. It is passed as
            ``class_weight`` when the estimator's constructor declares that
            parameter; otherwise (e.g. ``XGBClassifier``, whose constructor
            has no ``class_weight``) it is converted to per-sample weights
            and passed to ``fit`` if ``fit`` declares ``sample_weight``.

    Returns:
        None. The classification report is printed and the confusion matrix
        is displayed.
    """
    # Make train and test data
    y_train, y_test = train_df['label'], test_df['label']
    X_train = vectorizer.fit_transform(train_df[text_col])
    X_test  = vectorizer.transform(test_df[text_col])

    # Fit, predict
    fit_kwargs = {}
    if _declares_param(clf, 'class_weight', default=True):
        clf = clf(random_state=seed, class_weight=class_weights, **params)
    else:
        # Passing class_weight to an estimator that does not declare it would
        # not weight anything, so express the weights per training sample.
        clf = clf(random_state=seed, **params)
        if isinstance(class_weights, dict) and _declares_param(clf.fit, 'sample_weight', default=False):
            # Labels without an entry in the mapping get a neutral weight.
            fit_kwargs['sample_weight'] = y_train.map(class_weights).fillna(1.0).to_numpy()
    clf = clf.fit(X_train, y_train, **fit_kwargs)
    y_pred = clf.predict(X_test)

    # Display report
    # Pin the label set so a class missing from the test split cannot
    # desynchronise the rows from labels_names.
    label_ids = list(range(len(labels_names)))
    print(classification_report(
        y_test, y_pred, labels=label_ids, target_names=labels_names, digits=5
    ))
    display(ConfusionMatrixDisplay.from_predictions(
        y_test, y_pred, labels=label_ids, display_labels=labels_names
    ))

# Bag of Words

## Logistic Regression

In [None]:
# Logistic regression on bag-of-words counts, default hyperparameters.
# Commented-out overrides kept for reference: 'solver': 'sag', 'max_iter': 200
fit_predict_evaluate(
    clf=LogisticRegression,
    vectorizer=count_vectorizer,
    train_df=train_df,
    test_df=test_df,
    params={},
)

## Support Vector Classifier

In [None]:
# Support vector classifier on bag-of-words counts, default hyperparameters.
fit_predict_evaluate(
    clf=SVC,
    vectorizer=count_vectorizer,
    train_df=train_df,
    test_df=test_df,
    params={},
)

## Random Forest

In [None]:
# Random forest on bag-of-words counts, default hyperparameters.
# Commented-out overrides kept for reference: 'n_estimators': 150, 'max_depth': 9
fit_predict_evaluate(
    clf=RandomForestClassifier,
    vectorizer=count_vectorizer,
    train_df=train_df,
    test_df=test_df,
    params={},
)

## XGBoost

In [None]:
# XGBoost on bag-of-words counts, default hyperparameters.
# Commented-out overrides kept for reference: 'n_estimators': 150, 'max_depth': 4
fit_predict_evaluate(
    clf=XGBClassifier,
    vectorizer=count_vectorizer,
    train_df=train_df,
    test_df=test_df,
    params={},
)

# TF-IDF

## Logistic Regression

In [None]:
# Logistic regression on TF-IDF features, default hyperparameters.
# Commented-out overrides kept for reference: 'solver': 'sag', 'max_iter': 200
fit_predict_evaluate(
    clf=LogisticRegression,
    vectorizer=tfidf_vectorizer,
    train_df=train_df,
    test_df=test_df,
    params={},
)

## Support Vector Classifier

In [None]:
# Support vector classifier on TF-IDF features, default hyperparameters.
fit_predict_evaluate(
    clf=SVC,
    vectorizer=tfidf_vectorizer,
    train_df=train_df,
    test_df=test_df,
    params={},
)

## Random Forest

In [None]:
# Random forest on TF-IDF features, default hyperparameters.
# Commented-out overrides kept for reference: 'n_estimators': 150, 'max_depth': 9
fit_predict_evaluate(
    clf=RandomForestClassifier,
    vectorizer=tfidf_vectorizer,
    train_df=train_df,
    test_df=test_df,
    params={},
)

## XGBoost

In [None]:
# XGBoost on TF-IDF features, default hyperparameters.
# Commented-out overrides kept for reference: 'n_estimators': 150, 'max_depth': 4
fit_predict_evaluate(
    clf=XGBClassifier,
    vectorizer=tfidf_vectorizer,
    train_df=train_df,
    test_df=test_df,
    params={},
)