# Applied Project in Big Data on Industrial Dataset

## MODELS SELECTION TECHNIQUES
## Part I. Vanilla pipeline

### 1. Libraries

In [None]:
import os
import re
import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import multiprocessing
from multiprocessing import Pool
from sklearn.feature_extraction.text import (
    TfidfVectorizer, 
    CountVectorizer
)
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score, 
    confusion_matrix, 
    precision_score, 
    recall_score, 
    f1_score,
    roc_curve, 
    auc,
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_recall_curve,
    PrecisionRecallDisplay
)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (
    cross_val_score, 
    train_test_split,
    StratifiedKFold
)
pd.set_option('display.max_columns', None)
# Number of worker processes for the parallel preprocessing further down:
# bounded by the machine's CPU count and, when it is set, by the CPU_LIMIT
# environment variable (which may hold a fractional value such as "1.5").
_cpu_count = multiprocessing.cpu_count()
# Fall back to the CPU count when CPU_LIMIT is not defined instead of
# failing with KeyError outside the container environment.
_cpu_limit = int(float(os.environ.get('CPU_LIMIT', _cpu_count)))
# A limit below 1 (e.g. "0.5") truncates to 0, which Pool() rejects,
# so at least one worker is always kept.
N_CORES = max(1, min(_cpu_count, _cpu_limit))
print('cores:', N_CORES)

### 2. Raw dataset

#### 2.1. Load data

In [None]:
data_path = '/home/jovyan/apbdid_24/topic_2/articles_data/arcicles_dataset.csv'

In [None]:
df = pd.read_csv(f'{data_path}', sep=',', index_col=0)
#del df['index']
print(df.shape)
display(df.head())

In [None]:
# Build one 'text' field per row by joining the title and the annotation with
# a space; str() is applied to both values so that ' '.join also works when a
# cell does not hold a string.
df['text'] = df.apply(
    lambda row: ' '.join([str(row['title']), str(row['annotation'])]), 
    axis=1
)
# The source columns are dropped once they have been merged into 'text'.
del df['title'], df['annotation']
df.head()

In [None]:
df_gr = df.groupby('target').count()
df_gr

#### 2.2. Simplify our task

In [None]:
df_gr[df_gr.url == 1000]

In [None]:
targets = df_gr[df_gr.url > 900].index
list(targets)

In [None]:
# Reduce the task to two classes: the labels at positions 1 and 2 of
# `targets` are re-encoded as 0 and 1; every other label is absent from the
# mapping, becomes NaN, and its rows are filtered out on the next line.
df['target'] = df['target'].map({v:k for k, v in enumerate(targets[1:3])})
df = df[df['target'].notna()]
df.head()

In [None]:
df.groupby('target').count()

### 3. Basic NLP preprocessing

In [None]:
import pymorphy2 as pm
import nltk
import multiprocessing
from multiprocessing import Pool

# Language whose stop-word list is loaded below.
LANG = 'russian'
# Morphological analyzer; preprocessing() uses it to obtain the normal form
# and the part-of-speech tag of every token.
MORPH = pm.MorphAnalyzer()
# Make sure the NLTK stop-word corpus is available before reading it.
nltk.download('stopwords')
# Stop words removed from the lemmatized texts in preprocessing().
STOPWORDS = nltk.corpus.stopwords.words(LANG)

In [None]:
def preprocessing(sentence, as_list=False):
    """Clean, lemmatize and filter a single text.

    Steps: remove ``<b>``/``</b>`` markers, replace every run of characters
    that are not Cyrillic or Latin letters with one space, lower-case, fold
    ``ё`` into ``е``, take the normal form of each token from the first
    parse returned by the module-level ``MORPH`` analyzer, drop tokens whose
    part of speech is one of INTJ, PRCL, CONJ, PREP, and drop lemmas listed
    in the module-level ``STOPWORDS``.

    Args:
        sentence: Raw text to process (a ``str``).
        as_list: If true, return the lemmas as a list; otherwise return them
            joined by single spaces.

    Returns:
        A list of lemmas when ``as_list`` is true, else one string.
    """
    function_pos = {'INTJ', 'PRCL', 'CONJ', 'PREP'}
    s = sentence.replace('<b>', '').replace('</b>', '')
    # 'ё' and 'Ё' lie outside the 'а-я' / 'А-Я' code-point ranges, so they
    # are listed explicitly. Otherwise they count as separators and a word
    # such as 'ещё' is cut apart before the 'ё' -> 'е' folding can happen.
    s = re.sub('[^а-яёА-ЯЁa-zA-Z]+', ' ', s).strip().lower()
    s = s.replace('ё', 'е')
    parses = (MORPH.parse(token)[0] for token in s.split())
    result = [
        parse.normal_form
        for parse in parses
        if parse.tag.POS not in function_pos
        and parse.normal_form not in STOPWORDS
    ]
    if as_list:
        return result
    return ' '.join(result)
    
def apply_parallel(texts, func, n_cores=2):
    """Apply ``func`` to chunks of ``texts`` in parallel worker processes.

    ``texts`` is split into ``n_cores`` chunks with ``np.array_split``;
    ``func`` is called once per chunk and must return an iterable of
    results for that chunk. The per-chunk results are concatenated in chunk
    order.

    Args:
        texts: Sequence of items to process.
        func: Callable mapping one chunk to an iterable of results; it is
            sent to the worker processes by ``Pool.map``.
        n_cores: Number of worker processes and of chunks. Values below 1
            are treated as 1.

    Returns:
        Flat list with the results of all chunks.
    """
    # Pool() refuses a process count below 1; clamp instead of crashing.
    n_cores = max(1, int(n_cores))
    chunks = np.array_split(texts, n_cores)
    # The context manager shuts the pool down even when ``func`` raises,
    # so a failed run does not leave worker processes behind.
    with Pool(n_cores) as pool:
        per_chunk = pool.map(func, chunks)
    return [item for chunk in per_chunk for item in chunk]

def preprocessing_list(sentences):
    """Run ``preprocessing`` on every item of ``sentences``; return a list."""
    return list(map(preprocessing, sentences))

In [None]:
%%time
proc = apply_parallel(
    df.text, 
    preprocessing_list, 
    n_cores=N_CORES
)

In [None]:
df.loc[:, 'proc'] = proc
print(df.shape)
display(df.head())

In [None]:
df[['target', 'proc']].to_csv('articles_data.csv', index=None)

In [None]:
df = pd.read_csv('articles_data.csv')
# A text whose tokens were all filtered out is stored as an empty CSV field
# and is read back as NaN; restore the empty string so that the vectorizers
# used later always receive str documents.
df['proc'] = df['proc'].fillna('')
display(df.head())

### 3. Just make a model

In [None]:
MAX_DF = .95
MIN_DF = 5

In [None]:
def text_features(data, vectorizer):
    """Fit ``vectorizer`` on ``data`` and return the features it produces.

    Prints the number of texts and, after fitting, the shape plus the
    largest and smallest value of the feature matrix.

    Args:
        data: Collection of texts passed to ``vectorizer.fit_transform``.
        vectorizer: Object exposing ``fit_transform``; it is fitted in place.

    Returns:
        Tuple ``(features, vectorizer)`` with the fitted vectorizer.
    """
    print('total texts:', len(data))
    matrix = vectorizer.fit_transform(data)
    summary = (
        'features shape:', matrix.shape,
        'max:', np.max(matrix),
        'min:', np.min(matrix),
    )
    print(*summary)
    return matrix, vectorizer

In [None]:
%%time
vectorizer=TfidfVectorizer(
    ngram_range=(1, 1), 
    max_df=MAX_DF, 
    min_df=MIN_DF
)
features, vectorizer = text_features(
    df['proc'], 
    vectorizer=vectorizer
)

In [None]:
print(features.todense())

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split the same between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['proc'], 
    df['target'], 
    test_size=.3, 
    random_state=2022
)
# text_features() calls fit_transform, so the vectorizer is re-fitted here on
# the training texts only.
X_train, vectorizer = text_features(
    X_train, 
    vectorizer=vectorizer
)
# The test texts are only transformed with the vectorizer fitted above.
X_test = vectorizer.transform(X_test)
# Baseline classifier with default parameters.
clf = LogisticRegression()
clf.fit(X_train, y_train)
# Predicted class labels for the held-out texts.
y_pred = clf.predict(X_test)

In [None]:
clf.get_params()

In [None]:
y_pred

In [None]:
roc_auc_score(y_test, y_pred)

In [None]:
y_score = clf.predict_proba(X_test)

In [None]:
roc_auc_score(y_test, y_score[:, 1])

In [None]:
# Turn the probabilities in column 1 of y_score into 0/1 labels at the
# thresholds 0.1, 0.2, ..., 0.9 and print the ROC AUC of each label vector,
# to show how the score of hard predictions depends on the chosen cut-off.
for th in range(1, 10):
    print(
        'threshold = ', th / 10,
        '| ROC AUC score = ', roc_auc_score(
            y_test, 
            [1 if x > (th / 10) else 0 for x in y_score[:, 1]])
    )

In [None]:
# Scikit-learn Example of Receiver Operating Characteristic (ROC) 
# metric to evaluate classifier output quality.
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

fpr, tpr, thresholds = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
roc_auc

In [None]:
f1_score(y_test, y_pred)

In [None]:
plt.figure()
lw = 2
plt.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=lw,
    label='ROC curve (area = %0.2f)' % roc_auc,
)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc='lower right')
plt.show()

In [None]:
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(cm).plot()

In [None]:
cm

In [None]:
prec, recall, _ = precision_recall_curve(
    y_test,
    y_score[:, 1], 
    pos_label=clf.classes_[1]
)
pr_display = PrecisionRecallDisplay(precision=prec, recall=recall).plot()

### 4. More advanced model approach: cross-validation

In [None]:
def cross_val_model(
    X,
    y,
    folds,
    clf,
    vectorizer,
    ngram_range=(1, 1),
    max_df=.2,
    min_df=8,
    seed=2022,
):
    """Evaluate a text classifier with shuffled stratified k-fold ROC AUC.

    For every fold the vectorizer is re-fitted on the training texts through
    ``text_features``, the held-out texts are transformed with it, ``clf``
    is fitted on the training features, and ROC AUC is computed from column
    1 of ``clf.predict_proba`` on the held-out features. The score of each
    fold is printed.

    Args:
        X: Texts; rows are selected positionally with ``.iloc``.
        y: Labels; rows are selected positionally with ``.iloc``.
        folds: Number of splits.
        clf: Classifier exposing ``fit`` and ``predict_proba``; it is
            re-fitted in place on every fold.
        vectorizer: Vectorizer exposing ``fit_transform`` and ``transform``;
            it is re-fitted in place on every fold.
        ngram_range: Accepted but never read by this function.
        max_df: Accepted but never read by this function.
        min_df: Accepted but never read by this function.
        seed: ``random_state`` of the splitter.

    Returns:
        List with one ROC AUC value per fold, in fold order.
    """
    splitter = StratifiedKFold(n_splits=folds, random_state=seed, shuffle=True)
    fold_scores = []
    for fold_no, (fit_idx, val_idx) in enumerate(splitter.split(X, y)):
        # Fit the vectorizer on this fold's training part only.
        train_matrix, vectorizer = text_features(
            X.iloc[fit_idx],
            vectorizer=vectorizer,
        )
        val_matrix = vectorizer.transform(X.iloc[val_idx])
        clf.fit(train_matrix, y.iloc[fit_idx])
        val_proba = clf.predict_proba(val_matrix)
        fold_auc = roc_auc_score(y.iloc[val_idx], val_proba[:, 1])
        print(f'fold {fold_no} val score: {fold_auc:.2f}')
        fold_scores.append(fold_auc)
    return fold_scores

In [None]:
vectorizer=TfidfVectorizer(
    ngram_range=(1, 1), 
    max_df=MAX_DF, 
    min_df=MIN_DF
)
cross_val_model(
    X=df['proc'], 
    y=df['target'], 
    folds=5, 
    clf=LogisticRegression(),
    vectorizer=vectorizer,
    seed=2022
)

Let's explore the options for managing your experiments...