# Applied Project in Big Data on Industrial Dataset

## MODEL SELECTION TECHNIQUES
## Part IV. Weights and Biases platform to manage experiments

### 1. Libraries

In [None]:
!pip install wandb

In [None]:
import os
import re
import json
import time
import wandb
import random
import datetime
import numpy as np
import pandas as pd
from joblib import dump, load
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import multiprocessing
from multiprocessing import Pool
from sklearn.feature_extraction.text import (
    TfidfVectorizer, 
    CountVectorizer
)
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score, 
    confusion_matrix, 
    precision_score, 
    recall_score, 
    f1_score,
    roc_curve, 
    auc
)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier, 
    GradientBoostingClassifier
)
from sklearn.model_selection import (
    cross_val_score, 
    train_test_split,
    StratifiedKFold
)
# show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Number of cores to use: the machine's CPU count, capped by the optional
# CPU_LIMIT environment variable. The variable is parsed through float(),
# so values such as "1.5" are accepted. When it is not set we fall back to
# the CPU count instead of failing with KeyError, and the result is clamped
# to at least 1 so a limit below one core does not yield zero cores.
N_CORES = max(
    1,
    min(
        multiprocessing.cpu_count(),
        int(float(os.environ.get('CPU_LIMIT', multiprocessing.cpu_count()))),
    ),
)
print('cores:', N_CORES)

### 2. Create config and place to store artifacts

We will extend our approach by using the [WandB](https://wandb.ai) platform.

In [None]:
start_time = time.time()

# Experiment configuration.
# It drives the vectorizer and model parameters below and is also
# written to disk so that every run keeps a record of its settings.
VER = 'v0'
CONFIG = {
    'version': VER,
    'start_time': str(datetime.datetime.fromtimestamp(start_time)),
    'sample_size': 1000,
    'ngram_range': (1, 1),
    'max_df': .95,
    'min_df': 5,
    'clf': 'GradientBoostingClassifier', # 'RandomForestClassifier' or `GradientBoostingClassifier`
    'folds': 4,
    'seed': 2023,
    'n_iters': 10,
    'comments': 'my first model'
}

# Folder for the artifacts of this version (models, vectorizers, config).
# `exist_ok=True` makes re-running the cell safe without a separate
# existence check (which is race-prone).
MDLS_PATH = f'./models_{VER}'
os.makedirs(MDLS_PATH, exist_ok=True)
with open(f'{MDLS_PATH}/config.json', 'w', encoding='utf-8') as file:
    # note: JSON has no tuple type, so `ngram_range` is stored as a list
    json.dump(CONFIG, file)

# useful trick to fix randomness
def seed_all(seed):
    """
    Fix the random seeds used by the current pipeline for reproducibility.

    Seeds Python's `random` module and NumPy's global generator, and
    stores the seed in the `PYTHONHASHSEED` environment variable.

    Other libraries keep their own generators and need an extra call,
    e.g. `tf.random.set_seed(seed)` for Tensorflow.

    NOTE(review): `PYTHONHASHSEED` is presumably only read when an
    interpreter starts, so setting it here would affect child processes
    rather than the running kernel - confirm if hash stability matters.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed):
        set_seed(seed)

# seed the random generators with the value stored in CONFIG
seed_all(CONFIG['seed'])

In [None]:
# Alternatively, run `wandb.login()` in the notebook
# or `wandb login` in a terminal and enter the key interactively.

# The API key is read from a file in a hard-coded home directory;
# adjust the path when running outside that environment.
with open('/home/jovyan/.wandb', 'r') as file:
    # strip() drops the trailing newline as well as stray spaces or `\r`
    # that would otherwise end up inside the key
    api_key = file.read().strip()
wandb.login(key=api_key)

### 2. Dataset for modelling

In [None]:
df = pd.read_csv('articles_data.csv')

# Draw the working sample.
# - `n` is capped by the number of rows, so a `sample_size` larger than
#   the file does not raise;
# - `random_state` makes the sample independent of how much of the global
#   NumPy random state earlier cells have consumed;
# - `reset_index(drop=True)` renumbers the rows without creating a
#   temporary `index` column (deleting that column afterwards would
#   remove a real CSV column if one happened to be named `index`).
df = df.sample(
    n=min(CONFIG['sample_size'], len(df)),
    random_state=CONFIG['seed'],
).reset_index(drop=True)
print(df.shape)
display(df.head())

In [None]:
# per-label count of non-null values in each of the other columns
df.groupby('label').count()

### 3. Modelling and saving the results with WandB

In [None]:
def text_features(data, vectorizer):
    """
    Fit `vectorizer` on `data` and return the resulting features.

    Prints the number of texts and the shape, maximum and minimum
    of the produced feature matrix.

    Returns a `(features, vectorizer)` pair, where `vectorizer` is the
    same object that was passed in, now fitted on `data`.
    """
    n_texts = len(data)
    print('total texts:', n_texts)

    matrix = vectorizer.fit_transform(data)
    report = (
        'features shape:', matrix.shape,
        'max:', np.max(matrix),
        'min:', np.min(matrix),
    )
    print(*report)

    return matrix, vectorizer

In [None]:
def cross_val_model(X, y, 
                    folds, clf,
                    vectorizer, ngram_range=(1, 1), 
                    max_df=.2, min_df=8, seed=2022):
    """
    Cross-validate a text classifier and log every fold to Weights & Biases.

    For each stratified fold the function:
      1. fits `vectorizer` on the training texts and `clf` on the
         resulting features (both objects are refitted in place);
      2. saves the fitted classifier and vectorizer to `MDLS_PATH`;
      3. computes ROC-AUC and F1 on the held-out texts;
      4. opens a separate W&B run that receives `CONFIG`, the
         classifier plots, the two metrics and the training texts
         as a dataset artifact.

    Parameters
    ----------
    X : texts, indexed positionally with `.iloc`
    y : labels, indexed positionally with `.iloc`; column 1 of
        `predict_proba` is scored against them
    folds : number of splits for `StratifiedKFold`
    clf : classifier with `fit`, `predict` and `predict_proba`
    vectorizer : object with `fit_transform`, `transform` and
        `get_feature_names_out`
    ngram_range, max_df, min_df : not used by this function; kept so
        that existing calls keep working. The vectorizer has to be
        configured by the caller.
    seed : random state of the shuffled `StratifiedKFold`

    Returns
    -------
    dict mapping `'fold <k>'` to
    `{'roc_auc_scores': [...], 'f1_scores': [...]}`.

    NOTE(review): every entry references the same two lists, so after
    the loop each fold key shows the scores of all folds. This is kept
    unchanged to preserve the returned structure - confirm with the
    consumers of `scores` before switching to per-fold values.
    """
    scores = {}
    roc_auc_scores = []
    f1_scores = []
    skf = StratifiedKFold(n_splits=folds, random_state=seed, shuffle=True)
    for fold, (train_idxs, test_idxs) in enumerate(skf.split(X, y)):

        # train model
        # (the vectorizer is fitted on the training part only,
        # the held-out texts are just transformed)
        X_train, X_test = X.iloc[train_idxs], X.iloc[test_idxs]
        y_train, y_test = y.iloc[train_idxs], y.iloc[test_idxs]
        X_train, vectorizer = text_features(
            X_train,
            vectorizer=vectorizer
        )
        X_test = vectorizer.transform(X_test)
        clf.fit(X_train, y_train)

        # saving models
        # more about https://scikit-learn.org/stable/model_persistence.html
        # NOTE - not only model, but vectorizer too!
        for fitted, prefix in ((clf, 'model'), (vectorizer, 'vectorizer')):
            file_name = f'{MDLS_PATH}/{prefix}_fold_{fold}.joblib'
            dump(fitted, file_name)
            print('saved to', file_name)

        # validation metrics of the current fold
        y_score = clf.predict_proba(X_test)
        roc_auc_score_ = roc_auc_score(y_test, y_score[:, 1])
        roc_auc_scores.append(roc_auc_score_)
        y_pred = clf.predict(X_test)
        f1_score_ = f1_score(y_test, y_pred)
        f1_scores.append(f1_score_)
        msg = f'fold {fold} - val ROC-AUC score: {roc_auc_score_:.2f}, val f1-score: {f1_score_:.2f}'
        print(msg)

        scores[f'fold {fold}'] = {
            'roc_auc_scores': roc_auc_scores,
            'f1_scores': f1_scores
        }

        # One W&B run per fold.
        # - The configuration is passed to `wandb.init`; assigning to
        #   `wandb.config` only rebinds the module attribute and does
        #   not attach the values to the run.
        # - The `with` block finishes the run when the fold is done
        #   (also when an exception is raised), so no run is left open.
        with wandb.init(
            project='articles',
            name=f'Articles classification {VER} fold {fold}',
            config=CONFIG,
            settings=wandb.Settings(
                start_method="thread",
                console="auto"
            )
        ) as run:

            # WandB visualize all classifier plots
            wandb.sklearn.plot_classifier(
                clf,
                X_train, X_test,
                y_train, y_test,
                y_pred, y_score,
                labels=['good', 'bad'],
                is_binary=True,
                model_name=f'{CONFIG["clf"]} fold {fold}',
                feature_names=vectorizer.get_feature_names_out()
            )

            # metrics to WandB
            run.summary['ROC AUC score'] = roc_auc_score_
            run.summary['F1 score'] = f1_score_

            # WandB data versioning: the raw training texts of the fold
            # (`X_train` holds the vectorized features at this point,
            # hence the texts are taken from `X` again)
            data_path = f'./models_{CONFIG["version"]}/data_fold_{fold}'
            os.makedirs(data_path, exist_ok=True)
            save_data_path = f'{data_path}/data_fold_{fold}.csv'
            X.iloc[train_idxs].to_csv(save_data_path)
            wandb_data = wandb.Artifact(f'dataset_{CONFIG["version"]}_fold_{fold}', type='raw_data')
            wandb_data.add_dir(data_path)
            run.log_artifact(wandb_data)

    return scores

In [None]:
# create vectorizer with respect to
# our configuration parameters
vectorizer = TfidfVectorizer(
    ngram_range=CONFIG['ngram_range'],
    max_df=CONFIG['max_df'],
    min_df=CONFIG['min_df']
)

# select the type of the model
# The seed from CONFIG is passed to the estimator explicitly, so the
# fitted model does not depend on how much of the global NumPy random
# state was consumed by the cells executed before this one.
if CONFIG['clf'] == 'RandomForestClassifier':
    clf = RandomForestClassifier(
        n_estimators=CONFIG['n_iters'],
        random_state=CONFIG['seed']
    )
elif CONFIG['clf'] == 'GradientBoostingClassifier':
    clf = GradientBoostingClassifier(
        n_estimators=CONFIG['n_iters'],
        random_state=CONFIG['seed']
    )
else:
    # any other value of CONFIG['clf'] falls back to logistic regression
    clf = LogisticRegression(random_state=CONFIG['seed'])

# start our training
scores = cross_val_model(
    X=df['proc'],
    y=df['label'],
    folds=CONFIG['folds'],
    clf=clf,
    vectorizer=vectorizer,
    seed=CONFIG['seed']
)