# Modeling


## Read in Libraries & Data

In [1]:
%load_ext autoreload
%autoreload 2

# utility libraries
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3 as sql
from ydata_profiling import ProfileReport
import modin.pandas as pd
import modin.config as modin_config
from utils import load_pickle_file, save_pickle_file
from tqdm.notebook import tqdm
modin_config.Engine.put("dask")

# ML libraries 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

  def hasna(x: np.ndarray) -> bool:


In [7]:
# training split: features (x) and labels (y) used to fit and score the model
x_train, y_train = (
    load_pickle_file('../data/x_train_vectorized.pkl'),
    load_pickle_file('../data/y_train_encode.pkl'),
)

# testing split: held-out features (x) and labels (y)
x_test, y_test = (
    load_pickle_file('../data/x_test_vectorized.pkl'),
    load_pickle_file('../data/y_test_encode.pkl'),
)

### Establish Model Training

In [None]:
# run configuration
# `training` switches the next cell between fitting a new model (True)
# and loading the previously saved one from disk (False)
training = False

# base name shared by the pickled model and its prediction files
model_name = 'rfr_500_trees_balance_subsample_1_2_words_v2'

In [3]:
if not training:
    # reuse the model that an earlier training run pickled to disk
    rfc = load_pickle_file(f'../models/{model_name}.pkl')

else:
    # fit a fresh forest (fit() returns the estimator itself) ...
    rfc = RandomForestClassifier(
        n_estimators=500,
        max_depth=30,
        class_weight='balanced_subsample',
        random_state=0,
        n_jobs=-1,
        verbose=2,
    ).fit(x_train, y_train)

    # ... and persist it so later runs can skip training
    save_pickle_file(rfc, f'../models/{model_name}.pkl')

In [5]:
def split_array(arr: np.ndarray, chunk_size: int):
    """Split an array into consecutive chunks along its first axis.

    Used for batch predictions so that only one chunk has to be processed at a
    time, which helps manage memory consumption. The row count is read from
    ``arr.shape[0]`` rather than ``len(arr)``, so any sliceable object that
    exposes ``shape`` can be passed.

    Args:
        arr (np.ndarray): Input array of data to be predicted.
        chunk_size (int): Maximum number of rows per chunk. Must be positive.

    Yields:
        np.ndarray: ``arr[i: i + chunk_size]`` for each chunk start ``i``. The
        last chunk is shorter when the row count is not a multiple of
        ``chunk_size``; an empty input yields nothing.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # range() already rejects a step of 0, but a negative step would silently
    # produce no chunks at all -- fail loudly for both instead
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    n_rows = arr.shape[0]
    for start in range(0, n_rows, chunk_size):
        # slicing past the end is safe: the final chunk is simply truncated
        yield arr[start: start + chunk_size]

### Gather Predictions

Generate predictions (class labels and class probabilities) for the training and testing sets, and save them to disk so they can be loaded later when computing metrics.

In [None]:
# silence the estimator's verbose status output while predicting
rfc.set_params(verbose=0)

selected_data = x_train
chunksize = 100000

# number of chunks split_array will yield; ceil (not round) so that a trailing
# partial chunk is counted and the progress bar total matches the real loop length
total_iter = int(np.ceil(selected_data.shape[0] / chunksize))

y_pred = []
y_pred_proba = []
for chunk in tqdm(split_array(selected_data, chunksize), total=total_iter):
    preds_proba = rfc.predict_proba(chunk)
    # predict_proba columns are ordered like rfc.classes_, so translate the
    # arg-max column index back into the actual class label
    preds = rfc.classes_[np.argmax(preds_proba, axis=-1)]
    # extend() adds one entry per row, same as appending element by element
    y_pred.extend(preds)
    y_pred_proba.extend(preds_proba)

save_pickle_file(y_pred, f'../models/{model_name}_train_pred.pkl')
save_pickle_file(y_pred_proba, f'../models/{model_name}_train_pred_proba.pkl')

# memory management: the predictions are on disk, drop the in-memory copies
del y_pred
del y_pred_proba

In [8]:
print("STARTING TEST SET")
# silence the estimator's verbose status output while predicting
rfc.set_params(verbose=0)

selected_data = x_test
chunksize = 100000

# number of chunks split_array will yield; ceil (not round) so that a trailing
# partial chunk is counted and the progress bar total matches the real loop length
total_iter = int(np.ceil(selected_data.shape[0] / chunksize))

y_pred_test = []
y_pred_proba_test = []
for chunk in tqdm(split_array(selected_data, chunksize), total=total_iter):
    preds_proba = rfc.predict_proba(chunk)
    # predict_proba columns are ordered like rfc.classes_, so translate the
    # arg-max column index back into the actual class label
    preds = rfc.classes_[np.argmax(preds_proba, axis=-1)]
    # extend() adds one entry per row, same as appending element by element
    y_pred_test.extend(preds)
    y_pred_proba_test.extend(preds_proba)

# NOTE: the trailing underscore in "_test_pred_.pkl" is intentional here -- the
# testing-metrics cell loads exactly this file name, so keep the two in sync
save_pickle_file(y_pred_test, f'../models/{model_name}_test_pred_.pkl')
print("y_pred_test saved")
save_pickle_file(y_pred_proba_test, f'../models/{model_name}_test_pred_proba.pkl')
print('y_pred_proba_test')

# memory management: the predictions are on disk, drop the in-memory copies
del y_pred_test
del y_pred_proba_test

STARTING TEST SET


  0%|          | 0/12 [00:00<?, ?it/s]

y_pred_test saved
y_pred_proba_test


### Gather Metrics from Predictions

In [37]:
def pcrs_report(y_pred, y):
    """Print precision, recall and F1 score under macro and weighted averaging.

    Nothing is returned; the scores are rounded to two decimals and printed.

    Args:
        y_pred (np.array): Labels predicted by the model.
        y (np.array): Ground-truth labels the predictions are scored against.
    """
    # (header line, averaging mode) pairs, in the order they are reported
    sections = (
        ("\nMacro______", 'macro'),
        ("\nWeighted_____", 'weighted'),
    )

    for header, averaging in sections:
        # the fourth return value (support) is not reported
        precision, recall, f1_score, _ = precision_recall_fscore_support(
            y_true=y,
            y_pred=y_pred,
            average=averaging)

        print(header)
        print(f"Precision:\t{np.round(precision, 2)}")
        print(f"Recall:\t\t{np.round(recall, 2)}")
        print(f"F1 Score:\t{np.round(f1_score, 2)}")

In [38]:

# training metrics: score the saved training-set predictions against y_train
y_pred, y = load_pickle_file(f'../models/{model_name}_train_pred.pkl'), y_train

print("Training")
pcrs_report(y_pred=y_pred, y=y)





Training

Macro______
Precision:	0.39
Recall:		0.46
F1 Score:	0.29

Weighted_____
Precision:	0.68
Recall:		0.3
F1 Score:	0.34


In [41]:
# testing metrics: score the saved test-set predictions against y_test
y_pred, y = load_pickle_file(f'../models/{model_name}_test_pred_.pkl'), y_test

print("Testing")
pcrs_report(y_pred=y_pred, y=y)

Testing

Macro______
Precision:	0.37
Recall:		0.43
F1 Score:	0.28

Weighted_____
Precision:	0.66
Recall:		0.3
F1 Score:	0.33
