# ML Classifiers

**Goal:** Given a sentence as input, classify it as either a prediction or non-prediction.

In [1]:
import os
import sys
import warnings

import pandas as pd

from tqdm import tqdm

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../'))

# import log_files
from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction
# from classification_models import SkLearnPerceptronModel, SkLearnSGDClassifier, EvaluationMetric
from classification_models import SkLearnModelFactory, EvaluationMetric

In [2]:
pd.set_option('max_colwidth', 800)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', None)

warnings.filterwarnings('ignore')

## Load Data

In [3]:
print("======= LOAD DATA =======")



In [4]:
base_data_path = os.path.join(notebook_dir, '../data/')
combine_data_path = os.path.join(base_data_path, 'combined_generated_fin_phrase_bank/combined_generated_fin_phrase_bank-v1.csv')

In [5]:
df = DataProcessing.load_from_file(combine_data_path, 'csv')
df.drop(columns=['Unnamed: 0'], inplace=True)
print(f"\tShape: {df.shape}, \nSubset of Data:{df.head(7)}")
df.shape, df.head(3)

	Shape: (105, 2), 
Subset of Data:                                                                                                                   Base Sentence  \
0                                     JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.   
1                                  On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.   
2                                                   Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.   
3                              According to Goldman Sachs, the research and development expenses at Facebook would fall in 2025.   
4  In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.   
5                                              The stock price at Visa should stay same in Q2 of 2026, according to Wells Fargo.   
6                                         

((105, 2),
                                                                                    Base Sentence  \
 0     JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.   
 1  On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.   
 2                   Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.   
 
    Sentence Label  
 0               1  
 1               1  
 2               1  )

## Shuffle Data

In [6]:
print("======= SHUFFLE DATA =======")



In [7]:
shuffled_df = DataProcessing.shuffle_df(df)
print(f"\tShape: {shuffled_df.shape}, \nSubset of Data:{shuffled_df.head(7)}")

	Shape: (105, 2), 
Subset of Data:                                                                                                                                                                                                             Base Sentence  \
0                                                                                                                   On 2027/08/20, the Federal Reserve speculates that interest rates at major banks will likely increase.   
1                                                                                          Cybersecurity threats should stay same in 2026-06-01, according to the National Security Agency director, Dr. Sophia Rodriguez.   
2  The Brazilian unit of Finnish security solutions provider F-Secure signed up 1,500 new clients last year , online news source Reseller Web quoted the division 's commercial director , Vladimir Brand+_o , as saying .   
3                                                                             

## Extract Sentence Embeddings

In [8]:
print("======= EMBED SENTENCES: Spacy =======")



In [9]:
spacy_fe = SpacyFeatureExtraction(shuffled_df, 'Base Sentence')
spacy_fe

<feature_extraction.SpacyFeatureExtraction at 0x3483c99d0>

In [10]:
spacy_sentence_embeddings_df = spacy_fe.sentence_feature_extraction(attach_to_df=True)
# print(f"{spacy_sentence_embeddings_df.head(3)}")

100%|██████████| 105/105 [00:00<00:00, 237.11it/s]


## Normalize Embeddings

- Why: Getting the below warnings
    1. sklearn/utils/extmath.py:203: RuntimeWarning: divide by zero encountered in matmul ret = a @ b
    2. sklearn/utils/extmath.py:203: RuntimeWarning: overflow encountered in matmul ret = a @ b
    3. sklearn/utils/extmath.py:203: RuntimeWarning: invalid value encountered in matmul ret = a @ b

- Normalize will place data within "boundaries" to be all on one scale

In [11]:
print("======= NORMALIZE EMBEDDINGS =======")



In [12]:
from sklearn.preprocessing import StandardScaler

# Convert embeddings to matrix if not already
embeddings_matrix = pd.DataFrame(spacy_sentence_embeddings_df["Embedding"].tolist())

# Scale the embeddings
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(embeddings_matrix)

spacy_sentence_embeddings_df['Normalized Embeddings'] = list(scaled_embeddings)

In [13]:
# spacy_sentence_embeddings_df.columns.

In [14]:
# print(f"{spacy_sentence_embeddings_df.head(3)}")
# spacy_sentence_embeddings_df
# print(f"{spacy_sentence_embeddings_df.to_dict()}")

for idx, row in spacy_sentence_embeddings_df.iterrows():
    text = row['Base Sentence']
    label = row['Sentence Label']
    embedding = row['Embedding']
    norm_embedding = row['Normalized Embeddings']
    if idx < 7:
        print(f"{idx}\n Sentence: {text}\n Label: {label}\n Embeddings Shape: {embedding.shape}\n\t Embeddings Subset [:6]: {embedding[:6]} \n Norm Embeddings: {norm_embedding.shape}, \n\tNorm Embeddings Subset [:6]: {norm_embedding[:6]}")

0
 Sentence: On 2027/08/20, the Federal Reserve speculates that interest rates at major banks will likely increase.
 Label: 1
 Embeddings Shape: (300,)
	 Embeddings Subset [:6]: [-0.1819624   0.25178966 -0.06822676 -0.01992646 -0.08903813 -0.06996482] 
 Norm Embeddings: (300,), 
	Norm Embeddings Subset [:6]: [-0.87786    -0.08358515 -1.2822452  -0.03210158 -1.6880763  -0.47773218]
1
 Sentence: Cybersecurity threats should stay same in 2026-06-01, according to the National Security Agency director, Dr. Sophia Rodriguez.
 Label: 1
 Embeddings Shape: (300,)
	 Embeddings Subset [:6]: [-0.09970222  0.22728248  0.08885632  0.0193849   0.0827627  -0.00611013] 
 Norm Embeddings: (300,), 
	Norm Embeddings Subset [:6]: [ 0.1331075  -0.4137284   1.2556356   0.61303735  0.6098036   0.5369961 ]
2
 Sentence: The Brazilian unit of Finnish security solutions provider F-Secure signed up 1,500 new clients last year , online news source Reseller Web quoted the division 's commercial director , Vladimir B

In [15]:
embeddings_col_name = 'Normalized Embeddings'

## Split Data

In [16]:
print("======= SPLIT DATA =======")



In [17]:
# spacy_embeds = spacy_sentence_embeddings_df['Embedding'].to_list()
labels_col = spacy_sentence_embeddings_df['Sentence Label']
X_train_df, X_test_df, y_train_df, y_test_df = DataProcessing.split_data(spacy_sentence_embeddings_df, labels_col)
# print(f"{X_train_df.head(3)}")

In [18]:
save_df = True

if save_df == True:
    print("Save test set so we can pass these into LLMs")
    save_path = os.path.join(base_data_path, 'combined_generated_fin_phrase_bank')
    DataProcessing.save_to_file(X_test_df, save_path, 'x_test_set', 'csv')
    DataProcessing.save_to_file(y_test_df, save_path, 'y_test_set', 'csv')

Save test set so we can pass these into LLMs
Saved to: 
	/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_classification_experiments-v2/../data/combined_generated_fin_phrase_bank/x_test_set-v1.csv
Saved to: 
	/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_classification_experiments-v2/../data/combined_generated_fin_phrase_bank/y_test_set-v1.csv


In [19]:
len(y_train_df)


84

In [20]:
X_train_df[embeddings_col_name].to_list()

[array([ 1.0256395e+00,  1.1956652e-03, -1.3175702e+00,  1.8974631e-01,
         1.0791918e+00, -1.3400919e+00,  4.6247709e-01, -1.2200363e+00,
        -1.2630324e+00, -3.4219918e-01,  2.4539840e+00, -6.5987140e-02,
        -2.9877266e-01, -1.8082724e+00, -1.9632182e-01, -1.7311450e+00,
        -3.4985027e-01, -1.0931572e+00,  3.2568693e-01,  7.7986531e-02,
        -2.7262715e-01, -1.9839166e-01,  1.5013008e+00,  1.2223474e+00,
        -1.0663500e-01,  3.6742646e-01, -8.3456028e-01, -1.1286747e+00,
         9.7854090e-01, -7.8832465e-01,  3.9183345e-01, -1.1645852e+00,
        -5.7191825e-01,  5.0171524e-01,  2.5964962e-02, -2.0028654e-01,
         1.1697486e-01,  8.9327526e-01,  8.7133777e-01, -1.2680293e+00,
         4.8640400e-02, -2.1037741e+00,  1.5504402e-01,  1.4674456e+00,
        -9.0349525e-01, -7.8127600e-02, -4.1050714e-01,  2.9415607e-01,
        -1.1125054e+00, -8.3685800e-02,  2.8264168e-01, -2.2872837e-01,
        -8.4336674e-01,  6.7680568e-01, -1.9812153e-01,  1.03210

## Models

In [21]:
print("======= TRAIN x TEST MODELS =======")



> Track loss: try BCE (Binary Cross Entropy)

In [22]:
sklmf = SkLearnModelFactory
perception_model = sklmf.select_model('perceptron')
sgd_classifier_model = sklmf.select_model('sgd_classifier')
logistic_regression_model = sklmf.select_model('logistic_regression')
ridge_classifier_model = sklmf.select_model('ridge_classifier')
# linear_regression_model = sklmf.select_model('linear_regression')
# elastic_net_model = sklmf.select_model('elastic_net')

ml_models = [perception_model, sgd_classifier_model, logistic_regression_model, ridge_classifier_model]

In [23]:
models_with_predictions = {}
for ml_model in ml_models:
    print(f"Train -> Predict for {ml_model.get_model_name()}")
    ml_model.train_model(X_train_df[embeddings_col_name].to_list(), y_train_df)
    ml_model_predictions = ml_model.predict(X_test_df[embeddings_col_name].to_list())
    models_with_predictions[ml_model.get_model_name()] = ml_model_predictions

models_with_predictions

Train -> Predict for Perceptron
Train -> Predict for SDG Classifier
Train -> Predict for Logistic Regression
Train -> Predict for Ridge Classifier


{'Perceptron': 0     1
 1     1
 2     1
 3     1
 4     1
 5     1
 6     1
 7     0
 8     1
 9     1
 10    1
 11    0
 12    1
 13    1
 14    0
 15    0
 16    0
 17    1
 18    0
 19    1
 20    0
 dtype: int64,
 'SDG Classifier': 0     1
 1     1
 2     1
 3     1
 4     1
 5     1
 6     1
 7     0
 8     1
 9     1
 10    1
 11    0
 12    1
 13    1
 14    0
 15    0
 16    0
 17    1
 18    0
 19    1
 20    0
 dtype: int64,
 'Logistic Regression': 0     1
 1     1
 2     1
 3     1
 4     1
 5     1
 6     1
 7     0
 8     1
 9     1
 10    1
 11    0
 12    1
 13    1
 14    0
 15    0
 16    0
 17    1
 18    0
 19    1
 20    0
 dtype: int64,
 'Ridge Classifier': 0     1
 1     1
 2     1
 3     1
 4     1
 5     1
 6     1
 7     0
 8     1
 9     1
 10    1
 11    0
 12    1
 13    1
 14    0
 15    0
 16    0
 17    1
 18    0
 19    1
 20    0
 dtype: int64}

In [24]:
# models_predictions_df = pd.DataFrame(models_to_predictions)
# models_predictions_df

In [25]:
y_test_df.rename(index='Actual Label', inplace=True)

30     1
65     1
64     1
53     1
45     1
94     1
104    1
47     0
10     1
0      1
18     1
31     0
89     1
96     1
77     0
4      0
80     0
33     1
12     0
26     1
99     0
Name: Actual Label, dtype: int64

In [26]:
test_and_models_df = pd.concat([X_test_df.loc[:, :], y_test_df], axis=1)
# test_and_models_df = pd.concat([test_df, models_predictions_df])

for key, value in models_with_predictions.items():
    test_and_models_df[key] = value.to_numpy().ravel()

test_and_models_df.head(3)

Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier
30,The Center for Strategic and International Studies forecasts that the likelihood of bipartisan support for climate change legislation at the US Senate potentially decrease in Q4 2026.,1,"[-0.044099826, 0.19285299, 0.016336072, 0.05253703, 0.018092835, 0.020381128, -0.034171943, 0.15161791, 0.038831066, 2.1586397, -0.35418066, 0.058646653, 0.029613068, -0.059692997, -0.020634526, -0.0043085716, -0.06441634, 1.1099615, -0.13645032, -0.031500068, 0.013234937, 0.08524628, -0.017139759, -0.08008238, 0.0066328594, 0.04997158, -0.13199131, 0.050035454, -0.029173419, 0.09406366, -0.016810289, 0.05660189, 0.009551888, 0.00967607, 0.14959173, -0.120788395, 0.032680027, -0.04941273, -0.036122642, 0.01846796, 0.024394725, 0.1477796, 0.016336069, -0.05254817, 0.0011716206, -0.037403855, -0.08325229, -0.013680761, -0.09681591, -0.09434468, 0.025053378, 0.05348439, -0.020475952, -0.12262764, -0.016685966, -0.0023208845, -0.10311724, -0.011796364, -0.07584526, -0.09049897, -0.10044128...","[0.816454, -0.8775379, 0.08397705, 1.157097, -0.25517243, 0.9579742, -0.5545667, 1.955469, -0.087395765, 1.3475066, -0.7212994, 0.37387562, -0.2985515, -0.9794513, 0.2187811, 0.60348517, -0.84408605, 0.055705104, -0.37793154, -0.111679964, -0.30660114, 0.3786056, -0.25966403, 0.13719976, -0.5270167, -0.2586426, -0.090708144, 0.42103592, -0.8787988, 0.18710232, -0.1522448, 0.70922136, 0.19392672, -0.89473325, 1.9067508, -1.6955786, 0.78689104, -1.587544, -0.4833425, 0.56877357, 0.3915593, 1.570104, -0.94153196, -0.4240164, -0.3404005, -0.6333175, -0.32467616, 0.35964382, -2.1547184, -1.5963353, 0.21961679, 0.28525147, 0.71440333, -0.64820975, -0.7821317, 0.67238426, -1.7019898, 0.16083783, -1.0829115, 0.48376572, -1.3750477, 1.6393371, 0.049908657, -0.47228456, 1.0353076, 1.5355866, -0....",1,1,1,1,1
65,"Coach James Davis predicts on 2024-09-20, the touchdown rate at the New England Patriots may rise.",1,"[-0.14041036, 0.35971388, 0.014706722, 0.027508462, 0.10818055, -0.08467152, 0.08307963, 0.054365043, 0.029605813, 1.64869, -0.26587722, 0.0075096786, 0.10082531, -0.036913965, -0.08846381, -0.048731375, 0.01382061, 0.98642236, 0.0598464, -0.022103727, -0.058724724, 0.0011739959, 0.04838291, -0.16412185, 0.015100873, 0.2512572, -0.20859629, 0.063764274, 0.03527434, 0.22554432, -0.032475512, 0.16278066, -0.023857372, 0.046870865, 0.0369795, 0.09862655, -0.02092244, 0.104581445, -0.06821999, -0.024128841, -0.0070767705, -0.00058159506, 0.12866886, 0.019093795, 0.018468462, -0.08191604, -0.10063109, 0.051205993, 0.036010012, 0.17358238, 0.070379265, 0.09537673, -0.12335748, -0.14360687, 0.020327998, 0.017614305, 0.045406498, 0.041473504, -0.0030659975, -0.092216134, -0.015575487, 0.041140...","[-0.36719063, 1.370293, 0.057652798, 0.7463531, 0.94977385, -0.71143943, 1.647767, 0.6928795, -0.20773709, -0.39688042, 0.50432396, -0.27438945, 0.7424761, -0.5903933, -0.48003995, -0.24257852, 0.595707, -0.81144917, 2.896311, 0.04753759, -1.7946649, -0.8690097, 0.82284224, -1.0633191, -0.39717, 2.4259837, -1.2739799, 0.6540051, 0.21179289, 2.0884771, -0.43923187, 2.4003773, -0.33120742, -0.36927316, -0.3709113, 2.0086467, 0.07468775, 0.83524853, -1.0363972, -0.07077451, -0.1760083, -0.7643759, 0.5914349, 0.7931792, -0.012669547, -1.3857014, -0.5681628, 1.153752, 0.04678212, 2.2861812, 0.9880727, 0.9623238, -1.284769, -0.9430869, -0.2622276, 1.0229075, 0.7060511, 0.9098799, 0.20955999, 0.4534466, 0.3199799, 1.7617779, 0.35873216, -0.496644, -0.06797643, 0.61011523, -0.7856665, 1.562894...",1,1,1,1,1
64,"According to a study conducted by the World Health Organization, the obesity rates at European elementary schools would fall in 2032.",1,"[-0.08886137, 0.18626356, 0.010655866, -0.03436013, -0.031226907, -0.079142764, -0.048361927, 0.059287004, 0.018140318, 2.36586, -0.40948477, 0.032992244, 0.13606042, 0.06114504, -0.03633298, -0.108221434, 0.03656, 1.2485639, -0.10988889, -0.076726295, 0.07544726, -0.06469444, -0.012966561, -0.01772283, 0.020815652, 0.054644093, -0.14046738, 0.033351175, 0.058045242, 0.109737106, -0.0226367, 0.057666738, 0.07236696, 0.030882953, 0.18342595, -0.117384784, 0.021251842, 0.06815147, -0.020210264, 0.053521086, -0.020259997, 0.13366187, -0.05120378, -0.07823974, 0.083360866, 0.034669496, -0.08454053, -0.017499048, 0.097897485, -0.09662588, -0.15426321, 0.0076679103, -0.040651307, -0.13110718, 0.010086258, -0.069524094, 0.14353321, 0.08340517, 0.04547005, -0.20508295, -0.028218098, -0.0416605...","[0.2663402, -0.966306, -0.007794042, -0.26897252, -0.9148365, -0.6235808, -0.8210969, 0.7567791, -0.35730186, 2.056346, -1.4889026, 0.04865384, 1.2575662, 1.0844206, 0.057045504, -1.3756088, 1.0141795, 1.0285922, 0.06511496, -0.8780219, 0.9798961, -1.8464837, -0.19071825, 1.028017, -0.3095408, -0.19632347, -0.22163288, 0.13791442, 0.5971244, 0.41375992, -0.25898474, 0.7261817, 1.1812679, -0.5951384, 2.591072, -1.6381179, 0.63504744, 0.26209515, -0.20916359, 1.095061, -0.41375908, 1.3479599, -1.863225, -0.86051565, 1.2168758, 0.5849294, -0.34272516, 0.31291422, 1.0725255, -1.629392, -2.82052, -0.45524362, 0.32236007, -0.7673954, -0.4060847, -0.50925964, 2.2969964, 1.4994923, 1.0715015, -1.5393858, 0.0674686, 0.3539133, 0.5879885, -1.664084, -0.58951193, 1.6082355, -0.389313, -0.14805758...",1,1,1,1,1


## Evaluation

In [27]:
print("======= EVALUATION/RESULTS =======")



In [28]:
get_metrics = EvaluationMetric()
get_metrics

<classification_models.EvaluationMetric at 0x35a947450>

> - Results may differ (from previous runs and even terminal runs) because we shuffle the data.

In [29]:
actual_label = test_and_models_df['Actual Label'].values
for ml_model in ml_models:
    ml_model_name = ml_model.get_model_name()
    print(f"Actual Label:\t\t{actual_label}")
    ml_model_predictions = test_and_models_df[ml_model_name].values
    print(f"{ml_model_name}:\t\t{ml_model_predictions}")
    print()
    get_metrics.eval_classification_report(y_test_df, ml_model_predictions)

Actual Label:		[1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0 0 1 0 1 0]
Perceptron:		[1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0 0 1 0 1 0]

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00        14

    accuracy                           1.00        21
   macro avg       1.00      1.00      1.00        21
weighted avg       1.00      1.00      1.00        21

Actual Label:		[1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0 0 1 0 1 0]
SDG Classifier:		[1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0 0 1 0 1 0]

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00        14

    accuracy                           1.00        21
   macro avg       1.00      1.00      1.00        21
weighted avg       1.00      1.00      1.00        21

Actual Label:		[1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0 0 1 0 1 0]
Logistic Regression:		[1 1 1 1 1 1 1 0 1 1 1 0 1 