# ML Classifiers

**Goal:** Given a sentence as input, classify it as either a prediction or non-prediction.

In [1]:
import os
import sys
import warnings

import pandas as pd

from tqdm import tqdm

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../'))

# import log_files
from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction
# from classification_models import SkLearnPerceptronModel, SkLearnSGDClassifier, EvaluationMetric
from classification_models import SkLearnModelFactory, EvaluationMetric

In [2]:
pd.set_option('max_colwidth', 800)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', None)

warnings.filterwarnings('ignore')

## Load Data

In [3]:
print("======= LOAD DATA =======")



In [4]:
base_data_path = os.path.join(notebook_dir, '../data/')
combine_data_path = os.path.join(base_data_path, 'combined_generated_fin_phrase_bank')
data_path = os.path.join(combine_data_path, 'combined_generated_fin_phrase_bank-v1.csv')

In [5]:
df = DataProcessing.load_from_file(data_path, 'csv')
df.drop(columns=['Unnamed: 0'], inplace=True)
print(f"\tShape: {df.shape}, \nSubset of Data:{df.head(7)}")
df.shape, df.head(3)

	Shape: (117, 2), 
Subset of Data:                                                                                                                   Base Sentence  \
0                                     JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.   
1                                  On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.   
2                                                   Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.   
3                              According to Goldman Sachs, the research and development expenses at Facebook would fall in 2025.   
4  In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.   
5                                              The stock price at Visa should stay same in Q2 of 2026, according to Wells Fargo.   
6                                         

((117, 2),
                                                                                    Base Sentence  \
 0     JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.   
 1  On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.   
 2                   Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.   
 
    Sentence Label  
 0               1  
 1               1  
 2               1  )

## Shuffle Data

In [6]:
print("======= SHUFFLE DATA =======")



In [7]:
shuffled_df = DataProcessing.shuffle_df(df)
print(f"\tShape: {shuffled_df.shape}, \nSubset of Data:{shuffled_df.head(7)}")

	Shape: (117, 2), 
Subset of Data:                                                                                                                                                                                                                          Base Sentence  \
0                                                                                                                                           Professor James Davis predicts on November 20, 2025, the average salary at Google may rise.   
1                                                                                                                                      According to Coach Emily Carter, the points per game at the Atlanta Hawks rose in December 2021.   
2                                                                                                The National Oceanic and Atmospheric Administration forecasts that the precipitation levels at New Orleans may decrease in 2024-08-21.   
3                         

## Extract Sentence Embeddings

In [8]:
print("======= EMBED SENTENCES: Spacy =======")



In [9]:
spacy_fe = SpacyFeatureExtraction(shuffled_df, 'Base Sentence')
spacy_fe

<feature_extraction.SpacyFeatureExtraction at 0x350a2d890>

In [10]:
spacy_sentence_embeddings_df = spacy_fe.sentence_feature_extraction(attach_to_df=True)
# print(f"{spacy_sentence_embeddings_df.head(3)}")

100%|██████████| 117/117 [00:00<00:00, 237.55it/s]


## Normalize Embeddings

- Why: Getting the below warnings
    1. sklearn/utils/extmath.py:203: RuntimeWarning: divide by zero encountered in matmul ret = a @ b
    2. sklearn/utils/extmath.py:203: RuntimeWarning: overflow encountered in matmul ret = a @ b
    3. sklearn/utils/extmath.py:203: RuntimeWarning: invalid value encountered in matmul ret = a @ b

- Normalize will place data within "boundaries" to be all on one scale

In [11]:
print("======= NORMALIZE EMBEDDINGS =======")



In [12]:
from sklearn.preprocessing import StandardScaler

# Convert embeddings to matrix if not already
embeddings_matrix = pd.DataFrame(spacy_sentence_embeddings_df["Embedding"].tolist())

# Scale the embeddings
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(embeddings_matrix)

spacy_sentence_embeddings_df['Normalized Embeddings'] = list(scaled_embeddings)

In [13]:
# spacy_sentence_embeddings_df.columns.

In [14]:
# print(f"{spacy_sentence_embeddings_df.head(3)}")
# spacy_sentence_embeddings_df
# print(f"{spacy_sentence_embeddings_df.to_dict()}")

for idx, row in spacy_sentence_embeddings_df.iterrows():
    text = row['Base Sentence']
    label = row['Sentence Label']
    embedding = row['Embedding']
    norm_embedding = row['Normalized Embeddings']
    if idx < 7:
        print(f"{idx}\n Sentence: {text}\n Label: {label}\n Embeddings Shape: {embedding.shape}\n\t Embeddings Subset [:6]: {embedding[:6]} \n Norm Embeddings: {norm_embedding.shape}, \n\tNorm Embeddings Subset [:6]: {norm_embedding[:6]}")

0
 Sentence: Professor James Davis predicts on November 20, 2025, the average salary at Google may rise.
 Label: 1
 Embeddings Shape: (300,)
	 Embeddings Subset [:6]: [-0.24932177  0.3551489   0.12051506  0.03424606  0.08339451  0.01107412] 
 Norm Embeddings: (300,), 
	Norm Embeddings Subset [:6]: [-1.7706842   1.3908991   1.7445623   0.87102956  0.5170118   0.7541876 ]
1
 Sentence: According to Coach Emily Carter, the points per game at the Atlanta Hawks rose in December 2021.
 Label: 0
 Embeddings Shape: (300,)
	 Embeddings Subset [:6]: [-0.03980784  0.1981266   0.01338033 -0.10077263  0.1403449  -0.07003804] 
 Norm Embeddings: (300,), 
	Norm Embeddings Subset [:6]: [ 0.8177393  -0.7501531   0.03946133 -1.3795731   1.2707775  -0.5223274 ]
2
 Sentence: The National Oceanic and Atmospheric Administration forecasts that the precipitation levels at New Orleans may decrease in 2024-08-21.
 Label: 1
 Embeddings Shape: (300,)
	 Embeddings Subset [:6]: [-0.06797983  0.24726018  0.07944361  0

In [15]:
embeddings_col_name = 'Normalized Embeddings'

## Split Data

In [16]:
print("======= SPLIT DATA =======")



In [17]:
# spacy_embeds = spacy_sentence_embeddings_df['Embedding'].to_list()
labels_col = spacy_sentence_embeddings_df['Sentence Label']
X_train_df, X_test_df, y_train_df, y_test_df = DataProcessing.split_data(spacy_sentence_embeddings_df, labels_col)
# print(f"{X_train_df.head(3)}")

In [18]:
save_df = True

if save_df == True:
    print("Save test set so we can pass these into LLMs")
    save_path = os.path.join(base_data_path, 'combined_generated_fin_phrase_bank')
    DataProcessing.save_to_file(X_test_df, save_path, 'x_test_set', 'csv')
    DataProcessing.save_to_file(y_test_df, save_path, 'y_test_set', 'csv')

Save test set so we can pass these into LLMs
Saved to: 
	/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_classification_experiments-v2/../data/combined_generated_fin_phrase_bank/x_test_set-v1.csv
Saved to: 
	/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_classification_experiments-v2/../data/combined_generated_fin_phrase_bank/y_test_set-v1.csv


In [19]:
len(y_train_df)


93

In [20]:
X_train_df[embeddings_col_name].to_list()

[array([ 6.86985075e-01, -5.14341712e-01, -1.91396624e-01, -6.13568187e-01,
        -6.83449864e-01, -7.39991367e-01, -6.01996958e-01,  1.82488036e+00,
         3.26362848e-01,  1.10126638e+00, -2.51574373e+00, -1.40671551e+00,
         1.39690042e+00,  1.39296263e-01,  1.70565343e+00,  8.58668745e-01,
         7.54797101e-01,  1.05107439e+00, -1.36788440e+00, -1.20185912e+00,
        -9.90154147e-01,  7.52463400e-01, -6.14590524e-03, -8.18588197e-01,
         8.89337003e-01, -1.07188714e+00,  1.72584271e+00,  2.71707892e-01,
         5.27288973e-01,  3.61384219e-03, -1.43364644e+00, -1.95845246e-01,
        -2.18531623e-01, -1.15468895e+00,  3.89288336e-01,  5.93534470e-01,
         7.59104908e-01, -6.01876736e-01,  1.35936439e-01,  2.01694965e+00,
         9.59586501e-01,  2.06199837e+00, -1.59424520e+00, -7.11872995e-01,
        -6.83163941e-01, -1.81537402e+00, -5.04742563e-01,  5.72270453e-01,
        -4.88323033e-01, -1.44458139e+00, -2.64541292e+00, -1.31464803e+00,
         2.0

## Models

In [21]:
print("======= TRAIN x TEST MODELS =======")



> Track loss: try BCE (Binary Cross Entropy)

In [22]:
sklmf = SkLearnModelFactory
perception_model = sklmf.select_model('perceptron')
sgd_classifier_model = sklmf.select_model('sgd_classifier')
logistic_regression_model = sklmf.select_model('logistic_regression')
ridge_classifier_model = sklmf.select_model('ridge_classifier')
# linear_regression_model = sklmf.select_model('linear_regression')
# elastic_net_model = sklmf.select_model('elastic_net')

ml_models = [perception_model, sgd_classifier_model, logistic_regression_model, ridge_classifier_model]

In [23]:
models_with_predictions = {}
for ml_model in ml_models:
    print(f"Train -> Predict for {ml_model.get_model_name()}")
    ml_model.train_model(X_train_df[embeddings_col_name].to_list(), y_train_df)
    ml_model_predictions = ml_model.predict(X_test_df[embeddings_col_name].to_list())
    models_with_predictions[ml_model.get_model_name()] = ml_model_predictions

models_with_predictions

Train -> Predict for Perceptron
Train -> Predict for SDG Classifier
Train -> Predict for Logistic Regression
Train -> Predict for Ridge Classifier


{'Perceptron': 0     0
 1     0
 2     1
 3     0
 4     1
 5     0
 6     1
 7     1
 8     0
 9     0
 10    1
 11    0
 12    1
 13    1
 14    1
 15    1
 16    0
 17    1
 18    0
 19    1
 20    1
 21    1
 22    0
 23    1
 dtype: int64,
 'SDG Classifier': 0     1
 1     0
 2     1
 3     1
 4     1
 5     1
 6     1
 7     1
 8     0
 9     0
 10    1
 11    1
 12    1
 13    1
 14    1
 15    1
 16    0
 17    0
 18    0
 19    1
 20    1
 21    1
 22    1
 23    1
 dtype: int64,
 'Logistic Regression': 0     1
 1     1
 2     1
 3     1
 4     1
 5     0
 6     1
 7     1
 8     0
 9     0
 10    1
 11    1
 12    1
 13    1
 14    1
 15    1
 16    0
 17    1
 18    0
 19    1
 20    1
 21    1
 22    1
 23    1
 dtype: int64,
 'Ridge Classifier': 0     1
 1     1
 2     1
 3     0
 4     1
 5     0
 6     1
 7     1
 8     0
 9     0
 10    1
 11    1
 12    1
 13    1
 14    1
 15    1
 16    0
 17    1
 18    0
 19    1
 20    1
 21    1
 22    1
 23    1
 dtype: int64}

In [24]:
# models_predictions_df = pd.DataFrame(models_to_predictions)
# models_predictions_df

In [25]:
y_test_df.rename(index='Actual Label', inplace=True)

44     1
4      1
53     1
42     0
10     1
85     0
72     1
94     1
36     1
11     0
40     1
26     1
95     1
104    1
18     1
0      1
62     0
112    1
76     0
101    1
68     1
89     1
115    1
12     1
Name: Actual Label, dtype: int64

In [26]:
test_and_models_df = pd.concat([X_test_df.loc[:, :], y_test_df], axis=1)
# test_and_models_df = pd.concat([test_df, models_predictions_df])

for key, value in models_with_predictions.items():
    test_and_models_df[key] = value.to_numpy().ravel()

test_and_models_df.head(3)

Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier
44,"In 08/2024, Coach Michael Brown envisions that the touchdown rate at the New England Patriots has some probability to remain stable.",1,"[-0.02958739, 0.18973726, -0.11819747, -0.041300606, 0.11042062, -0.049567938, 0.019185336, 0.003956393, 0.10927482, 1.9799795, -0.18533087, -0.045425095, 0.10816704, 0.025421124, -0.009493131, -0.06847136, 0.015523771, 1.0076592, -0.0060080886, -0.041570563, -0.07416421, -0.015073336, 0.013444346, -0.08062417, -0.07691947, 0.114793696, -0.21554036, 0.06737826, 0.036187526, 0.016580999, -0.044001795, 0.02612843, 0.04480113, -0.009688915, 0.034902446, 0.009659723, -0.02668986, 0.0074698576, -0.066964805, -0.037240602, 0.07123344, 0.0675264, 0.078591906, -0.007790331, 0.04300652, -0.0540832, -0.11761461, -0.02359158, -0.046808533, 0.0647246, 0.015711654, 0.103282735, -0.09804983, -0.08016924, 0.057319958, 0.0038818633, 0.03434675, 0.03524678, 0.018419268, -0.03328778, 0.0011636653, 0.011...","[0.94400704, -0.8645446, -2.054663, -0.3882444, 0.8747154, -0.2001761, 0.3727082, 0.06485146, 0.7578419, 0.74855906, 1.5445073, -0.9450778, 0.8223399, 0.4856363, 0.34803256, -0.53150874, 0.5283372, -0.55164915, 1.8045051, -0.24819341, -2.180726, -1.0896391, 0.08680368, 0.22307044, -1.5891142, 0.5755748, -1.1823585, 0.6673478, 0.25934806, -0.86385673, -0.6393052, 0.24252476, 0.7617, -1.2495198, -0.25059918, 0.46805078, -0.013159137, -0.6343914, -0.95103335, -0.23913547, 1.234197, 0.26890773, -0.012306502, 0.36895898, 0.34772688, -0.8687144, -0.7102191, 0.17720816, -1.3530828, 0.6399917, 0.0253048, 1.1315668, -0.82290274, 0.017806469, 0.33573022, 0.84572476, 0.5561635, 0.630834, 0.5291998, 1.5564367, 0.74758047, 1.1630359, 0.18509752, -1.5357434, -0.74911606, 1.425574, -1.0104076, 1.5935...",1,0,1,1,1
4,Coach Rachel Thompson forecasts that the points per game at the New York Knicks potentially decrease in August 2024.,1,"[-0.07312718, 0.25787935, -0.010016336, -0.029122552, 0.062031634, -0.018994832, 0.0283653, -0.023620797, 0.12503529, 1.6817259, -0.30820423, 0.030993972, 0.051932454, -0.0040970836, 0.053234648, -0.0097929435, -0.017720852, 0.9745005, -0.060591854, -0.11913413, 0.068550065, -0.011396958, 0.14084914, -0.19831455, -0.02559045, 0.22493431, -0.28505936, 0.13453686, -0.05016158, 0.10663557, -0.13473655, 0.068550006, 0.009542454, 0.12495885, 0.008981001, 0.05749259, 0.03364301, 0.03125364, -0.029734764, -0.00555008, -0.07198344, 0.12520279, 0.03191192, -0.08328045, -0.00805645, -0.09729185, -0.15862066, 0.0715504, -0.0030577504, -0.015250596, 0.04820242, 0.0054237954, -0.03745146, -0.17704877, -0.08034569, -0.074892215, 0.013706143, 0.10037221, 0.04870651, -0.13040425, -0.13243684, 0.012551...","[0.4060981, 0.06459576, -0.33290786, -0.18525055, 0.23426409, 0.28097287, 0.5438155, -0.29924256, 0.9618827, -0.30452695, -0.16134328, 0.048946925, -0.023275206, -0.020097172, 1.0117191, 0.54240227, -0.07506002, -0.78002316, 0.8962295, -1.5484266, 0.80796844, -1.033549, 2.02265, -1.4457239, -0.84145296, 2.089933, -2.1967309, 1.8129327, -1.2062268, 0.43286043, -2.3416817, 0.92772835, 0.19314055, 0.6750809, -0.73656416, 1.299064, 0.8058958, -0.26324862, -0.31853175, 0.24293266, -1.204331, 1.1874502, -0.641938, -0.9460187, -0.5985572, -1.5994209, -1.2832663, 1.3151182, -0.65960604, -0.5158133, 0.5862492, -0.48699406, 0.34597123, -1.3362038, -1.5870239, -0.51357347, 0.21139102, 1.4604084, 1.0736531, -0.19801451, -1.9606705, 1.1726837, -0.5702353, -2.2427983, 0.5018804, 0.4886444, -1.117261...",1,0,0,1,1
53,JPMorgan forecasts that the revenue at Microsoft potentially decrease in Q3 of 2027.,1,"[-0.22467756, 0.22830214, -0.032094143, -0.017848214, -0.05663485, 0.020519087, -0.03815743, 0.14183809, 0.16498272, 1.45077, -0.34771657, 0.048295233, 0.10903186, 0.041840874, 0.06936674, 0.007948999, -0.028099716, 1.022385, -0.11793091, -0.12938592, 0.047487717, 0.19360885, 0.062014066, -0.17926492, 0.061135743, 0.16699407, -0.092559285, 0.057476398, 0.02447207, 0.062002372, -0.039193127, -0.0035409224, -0.027321855, 0.18480529, -0.008365932, -0.00523124, 0.03498349, 0.16340052, 0.012447267, 0.017993355, 0.02809838, 0.12413694, 0.13181743, 0.013852571, 0.04251221, -0.090282716, -0.029795082, -0.07996637, -0.10838692, 0.02084122, -0.006943495, -0.01604021, -0.09111266, -0.10549972, 0.03028264, -0.05534979, -0.04395779, -0.17121895, -0.087924324, -0.14022243, -0.054000642, -0.1212813, ...","[-1.4662192, -0.33869946, -0.68428683, 0.0026793862, -1.3363433, 0.90282923, -0.69611585, 1.8852654, 1.479057, -1.1199952, -0.7098929, 0.27399394, 0.83534455, 0.7669548, 1.1824033, 0.8671089, -0.26343867, -0.45022815, -0.05789443, -1.7202821, 0.36688486, 2.0942023, 0.824794, -1.1756091, 0.42180544, 1.2932943, 0.6120948, 0.49844283, 0.0605055, -0.2098236, -0.54908437, -0.23670182, -0.4013103, 1.5305017, -1.0617776, 0.20934603, 0.82409364, 1.7988863, 0.39809933, 0.6010694, 0.49974424, 1.1704757, 0.70561284, 0.74596107, 0.33856642, -1.4808886, 0.51703244, -0.49704137, -2.3291378, 0.005787174, -0.36583024, -0.842003, -0.68909264, -0.33621812, -0.041895654, -0.17635614, -0.7518045, -1.9991481, -1.3824673, -0.37538418, -0.37066966, -1.0433954, -0.29011834, 0.60281897, 0.34506997, -0.43238124...",1,1,1,1,1


## Save Output

In [27]:
DataProcessing.save_to_file(test_and_models_df, combine_data_path, 'ml_classifiers', '.csv')

Saved to: 
	/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_classification_experiments-v2/../data/combined_generated_fin_phrase_bank/ml_classifiers-v1.csv


## Evaluation

In [28]:
print("======= EVALUATION/RESULTS =======")



In [29]:
get_metrics = EvaluationMetric()
get_metrics

<classification_models.EvaluationMetric at 0x35126e550>

> - Results may differ (from previous runs and even terminal runs) because we shuffle the data.

In [30]:
actual_label = test_and_models_df['Actual Label'].values
for ml_model in ml_models:
    ml_model_name = ml_model.get_model_name()
    print(f"Actual Label:\t\t{actual_label}")
    ml_model_predictions = test_and_models_df[ml_model_name].values
    print(f"{ml_model_name}:\t\t{ml_model_predictions}")
    print()
    get_metrics.eval_classification_report(y_test_df, ml_model_predictions)

Actual Label:		[1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1]
Perceptron:		[0 0 1 0 1 0 1 1 0 0 1 0 1 1 1 1 0 1 0 1 1 1 0 1]

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         5
           1       1.00      0.74      0.85        19

    accuracy                           0.79        24
   macro avg       0.75      0.87      0.76        24
weighted avg       0.90      0.79      0.81        24

Actual Label:		[1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1]
SDG Classifier:		[1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1]

              precision    recall  f1-score   support

           0       0.50      0.60      0.55         5
           1       0.89      0.84      0.86        19

    accuracy                           0.79        24
   macro avg       0.69      0.72      0.71        24
weighted avg       0.81      0.79      0.80        24

Actual Label:		[1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1]
Logistic Regression