# Models + Evaluation Metrics

- **Goal:** Prediction Recognition

- **Purpose:** To train our models and to make predictions on unseen data.

- **Misc:**
    - `%store`: Line magic that stores the variable of interest so we can load it in another notebook (with `%store -r`)

In [1]:
import os
import sys

import pandas as pd

# Directory the kernel is currently running in (the notebook's working directory)
notebook_dir = os.getcwd()
# The project modules live one level above the notebook, so expose that
# directory to the import system before importing them
parent_dir = os.path.join(notebook_dir, '../')
sys.path.append(parent_dir)

from pipelines import BasePipeline
from data_processing import DataProcessing
from classification_models import SkLearnPerceptronModel, SkLearnSGDClassifier, SkLearnLogisticRegression, train_and_evaluate_model, EvaluationMetric

In [2]:
# Restore the two feature frames that were persisted with `%store`
# (presumably by the feature-engineering notebooks - confirm):
#   - TF-IDF weights per sentence
#   - encoded word-level tags / entities per sentence
%store -r tfidf_vectorized_features_df
%store -r encoded_word_level_tags_entities_df

# Widen text columns so the full sentences stay readable in rendered frames.
# The fully qualified option name is used on purpose: pandas resolves
# abbreviated names by pattern matching, which can break once another option
# with a similar name is added.
pd.set_option('display.max_colwidth', 800)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

## Combine Features (TF-IDF and POS & NER Encodings)

In [3]:
# Both frames start with the same two columns (sentence, label). Compare them
# element-wise and collapse the result to a single boolean: first per column,
# then across columns.
tfidf_leading_cols = tfidf_vectorized_features_df.iloc[:, :2]
encoded_leading_cols = encoded_word_level_tags_entities_df.iloc[:, :2]
leading_cols_match = tfidf_leading_cols == encoded_leading_cols
comparison_result = leading_cols_match.all().all()
comparison_result

True

In [4]:
# Inspect the restored TF-IDF frame: the first two columns hold the sentence
# and its prediction label, the remaining columns are per-term weights.
tfidf_vectorized_features_df

Unnamed: 0,Base Sentence,Prediction Label,000,01,02,03,04,06,07,08,...,white,will,wind,window,with,world,wrote,yoga,york,zealand
0,The music echoed through the empty hall.,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,"According to a policy analyst, Emily Chen, from the Congressional Budget Office, on 2024-08-22, the federal budget deficit is expected to decrease beyond $1 trillion in the timeframe of Q4 of 2027.",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133298,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,"On 2024-10-15, Dr. David Lee, a health expert, predicts that the obesity rate at the World Health Organization will likely decrease by 3% in Q2 of 2026.",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.148390,0.0,0.0,0.0,0.284744,0.0,0.0,0.0,0.0
3,"According to a senior level person from 3M, on 2024/08/22, the operating income is expected to increase as much as $500 million, reflecting a 20% increase, in the timeframe of Q2 of 2029.",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137857,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,"On 2024-10-15, Rachel Patel, a financial analyst, predicts that the operating income at General Motors will likely increase by $5 billion in Q2 of 2026.",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.158872,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,"Michael Davis, a top executive, predicts on 15 October 2024 that the stock price at Visa may rise by 15% to $200 per share in 2027.",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
76,The city lights twinkled at night time.,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
77,The little boy fed the hungry birds.,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
78,"In 2024/08/20, Senator James Davis from the Senate Committee on Energy and Natural Resources, forecasts that the renewable energy consumption will increase from 20% to 50% in 2028.",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.130450,...,0.0,0.118971,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [5]:
# encoded_word_level_tags_entities_df.head(7)

In [6]:
# The first two columns (sentence and label) were confirmed above to be identical in both dataframes,
# so we keep the copy in tfidf_vectorized_features_df and drop them from the encodings here.
# If we don't, the sentence and label cols will be duplicated once the frames are combined.

updated_encoded_word_level_tags_entities_df = encoded_word_level_tags_entities_df.iloc[:, 2:]
updated_encoded_word_level_tags_entities_df

Unnamed: 0,AUX,PUNCT,DET,NUM,VERB,ADP,SYM,PROPN,ADJ,NOUN,...,TIME_1,DATE_3,PERCENT_2,QUANTITY_1,DATE_1,ORG_2,GPE_1,ORG_1,MONEY_1,PERCENT_1
0,0,1,1,0,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,1,0,0,1,1,0
2,1,1,1,1,1,1,1,1,0,1,...,0,0,0,0,1,0,0,1,0,1
3,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,1,1,1
4,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,1,0,0,1,1,1
76,0,1,1,0,1,1,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
77,0,1,1,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
78,1,1,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,1,0,1


In [7]:
# Join the TF-IDF frame (sentence, label, term weights) with the POS/NER encodings column-wise.
# NOTE(review): judging by the displayed result, concat_dfs relabels the columns as integers
# (0..473), so later cells address the sentence as column 0 and the label as column 1 - confirm
# against the DataProcessing.concat_dfs implementation.
sentence_label_features_df = DataProcessing.concat_dfs([tfidf_vectorized_features_df, updated_encoded_word_level_tags_entities_df], axis=1)
sentence_label_features_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,464,465,466,467,468,469,470,471,472,473
0,The music echoed through the empty hall.,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
1,"According to a policy analyst, Emily Chen, from the Congressional Budget Office, on 2024-08-22, the federal budget deficit is expected to decrease beyond $1 trillion in the timeframe of Q4 of 2027.",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133298,...,0,0,0,0,1,0,0,1,1,0
2,"On 2024-10-15, Dr. David Lee, a health expert, predicts that the obesity rate at the World Health Organization will likely decrease by 3% in Q2 of 2026.",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0,0,0,0,1,0,0,1,0,1
3,"According to a senior level person from 3M, on 2024/08/22, the operating income is expected to increase as much as $500 million, reflecting a 20% increase, in the timeframe of Q2 of 2029.",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137857,...,0,0,0,0,0,0,0,1,1,1
4,"On 2024-10-15, Rachel Patel, a financial analyst, predicts that the operating income at General Motors will likely increase by $5 billion in Q2 of 2026.",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0,0,0,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,"Michael Davis, a top executive, predicts on 15 October 2024 that the stock price at Visa may rise by 15% to $200 per share in 2027.",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0,0,0,0,1,0,0,1,1,1
76,The city lights twinkled at night time.,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,1,0,0,0,0,0,0,0,0,0
77,The little boy fed the hungry birds.,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
78,"In 2024/08/20, Senator James Davis from the Senate Committee on Energy and Natural Resources, forecasts that the renewable energy consumption will increase from 20% to 50% in 2028.",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.130450,...,0,0,0,0,0,0,0,1,0,1


## Models

1. Perceptron
    - $ x_n \in X $, `tfidf_vectorized_features`
        - N, `tfidf_vectorized_features_n`. Each row (formally document).
        - D, `tfidf_vectorized_features_d`. Each column (formally unique terms/features)
        - Thus, $ X \in R^{N \times D} $
    - $ w $, Weights, which are initialized by sklearn (to zeros by default)
        - Rows: D, `tfidf_vectorized_features_d`. One weight per column (formally unique terms/features)
        - Columns: 1
        - Thus, $ w \in R^{D \times 1} $ and $ w^T \in R^{1 \times D} $
    
    $$
    (w^T \cdot x_n) \Rightarrow (1 \times D) \cdot (D \times 1) = (1 \times 1), \quad \text{e.g. } (1 \times 100) \cdot (100 \times 1)
    $$

### Split Data

In [8]:
# Split the combined frame into train and test partitions.
# The target is column 1 (the prediction label). Column 2 is the first TF-IDF
# feature, so passing it as `y` would hand back feature values as y_train / y_test.
# NOTE(review): X_train / X_test still carry the sentence (column 0) and the label
# (column 1); they are separated from the numeric features further below.
X_train, X_test, y_train, y_test = DataProcessing.split_data(sentence_label_features_df, sentence_label_features_df[1])
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,464,465,466,467,468,469,470,471,472,473
73,"According to a financial expert from Cisco, on 08/20/2024, the gross profit is expected to increase beyond $10 million in the timeframe of Q4 of 2027.",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.171061,...,0,0,0,0,1,0,1,0,1,0
61,"On 2024/10/12, policy reporter, Olivia Patel, predicts that the unemployment rate at the United Kingdom will likely decrease by 1.2% in Q1 of 2026.",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0,0,0,0,1,0,1,0,0,1
55,The flowers smelled sweet in the garden.,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
40,Dr. Michael Brown predicts on 2024/08/22 that the prevalence of hypertension in the United Kingdom will fall by 10% in 2028.,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188957,...,0,0,0,0,0,0,1,0,0,1
9,He ate a healthy breakfast every morning.,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20,The little girl played with her dolls house.,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
60,Dr. John Smith predicts on 08/20/2024 that the global vaccination rate for influenza in the United States should stay stable at 90% in 2027.,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152327,...,0,0,0,0,1,0,1,0,0,1
71,The bright sun shone through the window.,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
14,"In Q4 of 2027, the number of students enrolled in online courses in the United States is expected to be 10 million, which is a 25% increase, as predicted by Daniel Hall, a policy analyst, on 2024-08-24.",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.130468,...,0,0,0,0,1,0,1,0,0,1


In [9]:
def split_sentence_label_features(df: pd.DataFrame) -> tuple:
    """Split a combined frame into sentences, labels and feature columns.

    The frame is expected to be laid out positionally as
    ``[sentence, prediction label, feature_1, ..., feature_k]``.

    All three parts are selected by position, so the function behaves the same
    on frames whose columns are labelled ``0, 1, 2, ...`` and on frames that
    kept their original column names.

    Args:
        df: Combined frame with at least two columns.

    Returns:
        A 3-tuple ``(sentences, prediction_labels, features_df)``: the first
        column as a Series, the second column as a Series, and a DataFrame
        holding every remaining column. The row index is preserved.
    """
    sentences = df.iloc[:, 0]
    prediction_labels = df.iloc[:, 1]
    features_df = df.iloc[:, 2:]
    return sentences, prediction_labels, features_df

# Separate the training partition into its sentences, labels and numeric feature columns.
X_train_sentences, y_train_prediction_labels, X_train_features_df = split_sentence_label_features(X_train)

In [10]:
# Training labels, taken from column 1 of X_train.
y_train_prediction_labels

73    1
61    1
55    0
40    1
9     0
     ..
20    0
60    1
71    0
14    1
51    1
Name: 1, Length: 64, dtype: int64

In [11]:
# Same separation for the test partition; display the held-out sentences.
X_test_sentences, y_test_prediction_labels, X_test_features_df = split_sentence_label_features(X_test)
X_test_sentences

30                                                                                                                                                          They sat together on the couch watching TV.
0                                                                                                                                                              The music echoed through the empty hall.
22                                                                                                                                                             The dog wagged its tail with excitement.
31                                                                                                                                                               They shared a slice of pizza at lunch.
18           In 2028-03-01, the average wind speed in Chicago has a probability of 10% to decrease, as predicted by Samantha Brown, a weather analyst from the National Weather Service, on 2024-07-22.


In [12]:
# Ground-truth labels of the test partition, taken from column 1 of X_test.
y_test_prediction_labels

30    0
0     0
22    0
31    0
18    1
28    0
10    1
70    1
4     1
12    0
49    0
33    0
67    1
35    1
68    1
45    0
Name: 1, dtype: int64

In [13]:
# perception_model = SkLearnPerceptronModel()

# perception_model.train_model(X_train_features_df, y_train_prediction_labels)
# y_predictions = perception_model.predict(X_test_features_df)
# y_predictions.to_numpy().ravel()

In [14]:
# Pair every test sentence with its ground-truth label. `keys` names the two
# resulting columns directly instead of renaming them afterwards.
model_predictions_df = pd.concat(
    [X_test_sentences, y_test_prediction_labels],
    axis=1,
    keys=['Sentence', 'Actual Label'],
)

model_predictions_df

Unnamed: 0,Sentence,Actual Label
30,They sat together on the couch watching TV.,0
0,The music echoed through the empty hall.,0
22,The dog wagged its tail with excitement.,0
31,They shared a slice of pizza at lunch.,0
18,"In 2028-03-01, the average wind speed in Chicago has a probability of 10% to decrease, as predicted by Samantha Brown, a weather analyst from the National Weather Service, on 2024-07-22.",1
28,He rode his bike down the steep hill.,0
10,"According to a top executive from the Meteorological Service of Canada, on 2024-11-25, the snowfall levels in Toronto are expected to increase beyond 20 inches in the timeframe of 2026-02-01.",1
70,"On Wednesday, November 20, 2024, Kevin White, a financial analyst, predicts that the net profit at AT&T will decrease by 5% to $3.5 billion in Q1 of 2026.",1
4,"On 2024-10-15, Rachel Patel, a financial analyst, predicts that the operating income at General Motors will likely increase by $5 billion in Q2 of 2026.",1
12,He listened to his favorite podcast episode.,0


In [15]:
# model_predictions_df['Perceptron Predicted Label'] = y_predictions.to_numpy().ravel()
# model_predictions_df

In [16]:
# sgdc_model = SkLearnSGDClassifier()

# sgdc_model.train_model(X_train_features_df, y_train_prediction_labels)
# sgdc_predictions = sgdc_model.predict(X_test_features_df)
# sgdc_predictions.to_numpy().ravel()

In [17]:
# model_predictions_df['SGDC Predicted Label'] = sgdc_predictions.to_numpy().ravel()
# model_predictions_df

In [18]:
# log_reg_model = SkLearnLogisticRegression()

# log_reg_model.train_model(X_train_features_df, y_train_prediction_labels)
# log_reg_model_predictions = log_reg_model.predict(X_test_features_df)
# # log_reg_model_predictions.to_numpy().ravel()
# model_predictions_df['Logistic Regression Predicted Label'] = log_reg_model_predictions.to_numpy().ravel()
# model_predictions_df

In [19]:
# eval_metric = EvaluationMetric()
# eval_metric.eval_classification_report(y_true=y_test_prediction_labels, y_prediction=y_predictions)
# eval_metric.eval_classification_report(y_test_prediction_labels, sgdc_predictions)

# perceptron_metrics = eval_metric.custom_evaluation_metrics(y_true=y_test_prediction_labels, y_prediction=y_predictions)
# print(perceptron_metrics)
# sgdc_metrics = eval_metric.custom_evaluation_metrics(y_test_prediction_labels, sgdc_predictions)
# print(sgdc_metrics)
# log_reg_model_metrics = eval_metric.custom_evaluation_metrics(y_test_prediction_labels, log_reg_model_predictions)
# log_reg_model_metrics

In [20]:
# model_metrics_df = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# model_metrics_df['Model'] = ['Perceptron', 'SGDC', 'Logistic Regression']
# model_metrics_df['Accuracy'] = [perceptron_metrics['Accuracy'], sgdc_metrics['Accuracy'], log_reg_model_metrics['Accuracy']]
# model_metrics_df['Precision'] = [perceptron_metrics['Precision'], sgdc_metrics['Precision'], log_reg_model_metrics['Precision']]
# model_metrics_df['Recall'] = [perceptron_metrics['Recall'], sgdc_metrics['Recall'], log_reg_model_metrics['Recall']]
# model_metrics_df['F1 Score'] = [perceptron_metrics['F1 Score'], sgdc_metrics['F1 Score'], log_reg_model_metrics['F1 Score']]

# model_metrics_df 

In [21]:
# Rebuild the sentence / actual-label frame from the test partition so the
# prediction columns added in the next cell start from a clean frame.
model_predictions_df = pd.concat(
    [X_test_sentences, y_test_prediction_labels],
    axis=1,
    keys=['Sentence', 'Actual Label'],
)

model_predictions_df

Unnamed: 0,Sentence,Actual Label
30,They sat together on the couch watching TV.,0
0,The music echoed through the empty hall.,0
22,The dog wagged its tail with excitement.,0
31,They shared a slice of pizza at lunch.,0
18,"In 2028-03-01, the average wind speed in Chicago has a probability of 10% to decrease, as predicted by Samantha Brown, a weather analyst from the National Weather Service, on 2024-07-22.",1
28,He rode his bike down the steep hill.,0
10,"According to a top executive from the Meteorological Service of Canada, on 2024-11-25, the snowfall levels in Toronto are expected to increase beyond 20 inches in the timeframe of 2026-02-01.",1
70,"On Wednesday, November 20, 2024, Kevin White, a financial analyst, predicts that the net profit at AT&T will decrease by 5% to $3.5 billion in Q1 of 2026.",1
4,"On 2024-10-15, Rachel Patel, a financial analyst, predicts that the operating income at General Motors will likely increase by $5 billion in Q2 of 2026.",1
12,He listened to his favorite podcast episode.,0


In [22]:
# Every model is trained and scored on the same train/test partition, so the
# four data arguments are bundled once and unpacked into each call.
train_test_args = (
    X_train_features_df,
    y_train_prediction_labels,
    X_test_features_df,
    y_test_prediction_labels,
)

# Train and evaluate Perceptron model
perceptron_metrics, perceptron_predictions = train_and_evaluate_model("perceptron", *train_test_args)
model_predictions_df['Perceptron Predicted Label'] = perceptron_predictions.to_numpy().ravel()

# Train and evaluate SGDClassifier model
sgdc_metrics, sgdc_predictions = train_and_evaluate_model("sgdclassifier", *train_test_args)
model_predictions_df['SGDC Predicted Label'] = sgdc_predictions.to_numpy().ravel()

# Train and evaluate Logistic Regression model
log_reg_metrics, log_reg_predictions = train_and_evaluate_model("logistic regression", *train_test_args)
model_predictions_df['Logistic Regression Predicted Label'] = log_reg_predictions.to_numpy().ravel()

# Combine metrics into a DataFrame: one row per model, limited to the reported columns
metric_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
model_metrics_df = pd.DataFrame([perceptron_metrics, sgdc_metrics, log_reg_metrics])[metric_columns]

# Display the metrics
model_metrics_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Perceptron Model,0.9375,1.0,0.857143,0.923077
1,SDGClassifier Model,1.0,1.0,1.0,1.0
2,Logistic Regression Model,1.0,1.0,1.0,1.0


In [23]:
# Side-by-side view of the actual label and each model's predicted label per test sentence.
model_predictions_df

Unnamed: 0,Sentence,Actual Label,Perceptron Predicted Label,SGDC Predicted Label,Logistic Regression Predicted Label
30,They sat together on the couch watching TV.,0,0,0,0
0,The music echoed through the empty hall.,0,0,0,0
22,The dog wagged its tail with excitement.,0,0,0,0
31,They shared a slice of pizza at lunch.,0,0,0,0
18,"In 2028-03-01, the average wind speed in Chicago has a probability of 10% to decrease, as predicted by Samantha Brown, a weather analyst from the National Weather Service, on 2024-07-22.",1,1,1,1
28,He rode his bike down the steep hill.,0,0,0,0
10,"According to a top executive from the Meteorological Service of Canada, on 2024-11-25, the snowfall levels in Toronto are expected to increase beyond 20 inches in the timeframe of 2026-02-01.",1,0,1,1
70,"On Wednesday, November 20, 2024, Kevin White, a financial analyst, predicts that the net profit at AT&T will decrease by 5% to $3.5 billion in Q1 of 2026.",1,1,1,1
4,"On 2024-10-15, Rachel Patel, a financial analyst, predicts that the operating income at General Motors will likely increase by $5 billion in Q2 of 2026.",1,1,1,1
12,He listened to his favorite podcast episode.,0,0,0,0
