# Prediction Observation Classification Pipeline

- **Goal:** Classify each sentence as a prediction (`Sentence Label` = 1) or an observation (`Sentence Label` = 0).

In [1]:
import os
import sys

import importlib.util

import pandas as pd


from pathlib import Path
from IPython.display import Image

# Directory the notebook kernel is running in; later cells build paths from it.
notebook_dir = os.getcwd()
# Make modules in the parent directory importable. The append is guarded so
# that re-running this cell does not pile up duplicate sys.path entries.
_parent_dir = os.path.join(notebook_dir, '../')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

import log_files
from data_processing import DataProcessing
from feature_extraction import TfidfFeatureExtraction, SpacyFeatureExtraction

# Locate the project-local classification_models.py relative to the notebook
# directory instead of through a machine-specific absolute path, so the notebook
# runs from any checkout. The project root is the parent of notebook_dir (the
# same directory that was added to sys.path above).
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))
module_path = os.path.join(project_root, "classification_models.py")

# Dynamically load the module so it overrides any installed package
spec = importlib.util.spec_from_file_location("classification_models", module_path)
if spec is None or spec.loader is None:
    # spec_from_file_location returns None when no loader can handle the path.
    raise ImportError(f"Cannot load classification_models from {module_path}")
classification_models = importlib.util.module_from_spec(spec)
spec.loader.exec_module(classification_models)

# Inject into sys.modules so pickle (and the import below) use this local copy
sys.modules["classification_models"] = classification_models

# This import is now served from sys.modules, i.e. from the file loaded above.
from classification_models import SkLearnPerceptronModel, SkLearnSGDClassifier, EvaluationMetric

# Model instances shared by the training and inference cells further down.
perception_model = SkLearnPerceptronModel()
sgd_model = SkLearnSGDClassifier()

In [2]:
# Image(filename='../misc/base_pipeline.png')

## 1-Data Acquisition

In [3]:
# Widen pandas' display limits: up to 800 characters per cell, and no cap on
# the number of columns or rows rendered.
_display_options = {
    'max_colwidth': 800,
    'display.max_columns': None,
    'display.max_rows': None,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [4]:
# Load the logged batches found under data/prediction_logs; read_data receives
# the notebook directory, the log sub-path and a boolean flag.
# NOTE(review): the flag presumably marks these batches as predictions rather
# than observations — confirm in log_files.read_data.
log_file_path = "data/prediction_logs"
predictions = True
predictions_df = log_files.read_data(notebook_dir, log_file_path, predictions)
# Preview the first seven rows.
predictions_df.head(7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/prediction_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/prediction_logs/batch_1-prediction
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/prediction_logs/batch_1-prediction/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/prediction_logs/batch_2-prediction
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/prediction_logs/batch_2-prediction/batch_2-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/prediction_logs/batch_3-pre

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3
3,"According to Goldman Sachs, the research and development expenses at Facebook would fall in 2025.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,4
4,"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,5
5,"The stock price at Visa should stay same in Q2 of 2026, according to Wells Fargo.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,6
6,JPMorgan forecasts that the revenue at Microsoft potentially decrease in Q3 of 2027.,1,finance,llama-3.3-70b-instruct,NAVI_GATOR,0,1


In [5]:
# Load the logged batches found under data/observation_logs, using the same
# read_data call as for the predictions but with the boolean flag set to False.
# Note that log_file_path and predictions are rebound here, replacing the
# values assigned in the previous cell.
log_file_path = "data/observation_logs"
predictions = False
observations_df = log_files.read_data(notebook_dir, log_file_path, predictions)
# Preview the first seven rows.
observations_df.head(7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/observation_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/observation_logs/batch_1-observation
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/observation_logs/batch_1-observation/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/observation_logs/batch_2-observation
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/observation_logs/batch_2-observation/batch_2-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/observation_logs/b

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,The financial analyst at Goldman Sachs observed that the operating income at Tesla had increased in the first quarter of 2024.,0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On 2024-08-20 to 2025-08-20, Morgan Stanley speculates the stock price at Amazon will likely rise.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"A young investor predicts on 2025-03-15, the S&P 500 index may rise.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3
3,"According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,4
4,"In 2027-01-01 to 2027-12-31, Wells Fargo envisions that the interest rates at the Federal Reserve have some probability to remain stable.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,5
5,"The trading volume at Apple should stay same in the fourth quarter of 2025, according to a financial expert at JPMorgan Chase.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,6
6,JPMorgan observed that the net profit at Microsoft had risen in September 2023.,0,finance,llama-3.3-70b-instruct,NAVI_GATOR,0,1


## 2-Data Processing

In [6]:
# Merge the prediction and observation frames into one dataset and shuffle it
# before feature extraction.
# NOTE(review): no random seed is passed to shuffle_df here; whether the row
# order is reproducible depends on its implementation — confirm.
pred_obs_dfs = [predictions_df, observations_df]
base_df = DataProcessing.concat_dfs(pred_obs_dfs)
shuffled_base_df = DataProcessing.shuffle_df(base_df)
shuffled_base_df.head(7)

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,"In Q3 of 2029, policy strategist Rachel Patel reported that her green tech subsidy approvals declined.",0,policy,llama-3.1-8b-instant,GROQ_CLOUD,0,5
1,"On 01/01/2024, the miscellaneous top executive speculates that the number of remote workers in the tech sector will likely increase due to the continued adoption of flexible work arrangements.",1,miscellaneous,mixtral-8x7b-instruct,NAVI_GATOR,0,2
2,"According to the study conducted at Stanford University, the average body mass index at California high schools rose on 9/18/2049.",0,health,llama-3.1-8b-instruct,NAVI_GATOR,0,4
3,Professor Rachel Kim forecasts that the stock prices at Goldman Sachs will decrease in 2025.,1,miscellaneous,llama-3.1-8b-instant,GROQ_CLOUD,0,1
4,"On 2/10/2029, the World Health Organization monitored the obesity rates at urban schools changed.",0,health,llama-3.1-8b-instant,GROQ_CLOUD,0,2
5,"On August 21, 2024, Marketing expert Emily Patel speculates the customer engagement at Coca-Cola will likely increase.",1,miscellaneous,llama-3.1-70b-instruct,NAVI_GATOR,0,2
6,"According to Historian Elizabeth Rodriguez, the number of artifacts discovered at the dig site rose in June 2021.",0,miscellaneous,gemma2-9b-it,GROQ_CLOUD,0,4


## 3-Feature Extraction

### TF-IDF

In [7]:
# Feature budget handed to the TF-IDF extractor; the same value is reused by
# the inference cell at the end of the notebook.
# NOTE(review): 117 is a magic number — presumably tuned empirically; confirm
# and consider documenting how it was chosen.
max_features = 117

# Extract TF-IDF features from the 'Base Sentence' column of the shuffled data.
tf_idf_feature_extractor = TfidfFeatureExtraction(shuffled_base_df, 'Base Sentence')
tfidf_vectorized_features = tf_idf_feature_extractor.word_feature_extraction(max_features)
# The scores frame keeps 'Base Sentence' and 'Sentence Label' as its first two
# columns, followed by the per-term score columns (see the preview below).
tfidf_vectorized_features_df = tf_idf_feature_extractor.feature_scores(max_features)
tfidf_vectorized_features_df.head(3)

Unnamed: 0,Base Sentence,Sentence Label,01,08,09,10,12,15,20,2022,2023,2024,2025,2026,2027,2028,2029,21,according,advisor,amazon,analyst,and,angeles,at,august,average,brown,changed,chen,chicago,coach,david,davis,decrease,decreased,dr,emily,envisioned,envisions,expert,fall,fell,financial,for,forecasts,from,had,has,health,in,increase,increased,james,january,johnson,kim,lee,levels,likely,los,may,meteorologist,michael,monitored,national,new,noted,number,obesity,observed,of,on,operating,patel,percentage,policy,potentially,precipitation,predicts,price,probability,professor,profit,q2,q3,q4,quarter,rachel,rate,rates,remain,remained,research,revenue,rise,rodriguez,rose,same,senior,should,some,speculates,stable,state,stay,stock,temperature,that,the,to,university,urban,weather,will,wind,world,would,york
0,"In Q3 of 2029, policy strategist Rachel Patel reported that her green tech subsidy approvals declined.",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.389193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.153657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21784,0.0,0.0,0.438319,0.0,0.444896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.354558,0.0,0.0,0.473508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.193617,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"On 01/01/2024, the miscellaneous top executive speculates that the number of remote workers in the tech sector will likely increase due to the continued adoption of flexible work arrangements.",1,0.651879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.099094,0.250992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.24352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.292097,0.0,0.0,0.280972,0.133192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.261062,0.0,0.0,0.0,0.0,0.0,0.124864,0.295138,0.126518,0.0,0.0,0.0,0.226767,0.0,0.0,0.0,0.0
2,"According to the study conducted at Stanford University, the average body mass index at California high schools rose on 9/18/2049.",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.256524,0.0,0.0,0.0,0.0,0.0,0.340881,0.0,0.456754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.444133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25516,0.21876,0.496101,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Models

1. Perceptron
2. SGD classifier

---

- Split: 80% train and 20% test

In [8]:
# The whole scores frame is passed as X, so X_train / X_test still contain the
# 'Base Sentence' and 'Sentence Label' columns alongside the TF-IDF columns;
# split_sentence_label_features (next cell) separates them before training.
# NOTE(review): the 80/20 ratio stated above is presumably the default inside
# DataProcessing.split_data — confirm.
X_train, X_test, y_train, y_test = DataProcessing.split_data(tfidf_vectorized_features_df, tfidf_vectorized_features_df['Sentence Label'])
X_train.head(3)

Unnamed: 0,Base Sentence,Sentence Label,01,08,09,10,12,15,20,2022,2023,2024,2025,2026,2027,2028,2029,21,according,advisor,amazon,analyst,and,angeles,at,august,average,brown,changed,chen,chicago,coach,david,davis,decrease,decreased,dr,emily,envisioned,envisions,expert,fall,fell,financial,for,forecasts,from,had,has,health,in,increase,increased,james,january,johnson,kim,lee,levels,likely,los,may,meteorologist,michael,monitored,national,new,noted,number,obesity,observed,of,on,operating,patel,percentage,policy,potentially,precipitation,predicts,price,probability,professor,profit,q2,q3,q4,quarter,rachel,rate,rates,remain,remained,research,revenue,rise,rodriguez,rose,same,senior,should,some,speculates,stable,state,stay,stock,temperature,that,the,to,university,urban,weather,will,wind,world,would,york
1607,"According to the miscellaneous analyst, the number of cybersecurity threats at XYZ Corporation rose in Q2 of 2023.",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.327615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217083,0.0,0.0,0.310107,0.0,0.0,0.144235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.427409,0.0,0.0,0.411129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.365628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.375848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.215929,0.185126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1518,"The success rate of startups in Silicon Valley should stay the same in 2025, according to Venture Capitalist, Emily Wong.",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.319283,0.0,0.0,0.0,0.0,0.0,0.211562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.364559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.200336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.330454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.357388,0.0,0.384753,0.0,0.0,0.0,0.0,0.372281,0.0,0.0,0.0,0.210437,0.180417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1366,"According to the lead forecaster, the precipitation levels at the observatory rose in November 2027.",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.366738,0.0,0.0,0.0,0.233921,0.0,0.0,0.0,0.0,0.0,0.155423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.435943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.496525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.405001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.349016,0.199485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
def split_sentence_label_features(df: pd.DataFrame) -> tuple:
    """Separate a vectorized frame into sentences, labels and feature columns.

    Args:
        df: Frame containing a 'Base Sentence' column, a 'Sentence Label'
            column and one column per extracted feature.

    Returns:
        A ``(sentences, prediction_labels, features_df)`` tuple. The first two
        items are the 'Base Sentence' and 'Sentence Label' Series;
        ``features_df`` holds every remaining column in its original order.

    Raises:
        KeyError: If either required column is missing from ``df``.
    """
    sentences = df['Base Sentence']
    prediction_labels = df['Sentence Label']
    # Remove the non-feature columns by name rather than by position, so the
    # label cannot leak into the features if the column order changes upstream.
    features_df = df.drop(columns=['Base Sentence', 'Sentence Label'])
    return sentences, prediction_labels, features_df

X_train_sentences, y_train_prediction_labels, X_train_features_df = split_sentence_label_features(X_train)

In [10]:
X_test_sentences, y_test_prediction_labels, X_test_features_df = split_sentence_label_features(X_test)
X_test_sentences

613                                                                                                                 The batting average at the Chicago Cubs rose on 09/15/2024, according to Analyst Rachel Hall.
1234                                                                                                                 Dr. Robert Martinez noted on March 15, 2027, the interest rates at the Federal Reserve fell.
892                                                                                                                          Analyst Emily Patel predicts on 08/15/2024, the goal average at Barcelona will rise.
296                                                               According to a study published in the Journal of Nutrition, the fruit consumption at urban elementary schools would fall in the autumn of 2028.
611                                                                                                          The National Weather Service observed that the temp

In [11]:
y_test_prediction_labels

613     0
1234    0
892     1
296     0
611     0
93      0
239     0
2327    0
964     0
596     1
1047    0
1893    1
636     1
931     0
2013    0
1441    0
1023    1
2221    0
2042    0
1387    1
1545    0
2141    0
2334    0
420     0
2078    1
1004    1
838     0
621     0
134     0
1729    1
486     0
179     0
1302    1
1128    0
1659    0
259     0
1988    1
1058    0
1210    1
210     0
478     0
2380    0
196     1
1553    0
1769    0
367     0
742     0
2216    1
2409    0
2033    0
839     0
1550    0
2018    1
1137    0
1381    0
56      0
1744    0
1670    0
1812    0
237     0
598     0
1697    0
1189    0
457     1
1738    0
1867    0
507     0
480     1
532     1
1333    0
1104    0
1124    0
808     0
1313    0
1199    0
1765    0
2034    0
1675    1
651     0
1621    0
408     0
1702    0
1220    1
812     1
1056    0
1054    0
1822    1
1179    0
1221    0
1665    0
44      0
229     0
1745    1
873     0
978     0
495     0
1811    1
1879    0
789     0
443     0


In [12]:
# Fit the perceptron instantiated in the setup cell on the training features,
# then predict labels for the held-out test features.
perception_model.train_model(X_train_features_df, y_train_prediction_labels)
perceptron_predictions = perception_model.predict(X_test_features_df)
perceptron_predictions

  ret = a @ b
  ret = a @ b
  ret = a @ b


0      0
1      0
2      1
3      0
4      0
5      0
6      0
7      0
8      0
9      1
10     0
11     1
12     0
13     0
14     0
15     0
16     1
17     1
18     1
19     0
20     0
21     0
22     0
23     0
24     0
25     1
26     1
27     0
28     0
29     1
30     0
31     0
32     1
33     0
34     0
35     0
36     0
37     0
38     1
39     0
40     0
41     0
42     1
43     0
44     0
45     0
46     0
47     1
48     0
49     0
50     0
51     0
52     0
53     0
54     0
55     0
56     0
57     0
58     0
59     0
60     0
61     0
62     1
63     1
64     0
65     0
66     0
67     1
68     0
69     0
70     0
71     0
72     0
73     0
74     0
75     0
76     0
77     1
78     0
79     0
80     0
81     0
82     1
83     1
84     0
85     0
86     1
87     0
88     0
89     0
90     0
91     0
92     1
93     0
94     0
95     0
96     1
97     0
98     0
99     0
100    0
101    0
102    0
103    1
104    1
105    1
106    1
107    0
108    0
109    0
110    0
1

In [13]:
# Fit the SGD classifier instantiated in the setup cell on the same training
# features, then predict labels for the held-out test features.
sgd_model.train_model(X_train_features_df, y_train_prediction_labels)
sgd_predictions = sgd_model.predict(X_test_features_df)
sgd_predictions

  ret = a @ b
  ret = a @ b
  ret = a @ b


0      0
1      0
2      1
3      0
4      0
5      0
6      0
7      0
8      0
9      1
10     0
11     1
12     1
13     0
14     0
15     0
16     1
17     1
18     0
19     1
20     0
21     0
22     0
23     0
24     0
25     1
26     0
27     0
28     0
29     1
30     0
31     0
32     0
33     0
34     0
35     0
36     1
37     0
38     1
39     0
40     0
41     0
42     1
43     0
44     0
45     0
46     0
47     1
48     0
49     0
50     0
51     0
52     0
53     0
54     0
55     0
56     0
57     0
58     0
59     0
60     0
61     0
62     1
63     1
64     0
65     0
66     0
67     1
68     0
69     0
70     0
71     0
72     0
73     0
74     0
75     0
76     0
77     0
78     0
79     0
80     0
81     0
82     1
83     1
84     0
85     0
86     1
87     0
88     0
89     0
90     0
91     0
92     1
93     0
94     0
95     0
96     1
97     0
98     0
99     0
100    0
101    0
102    0
103    1
104    1
105    1
106    1
107    0
108    0
109    1
110    0
1

In [14]:
# Put each test sentence next to its ground-truth label; the `keys` argument
# names the two resulting columns directly.
model_predictions_df = pd.concat(
    [X_test_sentences, y_test_prediction_labels],
    axis=1,
    keys=['Sentence', 'Actual Label'],
)

model_predictions_df

Unnamed: 0,Sentence,Actual Label
613,"The batting average at the Chicago Cubs rose on 09/15/2024, according to Analyst Rachel Hall.",0
1234,"Dr. Robert Martinez noted on March 15, 2027, the interest rates at the Federal Reserve fell.",0
892,"Analyst Emily Patel predicts on 08/15/2024, the goal average at Barcelona will rise.",1
296,"According to a study published in the Journal of Nutrition, the fruit consumption at urban elementary schools would fall in the autumn of 2028.",0
611,The National Weather Service observed that the temperature in Phoenix remained stable on 08/21/2024.,0
93,"On 2/15/2029, the Congressional Budget Office speculated the budget allocation at federal agencies would likely increase.",0
239,"The CEO noted on 03/15/2023, the company's stock price fell.",0
2327,Morgan Stanley observed that the revenue at FUBU had increased for Q3 2028.,0
964,"In 2025, the research team envisioned that the air pressure at the weather station decreased.",0
596,"On 08/21/2024, policy analyst John Kim speculates that the number of clean energy initiatives at Fortune 500 companies will rise.",1


In [15]:
# Attach both models' predictions to the comparison table. The prediction
# objects are indexed 0..n-1 while model_predictions_df keeps the original row
# ids of the test split, so the values are flattened to plain arrays and
# assigned by position instead of being aligned on the index.
model_predictions_df['Perceptron Predicted Label'] = perceptron_predictions.to_numpy().ravel()
model_predictions_df['SGD Predicted Label'] = sgd_predictions.to_numpy().ravel()
model_predictions_df

Unnamed: 0,Sentence,Actual Label,Perceptron Predicted Label,SGD Predicted Label
613,"The batting average at the Chicago Cubs rose on 09/15/2024, according to Analyst Rachel Hall.",0,0,0
1234,"Dr. Robert Martinez noted on March 15, 2027, the interest rates at the Federal Reserve fell.",0,0,0
892,"Analyst Emily Patel predicts on 08/15/2024, the goal average at Barcelona will rise.",1,1,1
296,"According to a study published in the Journal of Nutrition, the fruit consumption at urban elementary schools would fall in the autumn of 2028.",0,0,0
611,The National Weather Service observed that the temperature in Phoenix remained stable on 08/21/2024.,0,0,0
93,"On 2/15/2029, the Congressional Budget Office speculated the budget allocation at federal agencies would likely increase.",0,0,0
239,"The CEO noted on 03/15/2023, the company's stock price fell.",0,0,0
2327,Morgan Stanley observed that the revenue at FUBU had increased for Q3 2028.,0,0,0
964,"In 2025, the research team envisioned that the air pressure at the weather station decreased.",0,0,0
596,"On 08/21/2024, policy analyst John Kim speculates that the number of clean energy initiatives at Fortune 500 companies will rise.",1,1,1


## Evaluation

In [16]:
get_metrics = EvaluationMetric()

In [17]:
metrics = get_metrics.eval_classification_report(y_test_prediction_labels, perceptron_predictions)
metrics

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       363
           1       0.82      0.75      0.79       122

    accuracy                           0.90       485
   macro avg       0.87      0.85      0.86       485
weighted avg       0.89      0.90      0.90       485



In [18]:
metrics = get_metrics.eval_classification_report(y_test_prediction_labels, sgd_predictions)
metrics

              precision    recall  f1-score   support

           0       0.96      0.95      0.95       363
           1       0.85      0.88      0.86       122

    accuracy                           0.93       485
   macro avg       0.90      0.91      0.91       485
weighted avg       0.93      0.93      0.93       485



In [20]:
# import pickle
# base_dir = os.path.join(notebook_dir, '../models')
# sgd_path = os.path.join(base_dir, 'sgd_model_117.pkl')
# with open(sgd_path, 'wb') as file:
#     pickle.dump(sgd_model, file)

# sgd_path

## Further Exploration

In [21]:
custom_sentences = [
    "Emily Chen forecasts that the net profit at Tesla (TSLA) will decrease by 15% to $5 billion in FY 2027.",
    "Raj Patel speculates that the stock price of ExxonMobil (XOM) could rise by 8% to $120 by Q4 of 2026.",
    "There is a high probability that the revenue at Microsoft (MSFT) will reach $200 billion in FY 2029.",
    "On Thursday, April 10, 2025, Sarah Lee envisions that the operating income at Apple (AAPL) will increase by 10% to $80 billion in FY 2026.",
    "Michael Brown predicts that the dividend yield at Chevron (CVX) will rise to 5% by Q3 of 2027.",
    "It is anticipated that the market share of Alphabet (GOOGL) will grow by 3% in FY 2028.",
    "Hey, how are you?",
    "Joe Hall thinks that the earnings before interest and taxes (EBIT) at 3M (MMM) will decrease by 90% to $10 million in FY 2028, which Raj stated on Monday, December 16, 2024.",
    "Malique Mell, on Monday, December 16, 2024, predicted that the earnings before interest and taxes (EBIT) at 3M (MMM) will drop by 90%, reaching $10 million in FY 2028.",
    "Raj Jensen predicts that the earnings before interest and taxes (EBIT) at 3M (MMM) will decrease by 90% to $10 million in FY 2028.",
    "The weather today is sunny with a chance of rain in the evening.",
    "I enjoy reading books and watching movies during my free time."
]

data = {"Base Sentence": custom_sentences}
custom_sentences_df = pd.DataFrame(data)
custom_sentences_df

Unnamed: 0,Base Sentence
0,Emily Chen forecasts that the net profit at Tesla (TSLA) will decrease by 15% to $5 billion in FY 2027.
1,Raj Patel speculates that the stock price of ExxonMobil (XOM) could rise by 8% to $120 by Q4 of 2026.
2,There is a high probability that the revenue at Microsoft (MSFT) will reach $200 billion in FY 2029.
3,"On Thursday, April 10, 2025, Sarah Lee envisions that the operating income at Apple (AAPL) will increase by 10% to $80 billion in FY 2026."
4,Michael Brown predicts that the dividend yield at Chevron (CVX) will rise to 5% by Q3 of 2027.
5,It is anticipated that the market share of Alphabet (GOOGL) will grow by 3% in FY 2028.
6,"Hey, how are you?"
7,"Joe Hall thinks that the earnings before interest and taxes (EBIT) at 3M (MMM) will decrease by 90% to $10 million in FY 2028, which Raj stated on Monday, December 16, 2024."
8,"Malique Mell, on Monday, December 16, 2024, predicted that the earnings before interest and taxes (EBIT) at 3M (MMM) will drop by 90%, reaching $10 million in FY 2028."
9,Raj Jensen predicts that the earnings before interest and taxes (EBIT) at 3M (MMM) will decrease by 90% to $10 million in FY 2028.


In [22]:
# Vectorize the hand-written sentences and classify them with the perceptron.
# NOTE(review): a *new* TfidfFeatureExtraction is built from the custom
# sentences here. If word_feature_extraction fits a fresh vocabulary on them
# (likely — TODO confirm), the resulting columns do not correspond to the terms
# the models were trained on and the predictions below are not meaningful.
# The extractor fitted on the training data (tf_idf_feature_extractor) should
# presumably be used to transform these sentences instead.
# NOTE(review): only the perceptron is applied here, although the SGD
# classifier scored higher in the evaluation section.
inference_tf_idf_feature_extractor = TfidfFeatureExtraction(custom_sentences_df, 'Base Sentence')
inference_tfidf_vectorized_features = inference_tf_idf_feature_extractor.word_feature_extraction(max_features)
custom_sentences_predictions = perception_model.predict(inference_tfidf_vectorized_features)
DataProcessing.join_predictions_with_sentences(custom_sentences_df, custom_sentences_predictions, perception_model)




Unnamed: 0,Base Sentence,Perceptron Prediction
0,Emily Chen forecasts that the net profit at Tesla (TSLA) will decrease by 15% to $5 billion in FY 2027.,1
1,Raj Patel speculates that the stock price of ExxonMobil (XOM) could rise by 8% to $120 by Q4 of 2026.,0
2,There is a high probability that the revenue at Microsoft (MSFT) will reach $200 billion in FY 2029.,0
3,"On Thursday, April 10, 2025, Sarah Lee envisions that the operating income at Apple (AAPL) will increase by 10% to $80 billion in FY 2026.",1
4,Michael Brown predicts that the dividend yield at Chevron (CVX) will rise to 5% by Q3 of 2027.,0
5,It is anticipated that the market share of Alphabet (GOOGL) will grow by 3% in FY 2028.,1
6,"Hey, how are you?",0
7,"Joe Hall thinks that the earnings before interest and taxes (EBIT) at 3M (MMM) will decrease by 90% to $10 million in FY 2028, which Raj stated on Monday, December 16, 2024.",1
8,"Malique Mell, on Monday, December 16, 2024, predicted that the earnings before interest and taxes (EBIT) at 3M (MMM) will drop by 90%, reaching $10 million in FY 2028.",1
9,Raj Jensen predicts that the earnings before interest and taxes (EBIT) at 3M (MMM) will decrease by 90% to $10 million in FY 2028.,1
