# Prediction Observation Classification Pipeline

- **Goal:** Prediction Classification

In [1]:
import os
import sys

import importlib.util

import pandas as pd


from pathlib import Path
from IPython.display import Image

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../'))

import log_files
from data_processing import DataProcessing
from feature_extraction import TfidfFeatureExtraction, SpacyFeatureExtraction

# Absolute path to your local classification_models.py file
project_root = "/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions"
module_path = os.path.join(project_root, "classification_models.py")

# Dynamically load the module so it overrides any installed package
spec = importlib.util.spec_from_file_location("classification_models", module_path)
classification_models = importlib.util.module_from_spec(spec)
spec.loader.exec_module(classification_models)

# Inject into sys.modules so pickle uses this
sys.modules["classification_models"] = classification_models

# ✅ Now you can use the classes
perception_model = classification_models.SkLearnPerceptronModel()
sgd_model = classification_models.SkLearnSGDClassifier()
EvaluationMetric = classification_models.EvaluationMetric
from classification_models import SkLearnPerceptronModel, SkLearnSGDClassifier, EvaluationMetric

In [2]:
# Image(filename='../misc/base_pipeline.png')

## 1-Data Acquisition

In [3]:
pd.set_option('max_colwidth', 800)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
log_file_path = "../data/prediction_logs"
predictions = True
predictions_df = log_files.read_data(notebook_dir, log_file_path, predictions)
predictions_df.head(7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/predictions/pipelines/../../data/prediction_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/predictions/pipelines/../../data/prediction_logs/batch_1-prediction
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/predictions/pipelines/../../data/prediction_logs/batch_1-prediction/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/predictions/pipelines/../../data/prediction_logs/batch_2-prediction
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/predictions/pipelines/../../data/prediction_logs/batch_2-prediction/batch_2-from_df.csv
save_batch_directory: /Users/detraviou

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3
3,"According to Goldman Sachs, the research and development expenses at Facebook would fall in 2025.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,4
4,"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,5
5,"The stock price at Visa should stay same in Q2 of 2026, according to Wells Fargo.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,6
6,JPMorgan forecasts that the revenue at Microsoft potentially decrease in Q3 of 2027.,1,finance,llama-3.3-70b-instruct,NAVI_GATOR,0,1


In [5]:
log_file_path = "../data/observation_logs"
predictions = False
observations_df = log_files.read_data(notebook_dir, log_file_path, predictions)
observations_df.head(7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/predictions/pipelines/../../data/observation_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/predictions/pipelines/../../data/observation_logs/batch_1-observation
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/predictions/pipelines/../../data/observation_logs/batch_1-observation/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/predictions/pipelines/../../data/observation_logs/batch_2-observation
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/predictions/pipelines/../../data/observation_logs/batch_2-observation/batch_2-from_df.csv
save_batch_directory: /Users/

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,The financial analyst at Goldman Sachs observed that the operating income at Tesla had increased in the first quarter of 2024.,0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On 2024-08-20 to 2025-08-20, Morgan Stanley speculates the stock price at Amazon will likely rise.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"A young investor predicts on 2025-03-15, the S&P 500 index may rise.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3
3,"According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,4
4,"In 2027-01-01 to 2027-12-31, Wells Fargo envisions that the interest rates at the Federal Reserve have some probability to remain stable.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,5
5,"The trading volume at Apple should stay same in the fourth quarter of 2025, according to a financial expert at JPMorgan Chase.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,6
6,JPMorgan observed that the net profit at Microsoft had risen in September 2023.,0,finance,llama-3.3-70b-instruct,NAVI_GATOR,0,1


## 2-Data Processing

In [6]:
pred_obs_dfs = [predictions_df, observations_df]
base_df = DataProcessing.concat_dfs(pred_obs_dfs)
shuffled_base_df = DataProcessing.shuffle_df(base_df)
shuffled_base_df.head(7)

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,JPMorgan observed that the net profit at Microsoft had risen in September 2023.,0,finance,llama-3.3-70b-instruct,NAVI_GATOR,0,1
1,"According to Professor Lisa Nguyen, the frequency of software updates at Microsoft would fall in the first quarter of 2023.",0,miscellaneous,llama-3.1-70b-instruct,NAVI_GATOR,0,4
2,Meteorologist Tom Harris observed that the air pressure at Denver had increased on 2024-08-20.,0,weather,llama-3.1-70b-instruct,NAVI_GATOR,0,1
3,Dr. Amelia Johnson observed that the obesity rates at the local high schools had decreased significantly from 2025 to 2027.,0,health,mistral-7b-instruct,NAVI_GATOR,0,1
4,"College student, David Lee, noted on 08/15/2023, that the tuition fees at Stanford University fell.",0,miscellaneous,llama-3.3-70b-versatile,GROQ_CLOUD,0,3
5,"In 21 August 2024, Wells Fargo envisions that the gross profit at Intel has some probability to remain stable.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,5
6,"According to Dr. David Kim, the regulatory compliance at financial institutions would fall in 2026-03-01.",1,policy,llama-3.3-70b-versatile,GROQ_CLOUD,0,4


## 3-Feature Extraction

### TF x IDF

In [7]:
max_features = 117

tf_idf_feature_extractor = TfidfFeatureExtraction(shuffled_base_df, 'Base Sentence')
tfidf_vectorized_features = tf_idf_feature_extractor.word_feature_extraction(max_features)
tfidf_vectorized_features_df = tf_idf_feature_extractor.feature_scores(max_features)
tfidf_vectorized_features_df.head(3)

Unnamed: 0,Base Sentence,Sentence Label,01,08,09,10,12,15,20,2022,2023,2024,2025,2026,2027,2028,2029,21,according,advisor,amazon,analyst,and,angeles,at,august,average,brown,by,changed,chen,chicago,coach,college,david,davis,decrease,decreased,dr,emily,envisioned,envisions,expert,fall,fell,financial,for,forecasts,from,had,has,health,in,increase,increased,james,january,johnson,kim,lee,level,levels,likely,los,may,meteorologist,michael,monitored,national,new,noted,number,obesity,observed,of,on,operating,patel,percentage,policy,predicts,price,probability,professor,profit,q2,q3,q4,quarter,rachel,rate,rates,remain,remained,research,revenue,rise,rose,same,schools,senior,should,some,speculates,stable,stay,stock,temperature,that,the,to,university,urban,weather,will,wind,world,would,york
0,JPMorgan observed that the net profit at Microsoft had risen in September 2023.,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.421643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.185613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.398319,0.0,0.0,0.188689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.429208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.577173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.237504,0.140521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"According to Professor Lisa Nguyen, the frequency of software updates at Microsoft would fall in the first quarter of 2023.",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.312163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.20896,0.0,0.0,0.0,0.0,0.0,0.137418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.341128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.139696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.391216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.438556,0.0,0.0,0.0,0.0,0.418412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.208068,0.176478,0.0,0.0,0.0,0.0,0.0,0.0,0.33051,0.0
2,Meteorologist Tom Harris observed that the air pressure at Denver had increased on 2024-08-20.,0,0.0,0.329856,0.0,0.0,0.0,0.0,0.435108,0.0,0.0,0.231221,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.318632,0.0,0.0,0.0,0.0,0.36385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.428576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.343341,0.0,0.205335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.189989,0.112408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Models

1. Perceptron

---

- Split: 80% train and 20% test

In [8]:
X_train, X_test, y_train, y_test = DataProcessing.split_data(tfidf_vectorized_features_df, tfidf_vectorized_features_df['Sentence Label'])
X_train.head(3)

Unnamed: 0,Base Sentence,Sentence Label,01,08,09,10,12,15,20,2022,2023,2024,2025,2026,2027,2028,2029,21,according,advisor,amazon,analyst,and,angeles,at,august,average,brown,by,changed,chen,chicago,coach,college,david,davis,decrease,decreased,dr,emily,envisioned,envisions,expert,fall,fell,financial,for,forecasts,from,had,has,health,in,increase,increased,james,january,johnson,kim,lee,level,levels,likely,los,may,meteorologist,michael,monitored,national,new,noted,number,obesity,observed,of,on,operating,patel,percentage,policy,predicts,price,probability,professor,profit,q2,q3,q4,quarter,rachel,rate,rates,remain,remained,research,revenue,rise,rose,same,schools,senior,should,some,speculates,stable,stay,stock,temperature,that,the,to,university,urban,weather,will,wind,world,would,york
240,"Analyst David Lee predicts on 08/21/2024, the touchdown rate at the Seattle Seahawks may rise.",1,0.0,0.285903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.200411,0.0,0.0,0.0,0.0,0.0,0.265875,0.0,0.0,0.0,0.279192,0.0,0.0,0.128695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.355517,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.331195,0.0,0.0,0.0,0.0,0.314866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177975,0.0,0.0,0.0,0.0,0.337341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.306284,0.0,0.0,0.0,0.0,0.0,0.316885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1670,"The passing yards at the New England Patriots should stay consistent in the second half of 2024, according to Coach Brian Kelly.",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.230159,0.0,0.0,0.0,0.0,0.0,0.0,0.224743,0.0,0.0,0.0,0.0,0.0,0.147798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.43311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.404801,0.0,0.0,0.0,0.0,0.210382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.399773,0.0,0.0,0.0,0.384536,0.0,0.0,0.0,0.335675,0.189807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
692,"According to the study conducted by the National Institutes of Health (NIH), the average life expectancy in developed countries rose from 2023 to 2025.",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.255931,0.0,0.256968,0.0,0.0,0.0,0.0,0.0,0.171319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.312596,0.0,0.367226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.348184,0.0,0.0,0.316106,0.114531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.318286,0.0,0.0,0.0,0.0,0.0,0.160372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.309894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.255882,0.289376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
def split_sentence_label_features(df: pd.DataFrame) -> tuple:
    sentences = df['Base Sentence']
    prediction_labels = df['Sentence Label']
    features_df = df.iloc[:, 2:]
    return sentences, prediction_labels, features_df

X_train_sentences, y_train_prediction_labels, X_train_features_df = split_sentence_label_features(X_train)

In [10]:
X_test_sentences, y_test_prediction_labels, X_test_features_df = split_sentence_label_features(X_test)
X_test_sentences

1394                                                                                                                           According to the CDC, the obesity rates in adolescents would fall in Spring 2027.
353                                                                              According to Professor David Kim, the  adoption rate of artificial intelligence in the healthcare sector would fall in Q4 2027.
1334                                                                                              Coach Rachel Thompson observed that the turnover rate at the Golden State Warriors remained stable in 2024 Q2.
906                                                                                                         Noticing a trend, JP Morgan analysts observed that the net profit at Tesla had decreased in Q2 2023.
1290                                                        The Centers for Disease Control and Prevention's health screening participation should stay the same in 

In [11]:
y_test_prediction_labels

1394    1
353     1
1334    0
906     0
1290    1
1274    0
939     0
1733    0
65      0
1047    0
56      0
1033    0
1467    0
584     1
374     1
275     0
746     1
128     1
1444    0
1514    0
674     0
1328    0
1261    0
1084    0
1775    0
99      0
1102    0
965     0
792     1
29      1
628     1
1307    0
572     0
1220    0
1769    0
450     0
1610    0
1406    0
1206    0
254     0
124     0
1930    0
251     1
585     0
1268    0
1384    0
507     0
1969    0
70      0
1022    0
1745    0
247     0
1110    0
212     0
949     1
1868    0
968     0
1860    0
1662    0
361     0
1151    1
678     0
1385    1
1090    0
1990    1
1850    0
836     1
1855    0
111     1
936     0
838     0
383     0
1582    0
1055    0
755     1
324     0
1473    0
787     1
368     0
1822    0
69      0
1330    0
1817    0
1556    0
1693    0
1475    1
1801    0
478     0
281     0
1649    0
1347    0
855     0
210     0
297     0
1887    1
1571    0
651     0
1234    0
993     0
1934    1


In [12]:
# perception_model = SkLearnPerceptronModel()
perception_model.train_model(X_train_features_df, y_train_prediction_labels)
perceptron_predictions = perception_model.predict(X_test_features_df)
perceptron_predictions.to_numpy().ravel()
perceptron_predictions

  ret = a @ b
  ret = a @ b
  ret = a @ b


0      1
1      0
2      0
3      0
4      1
5      1
6      0
7      0
8      0
9      0
10     1
11     0
12     0
13     1
14     1
15     0
16     1
17     1
18     0
19     1
20     0
21     1
22     0
23     0
24     0
25     0
26     0
27     0
28     1
29     0
30     1
31     0
32     0
33     0
34     1
35     0
36     0
37     0
38     0
39     0
40     0
41     0
42     1
43     1
44     0
45     0
46     0
47     1
48     0
49     0
50     0
51     0
52     0
53     0
54     1
55     0
56     0
57     0
58     1
59     0
60     1
61     0
62     0
63     1
64     1
65     1
66     1
67     0
68     1
69     0
70     0
71     0
72     0
73     1
74     1
75     0
76     0
77     1
78     0
79     0
80     0
81     0
82     0
83     0
84     0
85     0
86     0
87     0
88     0
89     0
90     0
91     0
92     0
93     0
94     0
95     0
96     0
97     0
98     0
99     1
100    0
101    0
102    1
103    1
104    0
105    0
106    0
107    1
108    0
109    0
110    1
1

In [13]:
# sgd_model = SkLearnSGDClassifier()
sgd_model.train_model(X_train_features_df, y_train_prediction_labels)
sgd_predictions = sgd_model.predict(X_test_features_df)
sgd_predictions.to_numpy().ravel()
sgd_predictions

  ret = a @ b
  ret = a @ b
  ret = a @ b


0      1
1      1
2      0
3      0
4      1
5      1
6      0
7      0
8      0
9      0
10     1
11     0
12     0
13     1
14     1
15     0
16     1
17     1
18     0
19     1
20     0
21     1
22     0
23     0
24     0
25     0
26     0
27     0
28     1
29     0
30     1
31     0
32     0
33     0
34     1
35     0
36     0
37     0
38     0
39     0
40     0
41     0
42     1
43     1
44     0
45     0
46     0
47     1
48     0
49     0
50     0
51     0
52     0
53     0
54     1
55     0
56     0
57     0
58     1
59     0
60     1
61     0
62     1
63     1
64     1
65     1
66     1
67     0
68     1
69     0
70     0
71     0
72     0
73     1
74     1
75     0
76     0
77     1
78     0
79     0
80     0
81     0
82     0
83     0
84     0
85     0
86     0
87     0
88     0
89     0
90     0
91     0
92     0
93     0
94     1
95     0
96     0
97     0
98     0
99     1
100    0
101    0
102    1
103    1
104    0
105    0
106    0
107    1
108    0
109    0
110    1
1

In [14]:
model_predictions_df = pd.concat([X_test_sentences, y_test_prediction_labels], axis=1)
model_predictions_df.columns = ['Sentence', 'Actual Label']

model_predictions_df

Unnamed: 0,Sentence,Actual Label
1394,"According to the CDC, the obesity rates in adolescents would fall in Spring 2027.",1
353,"According to Professor David Kim, the adoption rate of artificial intelligence in the healthcare sector would fall in Q4 2027.",1
1334,Coach Rachel Thompson observed that the turnover rate at the Golden State Warriors remained stable in 2024 Q2.,0
906,"Noticing a trend, JP Morgan analysts observed that the net profit at Tesla had decreased in Q2 2023.",0
1290,"The Centers for Disease Control and Prevention's health screening participation should stay the same in late 2025, according to a report by the CDC.",1
1274,"According to top executive, Ms. Sophia Rodriguez, the employee retention rate at Microsoft would decrease in January 2021.",0
939,"The financial advisor at Wealth Management Solutions predicted on 09/2022, the stock prices for GreenTech Industries had risen.",0
1733,"In the fourth quarter of 2023, Meteorologist Michael Brown envisioned that the wind gusts at Chicago decreased.",0
65,"According to Senior Level Person Michael Brown, the employee satisfaction rate at Amazon dropped in Q2 of 2023.",0
1047,"On August 21, 2024, Dr. Richard Davis, a renowned weather expert, monitored the temperature at New York City changed.",0


In [15]:
model_predictions_df['Perceptron Predicted Label'] = perceptron_predictions.to_numpy().ravel()
model_predictions_df['SGD Predicted Label'] = sgd_predictions.to_numpy().ravel()
model_predictions_df

Unnamed: 0,Sentence,Actual Label,Perceptron Predicted Label,SGD Predicted Label
1394,"According to the CDC, the obesity rates in adolescents would fall in Spring 2027.",1,1,1
353,"According to Professor David Kim, the adoption rate of artificial intelligence in the healthcare sector would fall in Q4 2027.",1,0,1
1334,Coach Rachel Thompson observed that the turnover rate at the Golden State Warriors remained stable in 2024 Q2.,0,0,0
906,"Noticing a trend, JP Morgan analysts observed that the net profit at Tesla had decreased in Q2 2023.",0,0,0
1290,"The Centers for Disease Control and Prevention's health screening participation should stay the same in late 2025, according to a report by the CDC.",1,1,1
1274,"According to top executive, Ms. Sophia Rodriguez, the employee retention rate at Microsoft would decrease in January 2021.",0,1,1
939,"The financial advisor at Wealth Management Solutions predicted on 09/2022, the stock prices for GreenTech Industries had risen.",0,0,0
1733,"In the fourth quarter of 2023, Meteorologist Michael Brown envisioned that the wind gusts at Chicago decreased.",0,0,0
65,"According to Senior Level Person Michael Brown, the employee satisfaction rate at Amazon dropped in Q2 of 2023.",0,0,0
1047,"On August 21, 2024, Dr. Richard Davis, a renowned weather expert, monitored the temperature at New York City changed.",0,0,0


## Evaluation

In [16]:
get_metrics = EvaluationMetric()

In [17]:
metrics = get_metrics.eval_classification_report(y_test_prediction_labels, perceptron_predictions)
metrics

              precision    recall  f1-score   support

           0       0.96      0.87      0.91       316
           1       0.65      0.86      0.74        85

    accuracy                           0.87       401
   macro avg       0.80      0.87      0.83       401
weighted avg       0.89      0.87      0.88       401



In [18]:
metrics = get_metrics.eval_classification_report(y_test_prediction_labels, sgd_predictions)
metrics

              precision    recall  f1-score   support

           0       0.99      0.86      0.92       316
           1       0.66      0.96      0.78        85

    accuracy                           0.89       401
   macro avg       0.82      0.91      0.85       401
weighted avg       0.92      0.89      0.89       401



In [19]:
import pickle
base_dir = os.path.join(notebook_dir, '../models')
sgd_path = os.path.join(base_dir, 'sgd_model_117.pkl')
with open(sgd_path, 'wb') as file:
    pickle.dump(sgd_model, file)

sgd_path

FileNotFoundError: [Errno 2] No such file or directory: '/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/scratch-predictions/predictions/pipelines/../models/sgd_model_117.pkl'

## Further Exploration

In [None]:
custom_sentences = [
    "Emily Chen forecasts that the net profit at Tesla (TSLA) will decrease by 15% to $5 billion in FY 2027.",
    "Raj Patel speculates that the stock price of ExxonMobil (XOM) could rise by 8% to $120 by Q4 of 2026.",
    "There is a high probability that the revenue at Microsoft (MSFT) will reach $200 billion in FY 2029.",
    "On Thursday, April 10, 2025, Sarah Lee envisions that the operating income at Apple (AAPL) will increase by 10% to $80 billion in FY 2026.",
    "Michael Brown predicts that the dividend yield at Chevron (CVX) will rise to 5% by Q3 of 2027.",
    "It is anticipated that the market share of Alphabet (GOOGL) will grow by 3% in FY 2028.",
    "Hey, how are you?",
    "Joe Hall thinks that the earnings before interest and taxes (EBIT) at 3M (MMM) will decrease by 90% to $10 million in FY 2028, which Raj stated on Monday, December 16, 2024.",
    "Malique Mell, on Monday, December 16, 2024, predicted that the earnings before interest and taxes (EBIT) at 3M (MMM) will drop by 90%, reaching $10 million in FY 2028.",
    "Raj Jensen predicts that the earnings before interest and taxes (EBIT) at 3M (MMM) will decrease by 90% to $10 million in FY 2028.",
    "The weather today is sunny with a chance of rain in the evening.",
    "I enjoy reading books and watching movies during my free time."
]

data = {"Base Sentence": custom_sentences}
custom_sentences_df = pd.DataFrame(data)
custom_sentences_df

In [None]:
inference_tf_idf_feature_extractor = TfidfFeatureExtraction(custom_sentences_df, 'Base Sentence')
inference_tfidf_vectorized_features = inference_tf_idf_feature_extractor.word_feature_extraction(max_features)
custom_sentences_predictions = perception_model.predict(inference_tfidf_vectorized_features)
DataProcessing.join_predictions_with_sentences(custom_sentences_df, custom_sentences_predictions, perception_model)
