# Prediction Observation Classification Pipeline

- **Goal:** Prediction Classification

In [1]:
import os
import sys

import importlib.util

import pandas as pd


from pathlib import Path
from IPython.display import Image

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../'))

import log_files
from data_processing import DataProcessing
from feature_extraction import TfidfFeatureExtraction, SpacyFeatureExtraction

# Absolute path to your local classification_models.py file
project_root = "/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions"
module_path = os.path.join(project_root, "classification_models.py")

# Dynamically load the module so it overrides any installed package
spec = importlib.util.spec_from_file_location("classification_models", module_path)
classification_models = importlib.util.module_from_spec(spec)
spec.loader.exec_module(classification_models)

# Inject into sys.modules so pickle uses this
sys.modules["classification_models"] = classification_models

# ✅ Now you can use the classes
perception_model = classification_models.SkLearnPerceptronModel()
sgd_model = classification_models.SkLearnSGDClassifier()
EvaluationMetric = classification_models.EvaluationMetric
from classification_models import SkLearnPerceptronModel, SkLearnSGDClassifier, EvaluationMetric

In [2]:
# Image(filename='../misc/base_pipeline.png')

## 1-Data Acquisition

In [3]:
pd.set_option('max_colwidth', 800)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
log_file_path = "data/prediction_logs"
predictions = True
predictions_df = log_files.read_data(notebook_dir, log_file_path, predictions)
predictions_df.head(7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/prediction_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/prediction_logs/batch_1-prediction
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/prediction_logs/batch_1-prediction/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/prediction_logs/batch_2-prediction
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/prediction_logs/batch_2-prediction/batch_2-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/prediction_logs/batch_3-pre

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,JPMorgan Chase forecasts that the net profit at Amazon potentially decrease in Q3 of 2027.,1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On August 21, 2024, Bank of America speculates the revenue at Microsoft will likely increase.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"Citigroup predicts on 2024-08-21, the operating income at Alphabet may rise.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3
3,"According to Goldman Sachs, the research and development expenses at Facebook would fall in 2025.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,4
4,"In 21 August 2024, Morgan Stanley envisions that the gross profit at Johnson & Johnson has some probability to remain stable.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,5
5,"The stock price at Visa should stay same in Q2 of 2026, according to Wells Fargo.",1,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,6
6,JPMorgan forecasts that the revenue at Microsoft potentially decrease in Q3 of 2027.,1,finance,llama-3.3-70b-instruct,NAVI_GATOR,0,1


In [5]:
log_file_path = "data/observation_logs"
predictions = False
observations_df = log_files.read_data(notebook_dir, log_file_path, predictions)
observations_df.head(7)

Start logging batch
log_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/observation_logs
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/observation_logs/batch_1-observation
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/observation_logs/batch_1-observation/batch_1-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/observation_logs/batch_2-observation
CSV to DF
Load saved csv: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/observation_logs/batch_2-observation/batch_2-from_df.csv
save_batch_directory: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../data/observation_logs/b

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,The financial analyst at Goldman Sachs observed that the operating income at Tesla had increased in the first quarter of 2024.,0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,1
1,"On 2024-08-20 to 2025-08-20, Morgan Stanley speculates the stock price at Amazon will likely rise.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,2
2,"A young investor predicts on 2025-03-15, the S&P 500 index may rise.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,3
3,"According to Bank of America, the net profit at Microsoft would fall in the second quarter of 2026.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,4
4,"In 2027-01-01 to 2027-12-31, Wells Fargo envisions that the interest rates at the Federal Reserve have some probability to remain stable.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,5
5,"The trading volume at Apple should stay same in the fourth quarter of 2025, according to a financial expert at JPMorgan Chase.",0,finance,llama-3.1-70b-instruct,NAVI_GATOR,0,6
6,JPMorgan observed that the net profit at Microsoft had risen in September 2023.,0,finance,llama-3.3-70b-instruct,NAVI_GATOR,0,1


## 2-Data Processing

In [6]:
pred_obs_dfs = [predictions_df, observations_df]
base_df = DataProcessing.concat_dfs(pred_obs_dfs)
shuffled_base_df = DataProcessing.shuffle_df(base_df)
shuffled_base_df.head(7)

Unnamed: 0,Base Sentence,Sentence Label,Domain,Model Name,API Name,Batch ID,Template Number
0,"Dr. Lee predicts on 2029-03-21, the diabetes prevalence at regional hospitals may rise.",0,health,llama-3.1-8b-instruct,NAVI_GATOR,0,3
1,Coach Thompson observed that the free throw percentage at the Chicago Bulls decreased in 2024-08-21.,0,sport,llama-3.1-8b-instruct,NAVI_GATOR,0,1
2,"Research Advisor Michael Chen noted on January 10, 2022, the temperature at the University of California fell.",0,weather,llama-3.3-70b-instruct,NAVI_GATOR,0,3
3,"The voter turnout at local elections should stay the same in November 2025, according to research advisor Emily Chen.",1,policy,llama-3.1-70b-instruct,NAVI_GATOR,0,6
4,"According to sports analyst Sarah Johnson, the scoring average at the Los Angeles Lakers would fall in 2025 Q3.",1,sport,mixtral-8x7b-instruct,NAVI_GATOR,0,4
5,"On 11th of October 2021, Dr. Rachel Kim monitored that the obesity rates at rural high schools changed.",0,health,llama-3.3-70b-instruct,NAVI_GATOR,0,2
6,Dr. Lee predicted on 15th of March 2029 that the number of patients with chronic diseases at urban hospitals may decline.,0,health,llama-3.3-70b-instruct,NAVI_GATOR,0,3


## 3-Feature Extraction

### TF x IDF

In [7]:
max_features = 117

tf_idf_feature_extractor = TfidfFeatureExtraction(shuffled_base_df, 'Base Sentence')
tfidf_vectorized_features = tf_idf_feature_extractor.word_feature_extraction(max_features)
tfidf_vectorized_features_df = tf_idf_feature_extractor.feature_scores(max_features)
tfidf_vectorized_features_df.head(3)

Unnamed: 0,Base Sentence,Sentence Label,01,08,09,10,12,15,20,2022,2023,2024,2025,2026,2027,2028,2029,21,according,advisor,amazon,analyst,and,angeles,at,august,average,brown,by,changed,chen,chicago,coach,college,david,davis,decrease,decreased,dr,emily,envisioned,envisions,expert,fall,fell,financial,for,forecasts,from,had,has,health,in,increase,increased,james,january,johnson,kim,lee,level,levels,likely,los,may,meteorologist,michael,monitored,national,new,noted,number,obesity,observed,of,on,operating,patel,percentage,policy,predicts,price,probability,professor,profit,q2,q3,q4,quarter,rachel,rate,rates,remain,remained,research,revenue,rise,rose,same,schools,senior,should,some,speculates,stable,stay,stock,temperature,that,the,to,university,urban,weather,will,wind,world,would,york
0,"Dr. Lee predicts on 2029-03-21, the diabetes prevalence at regional hospitals may rise.",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.397178,0.310064,0.0,0.0,0.0,0.0,0.0,0.0,0.150085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.303706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.386242,0.0,0.0,0.0,0.0,0.367199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.207555,0.0,0.0,0.0,0.0,0.393409,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.113623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Coach Thompson observed that the free throw percentage at the Chicago Bulls decreased in 2024-08-21.,0,0.0,0.296112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.207567,0.0,0.0,0.0,0.0,0.0,0.275368,0.0,0.0,0.0,0.0,0.0,0.0,0.13329,0.0,0.0,0.0,0.0,0.0,0.0,0.417085,0.390596,0.0,0.0,0.0,0.0,0.313518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.135499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.308217,0.0,0.0,0.0,0.0,0.409446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170553,0.201817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Research Advisor Michael Chen noted on January 10, 2022, the temperature at the University of California fell.",0,0.0,0.0,0.0,0.283257,0.0,0.0,0.0,0.256557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.275913,0.0,0.0,0.0,0.0,0.103857,0.0,0.0,0.0,0.0,0.0,0.296879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.280347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.329233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.305135,0.0,0.0,0.0,0.252504,0.0,0.0,0.0,0.147836,0.143626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.268731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.30921,0.0,0.157253,0.0,0.316225,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Models

1. Perceptron

---

- Split: 80% train and 20% test

In [8]:
X_train, X_test, y_train, y_test = DataProcessing.split_data(tfidf_vectorized_features_df, tfidf_vectorized_features_df['Sentence Label'])
X_train.head(3)

Unnamed: 0,Base Sentence,Sentence Label,01,08,09,10,12,15,20,2022,2023,2024,2025,2026,2027,2028,2029,21,according,advisor,amazon,analyst,and,angeles,at,august,average,brown,by,changed,chen,chicago,coach,college,david,davis,decrease,decreased,dr,emily,envisioned,envisions,expert,fall,fell,financial,for,forecasts,from,had,has,health,in,increase,increased,james,january,johnson,kim,lee,level,levels,likely,los,may,meteorologist,michael,monitored,national,new,noted,number,obesity,observed,of,on,operating,patel,percentage,policy,predicts,price,probability,professor,profit,q2,q3,q4,quarter,rachel,rate,rates,remain,remained,research,revenue,rise,rose,same,schools,senior,should,some,speculates,stable,stay,stock,temperature,that,the,to,university,urban,weather,will,wind,world,would,york
240,JPMorgan Chase observed that the net profit at Amazon had remained stable in Q2 2026.,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.346006,0.0,0.0,0.0,0.0,0.0,0.0,0.403406,0.0,0.0,0.0,0.129731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.278399,0.0,0.0,0.131881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.299988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.403406,0.328616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.368213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.268914,0.0,0.0,0.0,0.165999,0.098215,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1670,"On August 21, 2024, Meteorologist Alex Thompson speculates the temperature at New York City will likely increase.",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.179713,0.0,0.0,0.0,0.0,0.0,0.238416,0.0,0.0,0.0,0.0,0.0,0.0,0.115404,0.277986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.29385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.283702,0.0,0.0,0.333105,0.0,0.0,0.0,0.316077,0.0,0.0,0.0,0.0,0.0,0.159594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.307188,0.0,0.0,0.0,0.343586,0.0,0.087368,0.0,0.0,0.0,0.0,0.264292,0.0,0.0,0.0,0.356649
692,"In June 2026, Student Sarah Jones envisions that the popularity of online learning will have some probability to remain stable.",1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.384069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.371259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.20498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.349019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.352319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.356888,0.0,0.298497,0.0,0.0,0.0,0.184261,0.109019,0.184934,0.0,0.0,0.0,0.329788,0.0,0.0,0.0,0.0


In [9]:
def split_sentence_label_features(df: pd.DataFrame) -> tuple:
    sentences = df['Base Sentence']
    prediction_labels = df['Sentence Label']
    features_df = df.iloc[:, 2:]
    return sentences, prediction_labels, features_df

X_train_sentences, y_train_prediction_labels, X_train_features_df = split_sentence_label_features(X_train)

In [10]:
X_test_sentences, y_test_prediction_labels, X_test_features_df = split_sentence_label_features(X_test)
X_test_sentences

1394                                                                               In Q2 of 2028, the American Heart Association predicts that the blood pressure at community health clinics will fall.
353                                                   According to a study conducted by the University of California, the average physical activity levels at suburban high schools rose in Spring 2028.
1334                                                                     According to the National Institute of Health, the prevalence of asthma in children at public schools in Texas dropped in 2023.
906                                                                      The daily caloric intake at fast-food chains increased on 09/10/2029, according to a study by the Food and Drug Administration.
1290                                                                                     The World Health Organization noted on 25th of June 2025 that the life expectancy at developing countries f

In [11]:
y_test_prediction_labels

1394    0
353     0
1334    0
906     0
1290    0
1274    1
939     0
1733    0
65      0
1047    1
56      0
1033    0
1467    0
584     1
374     0
275     1
746     1
128     0
1444    0
1514    1
674     1
1328    0
1261    0
1084    1
1775    0
99      0
1102    0
965     0
792     0
29      0
628     0
1307    0
572     0
1220    0
1769    0
450     0
1610    0
1406    0
1206    1
254     0
124     0
1930    0
251     0
585     0
1268    0
1384    0
507     0
1969    0
70      0
1022    0
1745    0
247     0
1110    0
212     0
949     0
1868    0
968     0
1860    0
1662    0
361     0
1151    1
678     0
1385    0
1090    0
1990    0
1850    0
836     0
1855    0
111     0
936     0
838     0
383     0
1582    0
1055    0
755     0
324     0
1473    0
787     0
368     1
1822    1
69      0
1330    1
1817    0
1556    0
1693    0
1475    0
1801    1
478     0
281     1
1649    0
1347    1
855     1
210     0
297     0
1887    1
1571    1
651     0
1234    1
993     0
1934    0


In [12]:
# perception_model = SkLearnPerceptronModel()
perception_model.train_model(X_train_features_df, y_train_prediction_labels)
perceptron_predictions = perception_model.predict(X_test_features_df)
perceptron_predictions.to_numpy().ravel()
perceptron_predictions

  ret = a @ b
  ret = a @ b
  ret = a @ b


0      1
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     1
14     0
15     0
16     0
17     0
18     0
19     0
20     1
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
30     0
31     0
32     0
33     0
34     0
35     0
36     0
37     0
38     0
39     0
40     0
41     0
42     0
43     0
44     0
45     0
46     0
47     0
48     0
49     0
50     0
51     0
52     0
53     0
54     0
55     0
56     0
57     0
58     0
59     0
60     0
61     0
62     0
63     0
64     0
65     0
66     0
67     0
68     0
69     0
70     0
71     0
72     0
73     0
74     0
75     0
76     0
77     0
78     0
79     0
80     0
81     1
82     0
83     0
84     0
85     0
86     1
87     0
88     0
89     0
90     0
91     1
92     0
93     0
94     0
95     0
96     0
97     1
98     0
99     0
100    0
101    0
102    0
103    0
104    0
105    0
106    0
107    0
108    0
109    0
110    0
1

In [13]:
# sgd_model = SkLearnSGDClassifier()
sgd_model.train_model(X_train_features_df, y_train_prediction_labels)
sgd_predictions = sgd_model.predict(X_test_features_df)
sgd_predictions.to_numpy().ravel()
sgd_predictions

  ret = a @ b
  ret = a @ b
  ret = a @ b


0      1
1      0
2      0
3      0
4      0
5      0
6      0
7      1
8      0
9      1
10     0
11     1
12     0
13     1
14     1
15     1
16     1
17     0
18     0
19     1
20     1
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
30     0
31     0
32     0
33     0
34     0
35     1
36     0
37     0
38     1
39     0
40     1
41     0
42     0
43     1
44     0
45     0
46     0
47     1
48     0
49     0
50     0
51     0
52     0
53     0
54     0
55     0
56     0
57     0
58     0
59     0
60     0
61     0
62     0
63     1
64     0
65     0
66     0
67     0
68     0
69     0
70     0
71     0
72     0
73     0
74     0
75     0
76     0
77     0
78     0
79     1
80     0
81     1
82     0
83     0
84     0
85     0
86     1
87     0
88     0
89     0
90     1
91     1
92     0
93     0
94     1
95     1
96     0
97     1
98     0
99     0
100    0
101    0
102    0
103    0
104    0
105    1
106    0
107    0
108    0
109    1
110    0
1

In [14]:
model_predictions_df = pd.concat([X_test_sentences, y_test_prediction_labels], axis=1)
model_predictions_df.columns = ['Sentence', 'Actual Label']

model_predictions_df

Unnamed: 0,Sentence,Actual Label
1394,"In Q2 of 2028, the American Heart Association predicts that the blood pressure at community health clinics will fall.",0
353,"According to a study conducted by the University of California, the average physical activity levels at suburban high schools rose in Spring 2028.",0
1334,"According to the National Institute of Health, the prevalence of asthma in children at public schools in Texas dropped in 2023.",0
906,"The daily caloric intake at fast-food chains increased on 09/10/2029, according to a study by the Food and Drug Administration.",0
1290,The World Health Organization noted on 25th of June 2025 that the life expectancy at developing countries fell.,0
1274,"According to Economist Dr. Lisa Nguyen, the unemployment rate at the United States would fall in January 2026.",1
939,"On 2024-08-21, the team manager of the Golden State Warriors monitored the three-point percentage at the Los Angeles Lakers changed.",0
1733,"In August 21, 2024, Dr. Sarah Lee envisions that the cloud coverage at San Francisco has some probability to remain stable.",0
65,"In Q4 2024, the college student, Michael Brown, envisions that the net income at Apple has some probability to remain stable, as the company has a strong track record of consistent earnings.",0
1047,"According to a financial reporter, the operating income at Procter & Gamble would fall in 2024/08/21.",1


In [15]:
model_predictions_df['Perceptron Predicted Label'] = perceptron_predictions.to_numpy().ravel()
model_predictions_df['SGD Predicted Label'] = sgd_predictions.to_numpy().ravel()
model_predictions_df

Unnamed: 0,Sentence,Actual Label,Perceptron Predicted Label,SGD Predicted Label
1394,"In Q2 of 2028, the American Heart Association predicts that the blood pressure at community health clinics will fall.",0,1,1
353,"According to a study conducted by the University of California, the average physical activity levels at suburban high schools rose in Spring 2028.",0,0,0
1334,"According to the National Institute of Health, the prevalence of asthma in children at public schools in Texas dropped in 2023.",0,0,0
906,"The daily caloric intake at fast-food chains increased on 09/10/2029, according to a study by the Food and Drug Administration.",0,0,0
1290,The World Health Organization noted on 25th of June 2025 that the life expectancy at developing countries fell.,0,0,0
1274,"According to Economist Dr. Lisa Nguyen, the unemployment rate at the United States would fall in January 2026.",1,0,0
939,"On 2024-08-21, the team manager of the Golden State Warriors monitored the three-point percentage at the Los Angeles Lakers changed.",0,0,0
1733,"In August 21, 2024, Dr. Sarah Lee envisions that the cloud coverage at San Francisco has some probability to remain stable.",0,0,1
65,"In Q4 2024, the college student, Michael Brown, envisions that the net income at Apple has some probability to remain stable, as the company has a strong track record of consistent earnings.",0,0,0
1047,"According to a financial reporter, the operating income at Procter & Gamble would fall in 2024/08/21.",1,0,1


## Evaluation

In [16]:
get_metrics = EvaluationMetric()

In [17]:
metrics = get_metrics.eval_classification_report(y_test_prediction_labels, perceptron_predictions)
metrics

              precision    recall  f1-score   support

           0       0.81      0.99      0.89       300
           1       0.88      0.30      0.44       101

    accuracy                           0.81       401
   macro avg       0.84      0.64      0.67       401
weighted avg       0.83      0.81      0.78       401



In [18]:
metrics = get_metrics.eval_classification_report(y_test_prediction_labels, sgd_predictions)
metrics

              precision    recall  f1-score   support

           0       0.95      0.90      0.92       300
           1       0.74      0.85      0.79       101

    accuracy                           0.89       401
   macro avg       0.84      0.87      0.86       401
weighted avg       0.89      0.89      0.89       401



In [19]:
import pickle
base_dir = os.path.join(notebook_dir, '../models')
sgd_path = os.path.join(base_dir, 'sgd_model_117.pkl')
with open(sgd_path, 'wb') as file:
    pickle.dump(sgd_model, file)

sgd_path

FileNotFoundError: [Errno 2] No such file or directory: '/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/pipelines/../models/sgd_model_117.pkl'

## Further Exploration

In [None]:
custom_sentences = [
    "Emily Chen forecasts that the net profit at Tesla (TSLA) will decrease by 15% to $5 billion in FY 2027.",
    "Raj Patel speculates that the stock price of ExxonMobil (XOM) could rise by 8% to $120 by Q4 of 2026.",
    "There is a high probability that the revenue at Microsoft (MSFT) will reach $200 billion in FY 2029.",
    "On Thursday, April 10, 2025, Sarah Lee envisions that the operating income at Apple (AAPL) will increase by 10% to $80 billion in FY 2026.",
    "Michael Brown predicts that the dividend yield at Chevron (CVX) will rise to 5% by Q3 of 2027.",
    "It is anticipated that the market share of Alphabet (GOOGL) will grow by 3% in FY 2028.",
    "Hey, how are you?",
    "Joe Hall thinks that the earnings before interest and taxes (EBIT) at 3M (MMM) will decrease by 90% to $10 million in FY 2028, which Raj stated on Monday, December 16, 2024.",
    "Malique Mell, on Monday, December 16, 2024, predicted that the earnings before interest and taxes (EBIT) at 3M (MMM) will drop by 90%, reaching $10 million in FY 2028.",
    "Raj Jensen predicts that the earnings before interest and taxes (EBIT) at 3M (MMM) will decrease by 90% to $10 million in FY 2028.",
    "The weather today is sunny with a chance of rain in the evening.",
    "I enjoy reading books and watching movies during my free time."
]

data = {"Base Sentence": custom_sentences}
custom_sentences_df = pd.DataFrame(data)
custom_sentences_df

In [None]:
inference_tf_idf_feature_extractor = TfidfFeatureExtraction(custom_sentences_df, 'Base Sentence')
inference_tfidf_vectorized_features = inference_tf_idf_feature_extractor.word_feature_extraction(max_features)
custom_sentences_predictions = perception_model.predict(inference_tfidf_vectorized_features)
DataProcessing.join_predictions_with_sentences(custom_sentences_df, custom_sentences_predictions, perception_model)
