# ML Classification-Inference

- Datasets:
    1. chronicle2050
    2. sentiment140

In [1]:
import os
import sys
import joblib
import warnings

import pandas as pd

from tqdm import tqdm

notebook_dir = os.getcwd()

sys.path.append(os.path.join(notebook_dir, '../'))

from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction
from classification_models import SkLearnModelFactory

In [2]:
warnings.filterwarnings('ignore')

## Load Dataset(s)

In [3]:
base_path = DataProcessing.load_base_data_path(notebook_dir)

### Load chronicle2050

- CODE: [chronicle2050](https://github.com/regevson/chronicle2050/tree/master/dataset): 6,800 sentences from several datasets (Longbets, Horizons, New York Times, and ChatGPT).
- PAPER: [Future Timelines: Extraction and Visualization of Future-Related](https://dl.acm.org/doi/10.1145/3616855.3635693)

- Mappings:
    - 0s: Non-Future-Related
    - 1s: Future-Related

In [4]:
chronicle2050_path = os.path.join(base_path, 'chronicle2050', 'data.csv')
chronicle2050_df = DataProcessing.load_from_file(chronicle2050_path, sep=',')
chronicle2050_df

Unnamed: 0,index,sentence,label
0,0,"By January 1st, 2037, Tesla will have been the...",1
1,1,An annual average temperature anomaly value ab...,1
2,2,Private Nonfarm business productivity growth w...,1
3,3,No Republican will be President of the USA bef...,1
4,4,The market capitalization of Berkshire Hathawa...,1
...,...,...,...
6397,2540,Many major technology players are [TeleNav Inc...,0
6398,2541,"WaterIQ Technologies, the leader in next-gener...",0
6399,2542,The Business Research Company's 'Clean Coal Te...,0
6400,2543,'Prophecy Market Insights offers a 20% discoun...,0


### Load sentiment140

In [5]:
sentiment140_path = os.path.join(base_path, 'sentiment140', 'data.csv')
sentiment140_df = pd.read_csv(sentiment140_path, encoding = "ISO-8859-1", header=None)
# sentiment140_df = DataProcessing.load_from_file(sentiment140_path, sep=',')
sentiment140_df.columns = [
    "target",      # Sentiment label: 0 = negative, 4 = positive
    "id",          # Tweet ID
    "date",        # Date of the tweet
    "flag",        # Query (unused, often 'NO_QUERY')
    "user",        # Username of the tweet author
    "sentence"         # The tweet content
]
sentiment140_df

Unnamed: 0,target,id,date,flag,user,sentence
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


## Embed Sentences

In [6]:
sentiment140_df = sentiment140_df.loc[:len(chronicle2050_df), :]

In [7]:
spacy_fe = SpacyFeatureExtraction(chronicle2050_df, 'sentence')
# spacy_fe = SpacyFeatureExtraction(sentiment140_df, 'sentence')
embeddings_spacy_fe_df = spacy_fe.sentence_embeddings_extraction()
embeddings_spacy_fe_df.rename(columns={'sentence Embedding': 'sentence embedding'}, inplace=True)
embeddings_spacy_fe_df

100%|██████████| 6402/6402 [00:31<00:00, 205.67it/s]


Unnamed: 0,index,sentence,label,sentence embedding
0,0,"By January 1st, 2037, Tesla will have been the...",1,"[-0.046937417, 0.18488835, -0.019333577, -0.09..."
1,1,An annual average temperature anomaly value ab...,1,"[-0.080327354, 0.18459626, 0.0097009195, -0.01..."
2,2,Private Nonfarm business productivity growth w...,1,"[-0.09731141, 0.27078336, 0.020201702, -0.0160..."
3,3,No Republican will be President of the USA bef...,1,"[0.029054716, 0.19021763, 0.065829575, -0.1418..."
4,4,The market capitalization of Berkshire Hathawa...,1,"[-0.058103725, 0.20959128, 0.09174133, -0.0162..."
...,...,...,...,...
6397,2540,Many major technology players are [TeleNav Inc...,0,"[-0.070845485, 0.28657782, 0.01468689, -0.1156..."
6398,2541,"WaterIQ Technologies, the leader in next-gener...",0,"[-0.11760226, 0.16071157, -0.038852997, -0.098..."
6399,2542,The Business Research Company's 'Clean Coal Te...,0,"[-0.12741901, 0.2028387, 0.023308, -0.05081283..."
6400,2543,'Prophecy Market Insights offers a 20% discoun...,0,"[-0.109398074, 0.19629641, -0.07043434, 0.0243..."


## Load Models

In [8]:
ml_model_checkpoints = {}

model_checkpoint_path = os.path.join(base_path, 'model_checkpoint')
model_checkpoint_files = os.listdir(model_checkpoint_path)

for model_checkpoint_filename in model_checkpoint_files:
    specific_model_checkpoint_path = os.path.join(model_checkpoint_path, model_checkpoint_filename)
    # print(specific_model_checkpoint_path)
    model_name = specific_model_checkpoint_path[169:-4]
    # print(model_name)
    if "Sentence Label" in model_name:
        print(model_name)
        ml_model_checkpoint = joblib.load(specific_model_checkpoint_path)
        ml_model_checkpoints[model_name] = ml_model_checkpoint

logistic_regression-Sentence Label
sgd_classifier-Sentence Label
random_forest_classifier-Sentence Label
decision_tree_classifier-Sentence Label
gradient_boosting_classifier-Sentence Label
ridge_classifier-Sentence Label
perceptron-Sentence Label


In [9]:
ml_model_checkpoints

{'logistic_regression-Sentence Label': <classification_models.SkLearnLogisticRegression at 0x34ca42110>,
 'sgd_classifier-Sentence Label': <classification_models.SkLearnSGDClassifier at 0x3bd21e3d0>,
 'random_forest_classifier-Sentence Label': <classification_models.SkLearnRandomForestClassifier at 0x3d8025a90>,
 'decision_tree_classifier-Sentence Label': <classification_models.SkLearnDecisionTreeClassifier at 0x3bd21e110>,
 'gradient_boosting_classifier-Sentence Label': <classification_models.SkLearnGradientBoostingClassifier at 0x3d80548d0>,
 'ridge_classifier-Sentence Label': <classification_models.SkLearnRidgeClassifier at 0x3d809c5d0>,
 'perceptron-Sentence Label': <classification_models.SkLearnPerceptronModel at 0x3d809ff90>}

## Inference

In [10]:
ml_models_with_predictions = {}

chronicle2050_sentence_embeddings  = embeddings_spacy_fe_df['sentence embedding'].to_list()
ml_models_with_predictions['chronicle2050_dataset'] = {}
# ml_models_with_predictions['sentiment140_dataset'] = {}

for model_name, ml_model in ml_model_checkpoints.items():
    print(f"Predict for {ml_model.get_model_name()}")
    ml_model_predictions = ml_model.predict(chronicle2050_sentence_embeddings)
    ml_models_with_predictions['chronicle2050_dataset'][model_name] = ml_model_predictions
    # ml_models_with_predictions['sentiment140_dataset'][model_name] = ml_model_predictions

Predict for Logistic Regression
Predict for SDG Classifier
Predict for Random Forest
Predict for Decision Tree
Predict for Gradient Boosting Machine
Predict for Ridge Classifier
Predict for Perceptron


### Align chronicle2050 Sentences with Predicted Sentence Label from Models

In [11]:
results_df = embeddings_spacy_fe_df.copy()
for key, value in ml_models_with_predictions['chronicle2050_dataset'].items():
# for key, value in ml_models_with_predictions['sentiment140_dataset'].items():
    results_df[key] = value.to_list()
results_df

Unnamed: 0,index,sentence,label,sentence embedding,logistic_regression-Sentence Label,sgd_classifier-Sentence Label,random_forest_classifier-Sentence Label,decision_tree_classifier-Sentence Label,gradient_boosting_classifier-Sentence Label,ridge_classifier-Sentence Label,perceptron-Sentence Label
0,0,"By January 1st, 2037, Tesla will have been the...",1,"[-0.046937417, 0.18488835, -0.019333577, -0.09...",0,0,0,0,0,0,0
1,1,An annual average temperature anomaly value ab...,1,"[-0.080327354, 0.18459626, 0.0097009195, -0.01...",1,1,0,0,0,1,1
2,2,Private Nonfarm business productivity growth w...,1,"[-0.09731141, 0.27078336, 0.020201702, -0.0160...",0,1,0,0,0,0,1
3,3,No Republican will be President of the USA bef...,1,"[0.029054716, 0.19021763, 0.065829575, -0.1418...",0,0,0,0,1,0,1
4,4,The market capitalization of Berkshire Hathawa...,1,"[-0.058103725, 0.20959128, 0.09174133, -0.0162...",0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
6397,2540,Many major technology players are [TeleNav Inc...,0,"[-0.070845485, 0.28657782, 0.01468689, -0.1156...",0,0,0,0,0,0,0
6398,2541,"WaterIQ Technologies, the leader in next-gener...",0,"[-0.11760226, 0.16071157, -0.038852997, -0.098...",0,0,0,0,0,0,0
6399,2542,The Business Research Company's 'Clean Coal Te...,0,"[-0.12741901, 0.2028387, 0.023308, -0.05081283...",0,0,0,0,0,0,0
6400,2543,'Prophecy Market Insights offers a 20% discoun...,0,"[-0.109398074, 0.19629641, -0.07043434, 0.0243...",0,0,0,0,0,0,0


## Save Output

In [12]:
save_chronicle2050_results_path = os.path.join(base_path, 'inference/chronicle2050_results')
# save_sentiment140_results_path = os.path.join(base_path, 'inference/sentiment140_results')

In [13]:
DataProcessing.save_to_file(results_df, save_chronicle2050_results_path, 'chronicle2050_results', '.csv')
# DataProcessing.save_to_file(results_df, save_sentiment140_results_path, 'sentiment140_results', '.csv')

Using file number: 6
Saving CSV file to: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_classification_experiments-v2/../data/inference/chronicle2050_results/chronicle2050_results-v6.csv


### Check if properly saved

In [14]:
load_chronicle2050_results_path = os.path.join(base_path, save_chronicle2050_results_path, 'chronicle2050_results-v5.csv')
load_chronicle2050_results_df = DataProcessing.load_from_file(load_chronicle2050_results_path, sep=',')
load_chronicle2050_results_df

Unnamed: 0,index,sentence,label,sentence embedding,logistic_regression-Sentence Label,sgd_classifier-Sentence Label,random_forest_classifier-Sentence Label,decision_tree_classifier-Sentence Label,gradient_boosting_classifier-Sentence Label,ridge_classifier-Sentence Label,perceptron-Sentence Label
0,0,"By January 1st, 2037, Tesla will have been the...",1,[-4.69374172e-02 1.84888348e-01 -1.93335768e-...,0,0,0,0,0,0,0
1,1,An annual average temperature anomaly value ab...,1,[-8.03273544e-02 1.84596255e-01 9.70091950e-...,0,0,0,0,1,0,1
2,2,Private Nonfarm business productivity growth w...,1,[-9.73114073e-02 2.70783365e-01 2.02017017e-...,0,0,0,1,0,0,1
3,3,No Republican will be President of the USA bef...,1,[ 2.90547162e-02 1.90217629e-01 6.58295751e-...,0,0,0,1,0,0,1
4,4,The market capitalization of Berkshire Hathawa...,1,[-5.81037253e-02 2.09591284e-01 9.17413309e-...,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6397,2540,Many major technology players are [TeleNav Inc...,0,[-7.08454847e-02 2.86577821e-01 1.46868899e-...,0,0,0,0,0,0,0
6398,2541,"WaterIQ Technologies, the leader in next-gener...",0,[-1.17602259e-01 1.60711572e-01 -3.88529971e-...,0,0,0,1,0,0,0
6399,2542,The Business Research Company's 'Clean Coal Te...,0,[-1.27419010e-01 2.02838704e-01 2.33079996e-...,0,0,0,0,0,0,0
6400,2543,'Prophecy Market Insights offers a 20% discoun...,0,[-1.09398074e-01 1.96296409e-01 -7.04343393e-...,0,0,0,0,0,0,0
