# ML Classification-Inference

- Datasets:
    1. chronicle2050
    2. sentiment140

In [1]:
import os
import sys
import joblib
import warnings

import pandas as pd

from tqdm import tqdm

notebook_dir = os.getcwd()

sys.path.append(os.path.join(notebook_dir, '../'))

from data_processing import DataProcessing
from feature_extraction import SpacyFeatureExtraction
from classification_models import SkLearnModelFactory

In [2]:
warnings.filterwarnings('ignore')

## Load Dataset(s)

In [3]:
base_path = DataProcessing.load_base_data_path(notebook_dir)

### Load chronicle2050

- CODE: [chronicle2050](https://github.com/regevson/chronicle2050/tree/master/dataset): 6,800 sentences from several datasets (Longbets, Horizons, New York Times, and ChatGPT).
- PAPER: [Future Timelines: Extraction and Visualization of Future-Related](https://dl.acm.org/doi/10.1145/3616855.3635693)

- Mappings:
    - 0s: Non-Future-Related
    - 1s: Future-Related

In [4]:
chronicle2050_path = os.path.join(base_path, 'chronicle2050', 'data.csv')
chronicle2050_df = DataProcessing.load_from_file(chronicle2050_path, sep=',')
chronicle2050_df

Unnamed: 0,index,sentence,label
0,0,"By January 1st, 2037, Tesla will have been the...",1
1,1,An annual average temperature anomaly value ab...,1
2,2,Private Nonfarm business productivity growth w...,1
3,3,No Republican will be President of the USA bef...,1
4,4,The market capitalization of Berkshire Hathawa...,1
...,...,...,...
6397,2540,Many major technology players are [TeleNav Inc...,0
6398,2541,"WaterIQ Technologies, the leader in next-gener...",0
6399,2542,The Business Research Company's 'Clean Coal Te...,0
6400,2543,'Prophecy Market Insights offers a 20% discoun...,0


### Load sentiment140

In [5]:
sentiment140_path = os.path.join(base_path, 'sentiment140', 'data.csv')
sentiment140_df = pd.read_csv(sentiment140_path, encoding = "ISO-8859-1", header=None)
# sentiment140_df = DataProcessing.load_from_file(sentiment140_path, sep=',')
sentiment140_df.columns = [
    "target",      # Sentiment label: 0 = negative, 4 = positive
    "id",          # Tweet ID
    "date",        # Date of the tweet
    "flag",        # Query (unused, often 'NO_QUERY')
    "user",        # Username of the tweet author
    "sentence"         # The tweet content
]
sentiment140_df

Unnamed: 0,target,id,date,flag,user,sentence
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


## Embed Sentences

In [6]:
sentiment140_df = sentiment140_df.loc[:len(chronicle2050_df), :]

In [7]:
# spacy_fe = SpacyFeatureExtraction(chronicle2050_df, 'sentence')
spacy_fe = SpacyFeatureExtraction(sentiment140_df, 'sentence')
embeddings_spacy_fe_df = spacy_fe.sentence_embeddings_extraction()
embeddings_spacy_fe_df.rename(columns={'sentence Embedding': 'sentence embedding'}, inplace=True)
embeddings_spacy_fe_df

100%|██████████| 6403/6403 [00:23<00:00, 274.32it/s]


Unnamed: 0,target,id,date,flag,user,sentence,sentence embedding
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[-0.045709178, 0.20297684, -0.12165805, -0.106..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[-0.11649713, 0.2126041, -0.15201212, -0.08377..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[-0.0027364073, 0.096247524, -0.0858642, -0.07..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[-0.045068603, 0.1058173, -0.1635969, 0.064762..."
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[-0.07083678, 0.22407694, -0.22421975, -0.1930..."
...,...,...,...,...,...,...,...
6398,0,1469414760,Tue Apr 07 06:18:33 PDT 2009,NO_QUERY,SarahPsyDeal,"@mikeyzito last night, there was no wawa","[0.008054379, 0.22411261, -0.036844626, -0.120..."
6399,0,1469414791,Tue Apr 07 06:18:34 PDT 2009,NO_QUERY,ljmcq,@Schofe http://twitpic.com/2yd1o - this one is...,"[0.046313547, 0.13098091, -0.17229362, 0.03321..."
6400,0,1469414844,Tue Apr 07 06:18:34 PDT 2009,NO_QUERY,skoop,I still dearly miss the symfony dev environmen...,"[0.15832208, 0.05969725, -0.086398706, 0.00068..."
6401,0,1469415083,Tue Apr 07 06:18:37 PDT 2009,NO_QUERY,Vicki_Owen,Stressing over the methodolgy section of the m...,"[-0.010205724, 0.24983001, -0.039484996, 0.072..."


## Load Models

In [8]:
ml_model_checkpoints = {}

model_checkpoint_path = os.path.join(base_path, 'model_checkpoint')
model_checkpoint_files = os.listdir(model_checkpoint_path)

for model_checkpoint_filename in model_checkpoint_files:
    specific_model_checkpoint_path = os.path.join(model_checkpoint_path, model_checkpoint_filename)
    # print(specific_model_checkpoint_path)
    model_name = specific_model_checkpoint_path[169:-4]
    print(model_name)
    ml_model_checkpoint = joblib.load(specific_model_checkpoint_path)
    ml_model_checkpoints[model_name] = ml_model_checkpoint

logistic_regression
random_forest_classifier
gradient_boosting_classifier
perceptron
ridge_classifier
sgd_classifier
decision_tree_classifier


In [9]:
ml_model_checkpoints

{'logistic_regression': <classification_models.SkLearnLogisticRegression at 0x398564c90>,
 'random_forest_classifier': <classification_models.SkLearnRandomForestClassifier at 0x3599ad850>,
 'gradient_boosting_classifier': <classification_models.SkLearnGradientBoostingClassifier at 0x365dfb3d0>,
 'perceptron': <classification_models.SkLearnPerceptronModel at 0x3cb57ba10>,
 'ridge_classifier': <classification_models.SkLearnRidgeClassifier at 0x3c92f4a10>,
 'sgd_classifier': <classification_models.SkLearnSGDClassifier at 0x3cb57acd0>,
 'decision_tree_classifier': <classification_models.SkLearnDecisionTreeClassifier at 0x3cb5793d0>}

## Inference

In [10]:
ml_models_with_predictions = {}

chronicle2050_sentence_embeddings  = embeddings_spacy_fe_df['sentence embedding'].to_list()
# ml_models_with_predictions['chronicle2050_dataset'] = {}
ml_models_with_predictions['sentiment140_dataset'] = {}

for model_name, ml_model in ml_model_checkpoints.items():
    print(f"Predict for {ml_model.get_model_name()}")
    ml_model_predictions = ml_model.predict(chronicle2050_sentence_embeddings)
    # ml_models_with_predictions['chronicle2050_dataset'][model_name] = ml_model_predictions
    ml_models_with_predictions['sentiment140_dataset'][model_name] = ml_model_predictions

Predict for Logistic Regression
Predict for Random Forest
Predict for Gradient Boosting Machine
Predict for Perceptron
Predict for Ridge Classifier
Predict for SDG Classifier
Predict for Decision Tree


### Align chronicle2050 Sentences with Predicted Sentence Label from Models

In [11]:
results_df = embeddings_spacy_fe_df.copy()
# for key, value in ml_models_with_predictions['chronicle2050_dataset'].items():
for key, value in ml_models_with_predictions['sentiment140_dataset'].items():
    results_df[key] = value.to_list()
results_df

Unnamed: 0,target,id,date,flag,user,sentence,sentence embedding,logistic_regression,random_forest_classifier,gradient_boosting_classifier,perceptron,ridge_classifier,sgd_classifier,decision_tree_classifier
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[-0.045709178, 0.20297684, -0.12165805, -0.106...",1,1,1,1,1,1,1
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[-0.11649713, 0.2126041, -0.15201212, -0.08377...",1,1,1,1,1,1,1
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[-0.0027364073, 0.096247524, -0.0858642, -0.07...",1,1,1,1,1,1,1
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[-0.045068603, 0.1058173, -0.1635969, 0.064762...",1,1,1,1,1,1,0
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[-0.07083678, 0.22407694, -0.22421975, -0.1930...",1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6398,0,1469414760,Tue Apr 07 06:18:33 PDT 2009,NO_QUERY,SarahPsyDeal,"@mikeyzito last night, there was no wawa","[0.008054379, 0.22411261, -0.036844626, -0.120...",1,1,1,1,1,1,1
6399,0,1469414791,Tue Apr 07 06:18:34 PDT 2009,NO_QUERY,ljmcq,@Schofe http://twitpic.com/2yd1o - this one is...,"[0.046313547, 0.13098091, -0.17229362, 0.03321...",1,1,1,1,1,1,1
6400,0,1469414844,Tue Apr 07 06:18:34 PDT 2009,NO_QUERY,skoop,I still dearly miss the symfony dev environmen...,"[0.15832208, 0.05969725, -0.086398706, 0.00068...",1,1,1,1,1,1,1
6401,0,1469415083,Tue Apr 07 06:18:37 PDT 2009,NO_QUERY,Vicki_Owen,Stressing over the methodolgy section of the m...,"[-0.010205724, 0.24983001, -0.039484996, 0.072...",1,1,1,1,1,1,1


## Save Output

In [13]:
# save_chronicle2050_results_path = os.path.join(base_path, 'inference/chronicle2050_results')
save_sentiment140_results_path = os.path.join(base_path, 'inference/sentiment140_results')

In [14]:
# DataProcessing.save_to_file(results_df, save_chronicle2050_results_path, 'chronicle2050_results', '.csv')
DataProcessing.save_to_file(results_df, save_sentiment140_results_path, 'sentiment140_results', '.csv')

Using file number: 1
Saving CSV file to: /Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_classification_experiments-v2/../data/inference/sentiment140_results/sentiment140_results-v1.csv


### Check if properly saved

In [15]:
load_chronicle2050_results_path = os.path.join(base_path, save_sentiment140_results_path, 'sentiment140_results-v1.csv')
load_chronicle2050_results_df = DataProcessing.load_from_file(load_chronicle2050_results_path, sep=',')
load_chronicle2050_results_df

Unnamed: 0,target,id,date,flag,user,sentence,sentence embedding,logistic_regression,random_forest_classifier,gradient_boosting_classifier,perceptron,ridge_classifier,sgd_classifier,decision_tree_classifier
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",[-4.57091779e-02 2.02976838e-01 -1.21658050e-...,1,1,1,1,1,1,1
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,[-1.16497129e-01 2.12604105e-01 -1.52012125e-...,1,1,1,1,1,1,1
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,[-2.73640733e-03 9.62475240e-02 -8.58642012e-...,1,1,1,1,1,1,1
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,[-4.50686030e-02 1.05817303e-01 -1.63596898e-...,1,1,1,1,1,1,0
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",[-7.08367825e-02 2.24076942e-01 -2.24219754e-...,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6398,0,1469414760,Tue Apr 07 06:18:33 PDT 2009,NO_QUERY,SarahPsyDeal,"@mikeyzito last night, there was no wawa",[ 8.05437937e-03 2.24112615e-01 -3.68446261e-...,1,1,1,1,1,1,1
6399,0,1469414791,Tue Apr 07 06:18:34 PDT 2009,NO_QUERY,ljmcq,@Schofe http://twitpic.com/2yd1o - this one is...,[ 4.63135466e-02 1.30980909e-01 -1.72293618e-...,1,1,1,1,1,1,1
6400,0,1469414844,Tue Apr 07 06:18:34 PDT 2009,NO_QUERY,skoop,I still dearly miss the symfony dev environmen...,[ 1.58322081e-01 5.96972518e-02 -8.63987058e-...,1,1,1,1,1,1,1
6401,0,1469415083,Tue Apr 07 06:18:37 PDT 2009,NO_QUERY,Vicki_Owen,Stressing over the methodolgy section of the m...,[-1.02057243e-02 2.49830008e-01 -3.94849963e-...,1,1,1,1,1,1,1
