In [None]:
# Imports for text preprocessing (re, string, nltk) and data handling (numpy, pandas).
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
# Fetch the NLTK stopword corpus that `stopwords.words('english')` reads during preprocessing.
nltk.download('stopwords')
# Colab-specific: mount Google Drive so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Mounted at /content/drive


### Data Load and Preprocessing 

In [None]:
# Location of the labelled training tweets on the mounted Drive.
train_csv_path = '/content/drive/MyDrive/CS410Project/train_data.csv'
data = pd.read_csv(train_csv_path, low_memory=False)

# Class balance of the sentiment labels (displayed as the cell output).
data['sentiment'].value_counts()

 0.0    79152
 1.0    72356
-1.0    17232
Name: sentiment, dtype: int64

In [None]:
def process(text, stop_words=None):
    """Normalise one piece of text: lowercase, strip noise, drop stopwords.

    Steps, in order: cast to ``str``, lowercase, remove ``@username``
    mentions, remove ``http...`` links, strip ASCII punctuation, drop
    stopwords, and re-join the remaining whitespace-separated tokens with
    single spaces.

    Args:
        text: Raw text. Non-string values go through ``str()`` first, so a
            missing value such as ``float('nan')`` becomes the token ``'nan'``.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stopword list is used; it is read once
            and cached on the function so that it is not re-loaded for every
            word of every row.

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    if stop_words is None:
        stop_words = getattr(process, '_english_stopwords', None)
        if stop_words is None:
            # Load once; a frozenset gives O(1) membership tests instead of
            # scanning a freshly built list for each token.
            stop_words = frozenset(stopwords.words('english'))
            process._english_stopwords = stop_words

    text = str(text).lower()
    text = re.sub(r'@[^\s]+', '', text)  # remove username
    text = re.sub(r'http\S+', '', text, flags=re.MULTILINE)  # remove http link
    # Delete every character listed in string.punctuation in one pass.
    nopunc = text.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(word for word in nopunc.split() if word not in stop_words)

# Keep only the modelling columns and the rows that have a sentiment label
# *before* cleaning, so the per-row text preprocessing is not spent on rows
# that would be discarded anyway. .copy() detaches the subset so the column
# assignment below operates on an independent frame.
data = data.loc[data['sentiment'].notna(), ['text', 'sentiment']].copy()

# Pass the function directly; wrapping it in a lambda adds nothing.
data['text'] = data['text'].apply(process)


### Train/Test/Validation Split

In [None]:
import scipy.sparse as sp
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# First split: 70 % train, 30 % held out.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    data['text'], data['sentiment'], test_size=0.3, random_state=42
)
# Second split: halve the held-out part into test and validation sets.
x_test, x_valid, y_test, y_valid = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=42
)

# Replace any missing label with 0 in each split.
y_train, y_valid, y_test = (
    labels.fillna(0) for labels in (y_train, y_valid, y_test)
)

# Sanity check: features and labels must have matching lengths per split.
for split_features, split_labels in ((x_train, y_train), (x_valid, y_valid), (x_test, y_test)):
    print(split_features.shape, split_labels.shape)


(118118,) (118118,)
(25311,) (25311,)
(25311,) (25311,)


In [None]:
# Peek at the cleaned test-set texts (pandas abbreviates the displayed Series).
x_test

8018                             eeenf new highs today🚀🚀🚀🚀🚀
83665     rt earn bitcoins using google chrome bitcoin f...
129028    hate vs coreblockstream bs propaganda thats ca...
46148                          sii think im done see ya 290
127826            best chart analyst bitcoin live 2 minutes
                                ...                        
3686                                 ttcm real nft play fyi
160977    eth ethereum blockchain smartcontracts crypto ...
65188     coinbase talks buy one bitcoins best funded st...
39597     ltc giveaway ill giving 05 ltc 1 lucky followe...
93199     name metaverse etp symbol etp 24 hour change 1...
Name: text, Length: 25311, dtype: object

### Binary Bag-of-Words / TF-IDF Representation and Cross-Validation Splits

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# max_vocab = 3000
# bow_transformer = CountVectorizer(max_features=max_vocab, ngram_range=(1,2), binary=True ).fit(x_train.values.astype('U'))
# x_train = bow_transformer.transform(x_train.values.astype('U'))
# x_valid = bow_transformer.transform(x_valid.values.astype('U'))
# x_test = bow_transformer.transform(x_test.values.astype('U'))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary size cap for the TF-IDF features (unigrams and bigrams).
max_vocab = 4000

# Learn the vocabulary and IDF weights from the training texts only.
tfidf_transformer = TfidfVectorizer(max_features=max_vocab, ngram_range=(1, 2))
tfidf_transformer.fit(x_train.values.astype('U'))

# Rebind each split from raw text to its TF-IDF matrix.
# NOTE: after this cell x_train/x_valid/x_test no longer hold text, so the
# cell cannot be re-run without re-running the split cell first.
x_train, x_valid, x_test = [
    tfidf_transformer.transform(texts.values.astype('U'))
    for texts in (x_train, x_valid, x_test)
]


In [None]:
from sklearn.model_selection import GridSearchCV, PredefinedSplit

# One marker per row of the stacked train+validation data:
# -1 keeps a row out of every test fold (training rows), 0 assigns it to
# test fold 0 (validation rows), so the search evaluates on one fixed split.
my_test_fold = [-1] * x_train.shape[0] + [0] * x_valid.shape[0]
fold = PredefinedSplit(test_fold=my_test_fold)

# Stack features and labels in the same train-then-validation order as the markers.
x_cv = sp.vstack((x_train, x_valid))
y_cv = np.append(y_train, y_valid)

### Hyperparameter Value Search

In [None]:
from sklearn.linear_model import LogisticRegression

def cross_val(parameters, x, y, fold):
    """Grid-search logistic-regression settings and return the best one.

    A one-vs-rest ``liblinear`` logistic regression is evaluated for every
    combination in ``parameters`` using the splits described by ``fold``,
    scored by macro-averaged F1. No final model is refit (``refit=False``);
    only the winning parameter combination is reported.

    Args:
        parameters: Parameter grid accepted by ``GridSearchCV``
            (e.g. ``{'C': [...]}``).
        x: Feature matrix passed to ``GridSearchCV.fit``.
        y: Labels aligned with the rows of ``x``.
        fold: Cross-validation splitter passed as ``cv``.

    Returns:
        dict: ``best_params_`` of the fitted grid search.
    """
    logreg = LogisticRegression(multi_class='ovr', solver='liblinear', max_iter=15000, dual=False)
    logreg_cv = GridSearchCV(logreg, parameters, cv=fold, scoring="f1_macro", refit=False)
    logreg_cv.fit(x, y)
    return logreg_cv.best_params_

# Coarse-to-fine search for the regularisation strength C.
# Round 1 scans a logarithmic grid; every later round scans a linear grid
# centred on the previous winner, until the winner stops moving.
values = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]

scale = 5          # number of increments the refined grid extends on each side of the winner
tolerance = .0001  # the search has converged once the winner moves by no more than this
max_rounds = 25    # safety cap: each round is a full grid search, and nothing
                   # guarantees that the winner ever becomes a fixed point
c_optimal = None
previous_best = 100000  # sentinel far from every candidate so the first round always refines
for _ in range(max_rounds):
    param_grid = {'C': values}
    best_params = cross_val(param_grid, x_cv, y_cv, fold)
    current_best = best_params['C']
    if abs(current_best - previous_best) > tolerance:
        # Winner moved: build a grid from 0.5x up to (but excluding) 1.5x the
        # winner in steps of one tenth of it, then search again.
        increment = current_best / 10
        lower_bound = current_best - increment * scale
        upper_bound = current_best + increment * scale
        values = np.arange(lower_bound, upper_bound, increment)
        previous_best = current_best
    else:
        # Winner is stable between two consecutive rounds: accept it.
        c_optimal = current_best
        break
else:
    # Cap reached without convergence: fall back to the most recent winner
    # instead of looping forever.
    c_optimal = current_best
    print(f'Warning: C search did not converge within {max_rounds} rounds; using the last best value.')

print(f'Best C: {c_optimal}')

Best C: 90.0


### Train With Detected Hyperparameter and Predict on Test Set

In [None]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, average_precision_score


# Fit the final-candidate model on the training split with the selected C.
logreg = LogisticRegression(C=c_optimal, multi_class='ovr', solver='liblinear',  max_iter = 15000, dual=False)
logreg.fit(x_train, y_train)

# Predictions for the training split and for the held-out test split.
y_hat_train = logreg.predict(x_train)
y_hat_test = logreg.predict(x_test)

# (true labels, predictions, accuracy label, balanced-accuracy label, F1 label)
evaluations = (
    (y_train, y_hat_train,
     '\t Training accuracy: \t \t', '\t Training balanced accuracy: \t', '\t Training F1: \t \t \t'),
    (y_test, y_hat_test,
     '\t Test accuracy: \t \t', '\t Test balanced accuracy: \t', '\t Test F1: \t \t \t'),
)

for y_true, y_pred, accuracy_label, balanced_label, f1_label in evaluations:
    # Compare the true class distribution with the predicted one.
    print(y_true.value_counts())
    print(pd.Series(y_pred).value_counts())
    print(accuracy_label, accuracy_score(y_true, y_pred))
    print(balanced_label, balanced_accuracy_score(y_true, y_pred))
    # NOTE(review): F1 is micro-averaged here while the C search scored with
    # f1_macro; the recorded outputs show this value matching plain accuracy.
    print(f1_label, f1_score(y_true, y_pred, average='micro'))


 0.0    55332
 1.0    50712
-1.0    12074
Name: sentiment, dtype: int64
 0.0    60566
 1.0    48375
-1.0     9177
dtype: int64
	 Training accuracy: 	 	 0.8606224284190386
	 Training balanced accuracy: 	 0.7969818925701193
	 Training F1: 	 	 	 0.8606224284190386
 0.0    11843
 1.0    10943
-1.0     2525
Name: sentiment, dtype: int64
 0.0    12980
 1.0    10431
-1.0     1900
dtype: int64
	 Test accuracy: 	 	 0.8479712378017463
	 Test balanced accuracy: 	 0.7780522723688735
	 Test F1: 	 	 	 0.8479712378017463


### Train LogReg on Full Set and Save Model State / TF-IDF Transformer

In [None]:
# Refit both the vectoriser and the classifier on all rows of `data`
# (train + validation + test) using the C selected above.
full_corpus = data['text'].values.astype('U')
full_labels = data['sentiment'].fillna(0)

tfidf_transformer = TfidfVectorizer(max_features=max_vocab, ngram_range=(1, 2))
tfidf_transformer.fit(full_corpus)
# NOTE: x_train is rebound here to the full-data matrix, not just the training split.
x_train = tfidf_transformer.transform(full_corpus)

logreg = LogisticRegression(C=c_optimal, multi_class='ovr', solver='liblinear',  max_iter = 15000, dual=False)
# Last expression of the cell: the fitted estimator is displayed as output.
logreg.fit(x_train, full_labels)

LogisticRegression(C=90.0, max_iter=15000, multi_class='ovr',
                   solver='liblinear')

In [13]:
import pickle

# Persist the fitted classifier and its matching vectoriser side by side;
# both are needed together to score new text.
artifacts = (
    ('logreg.pkl', logreg),
    ('tfidf_transformer.pkl', tfidf_transformer),
)
for filename, artifact in artifacts:
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)