In [None]:
import pandas as pd

# Read the first 100,000 rows and discard row 90 straight away:
#this tweet's formatting is just really messed up
df = pd.read_csv("data/sentiment140-subset.csv", nrows=100000).drop(index=90)
df.head(100)

Unnamed: 0,polarity,text
0,0,@kconsidder You never tweet
1,0,Sick today coding from the couch.
2,1,"@ChargerJenn Thx for answering so quick,I was ..."
3,1,Wii fit says I've lost 10 pounds since last ti...
4,0,@MrKinetik Not a thing!!! I don't really have...
5,1,and its pretty cool I never thought I would e...
6,0,finallly! twitter wouldnt let me in! x
7,1,"Just took a shower, i feel so refreshed now"
8,0,Love the show! You and Jasmine got me watching...
9,1,I have a story to tell you... its my birthday ...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import re
import string
import unidecode

# Deletion table built once: every ASCII punctuation mark from
# string.punctuation plus a handful of extra non-ASCII symbols.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation + 'Ã§º¯³|¡¿')


def remove_punct(s):
    """Return ``s`` with all punctuation characters deleted.

    Characters are removed outright (not replaced by a space), so words that
    were separated only by punctuation end up joined: ``"quick,I"`` becomes
    ``"quickI"``.

    Args:
        s: The string to strip.

    Returns:
        A new string without any character from the deletion table.
    """
    # A single translate() pass replaces one str.replace() call per
    # punctuation character.
    return s.translate(_PUNCT_TABLE)

def remove_accents(accented_string):
    """Pass ``accented_string`` through ``unidecode.unidecode`` and return the result."""
    return unidecode.unidecode(accented_string)

def clean(df):
    """Normalise the tweets in ``df['text']`` in place.

    Every tweet goes through these steps, in this order: strip @handles,
    strip ``http...`` URLs, strip digits, strip punctuation (``remove_punct``),
    transliterate accents (``remove_accents``), lower-case.

    Args:
        df: DataFrame with a string ``text`` column. The column is overwritten.

    Returns:
        None. ``df`` is modified in place.
    """
    # Compiled once per call instead of being looked up for every row.
    handle_re = re.compile(r'\w*@\w*')
    url_re = re.compile(r'http\S+')
    digit_re = re.compile(r'[0-9]')

    def clean_one(text):
        text = handle_re.sub('', text)  #remove twitter handles
        text = url_re.sub('', text)     #remove urls
        text = digit_re.sub('', text)   #remove numbers
        text = remove_punct(text)       #remove punctuation marks
        text = remove_accents(text)     #remove accents
        return text.lower()             #lower case

    # NOTE(review): HTML entities such as "&amp;" are not decoded; once the
    # punctuation is stripped they survive as plain words (e.g. "amp").

    # One pass over the column instead of six separate map() calls.
    df['text'] = df['text'].map(clean_one)

# Normalise df['text'] in place (clean() assigns the cleaned column back onto df).
clean(df)

In [None]:
# TF-IDF bag-of-words features, with the vocabulary capped at 1,000 terms.
vectorizer = TfidfVectorizer(max_features=1000)
vectors = vectorizer.fit_transform(df.text)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is the supported replacement.
# df's own index is reused so the feature rows stay label-aligned with
# df.polarity (df has a gap where row 90 was dropped).
# NOTE: toarray() materialises a dense (n_tweets x 1000) matrix.
words_df = pd.DataFrame(
    vectors.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=df.index,
)

# Filled in by the classifier cells: {best CV score: [model name, best params]}.
results = {}

# Kept as the last expression so the preview is actually displayed.
words_df.head()

In [None]:
# Features: the TF-IDF term weights. Target: the polarity column of df.
X = words_df
y = df['polarity'] #sentiments

### Classifiers

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# This line used to import the truncated name `ne`, which raises ImportError
# and stops the cell before GridSearchCV below is ever imported.
from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn.model_selection import GridSearchCV

#### Log Reg

In [None]:
%%time

# 10-fold cross-validation of a newton-cg logistic regression. The parameter
# grid is empty, so exactly one candidate (the estimator as constructed) is
# evaluated.
clf = GridSearchCV(
    estimator=LogisticRegression(solver='newton-cg'),
    param_grid={},
    cv=10,
)
clf.fit(X, y)

# cv_results_ as a table; shown as the cell output.
clf_df = pd.DataFrame(clf.cv_results_)
clf_df


Wall time: 2min 38s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,14.13958,0.492632,0.022261,0.001734,{},0.76,0.7628,0.7612,0.768,0.7634,0.7612,0.7584,0.7556,0.7605,0.766477,0.761758,0.003457,1


In [None]:
# Record the best CV score -> [model name, best params].
# (Model name fixed: it was misspelled 'LogsiticRegression'.)
# NOTE(review): the dict is keyed by the float score, so two models with an
# identical best_score_ would overwrite one another.
results[clf.best_score_] = ['LogisticRegression', clf.best_params_]

In [None]:
results

{0.7617576647664767: ['LogsiticRegression', {}]}

#### Lin Reg

In [None]:
%%time

# 10-fold grid search over LinearRegression's `normalize` switch.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed version still accepts it.
# NOTE(review): LinearRegression is a regressor, so the score reported here is
# presumably R^2 rather than classification accuracy — confirm before
# comparing it with the other models.
clf = GridSearchCV(
    estimator=LinearRegression(),
    param_grid=dict(normalize=(True, False)),
    cv=10,
)
clf.fit(X, y)

clf_df = pd.DataFrame(clf.cv_results_)
# clf_df

Wall time: 33.6 s


In [None]:
# Record the best CV score -> [model name, best params].
# NOTE(review): LinearRegression is a regressor, so this score is presumably
# R^2 and not comparable with the classifiers' entries — confirm.
results[clf.best_score_] = ['LinearRegression', clf.best_params_]

In [None]:
results

{0.7551583750138936: ['LogsiticRegression', {'solver': 'newton-cg'}],
 0.3084938169917256: ['LinearRegression', {'normalize': True}]}

#### Linear SVC

In [None]:
%%time

# 10-fold grid search for a squared-hinge linear SVM over the iteration cap
# and the primal/dual formulation (3 x 2 = 6 candidates).
clf = GridSearchCV(
    estimator=LinearSVC(loss='squared_hinge'),
    param_grid=dict(
        max_iter=(5000, 10000, 50000),
        dual=(True, False),
    ),
    cv=10,
)
clf.fit(X, y)

clf_df = pd.DataFrame(clf.cv_results_)


Wall time: 22.7 s


In [None]:
# Record the best CV score -> [model name, best params].
results[clf.best_score_] = ['LinearSVC', clf.best_params_]

In [None]:
results

{0.7551583750138936: ['LogsiticRegression', {'solver': 'newton-cg'}],
 0.3084938169917256: ['LinearRegression', {'normalize': True}],
 0.7523916638879626: ['LinearSVC', {'dual': True, 'max_iter': 5000}]}

#### Multinomial Naive Bayes

In [None]:
%%time

# 10-fold grid search over the smoothing strength alpha:
# 0.5, 1.0, 1.5, ..., 10.0 (twenty candidates in steps of 0.5).
clf = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid={'alpha': tuple(half_steps / 2 for half_steps in range(1, 21))},
    cv=10,
)
clf.fit(X, y)

clf_df = pd.DataFrame(clf.cv_results_)

Wall time: 40.6 s


In [None]:
# Record the best CV score -> [model name, best params].
results[clf.best_score_] = ['MultinomialNB', clf.best_params_]

In [None]:
results

{0.7551583750138936: ['LogsiticRegression', {'solver': 'newton-cg'}],
 0.3084938169917256: ['LinearRegression', {'normalize': True}],
 0.7523916638879626: ['LinearSVC', {'dual': True, 'max_iter': 5000}],
 0.746458241636101: ['MultinomialNB', {'alpha': 6.5}]}

#### Random Forest

In [None]:
%%time

# 10-fold grid search over the number of trees.
# The forest is seeded so the scores are reproducible after a kernel restart;
# an unseeded forest gives slightly different numbers on every run.
clf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid={
        'n_estimators': (10, 100, 150),
    },
    cv=10,
)
clf.fit(X, y)

clf_df = pd.DataFrame(clf.cv_results_)

Wall time: 25min 50s


In [None]:
# Record the best CV score -> [model name, best params].
results[clf.best_score_] = ['RandomForest', clf.best_params_]

In [None]:
results

{0.7551583750138936: ['LogsiticRegression', {'solver': 'newton-cg'}],
 0.3084938169917256: ['LinearRegression', {'normalize': True}],
 0.7523916638879626: ['LinearSVC', {'dual': True, 'max_iter': 5000}],
 0.746458241636101: ['MultinomialNB', {'alpha': 6.5}],
 0.7402913193286651: ['RandomForest', {'n_estimators': 150}]}

#### AdaBoost

In [None]:
%%time

# 10-fold grid search over AdaBoost's learning rate.
# The estimator is seeded so reruns give the same scores.
# NOTE(review): the recorded best value was 1.0, the smallest candidate in the
# grid — consider adding values below 1.0.
clf = GridSearchCV(
    estimator=AdaBoostClassifier(random_state=42),
    param_grid={
        'learning_rate': (1.0, 2.0, 3.0),
    },
    cv=10,
)
clf.fit(X, y)

clf_df = pd.DataFrame(clf.cv_results_)

Wall time: 14min 54s


In [None]:
# Record the best CV score -> [model name, best params].
results[clf.best_score_] = ['AdaBoost', clf.best_params_]

In [None]:
results

{0.7551583750138936: ['LogsiticRegression', {'solver': 'newton-cg'}],
 0.3084938169917256: ['LinearRegression', {'normalize': True}],
 0.7523916638879626: ['LinearSVC', {'dual': True, 'max_iter': 5000}],
 0.746458241636101: ['MultinomialNB', {'alpha': 6.5}],
 0.7402913193286651: ['RandomForest', {'n_estimators': 150}],
 0.6860564632655329: ['AdaBoost', {'learning_rate': 1.0}]}

#### Gaussian Process Classifier

In [None]:
# TODO: the Gaussian process classifier is not evaluated here — this one doesn't work yet

### Using BERT

In [None]:
import pandas as pd

# Reload a smaller slice (first 30,000 rows) for the BERT experiments and
# discard row 90 again:
#this tweet's formatting is just really messed up
df = pd.read_csv("data/sentiment140-subset.csv", nrows=30000).drop(index=90)

In [None]:
# Apply the same in-place text normalisation to the freshly reloaded frame.
clean(df)

In [None]:
import numpy as np
import torch
import transformers as ppb
from sklearn.model_selection import train_test_split

In [None]:
# For DistilBERT:
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
# (both are loaded from the same checkpoint name)
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
# Encode every tweet to a list of token ids (special tokens included) and
# report the longest encoded length found in the whole frame.
tokenized = df['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
global_max_len = max(len(ids) for ids in tokenized.values)
print(global_max_len)

86


In [None]:
num_batches = 20

# Split df row-wise into `num_batches` consecutive, nearly equal DataFrames.
# Building the list directly, instead of unpacking into batch_1 ... batch_20,
# means num_batches can be changed without editing anything else.
# The row positions are split rather than the frame itself, and each chunk is
# then picked out with iloc.
batches = [
    df.iloc[row_positions]
    for row_positions in np.array_split(np.arange(len(df)), num_batches)
]

In [None]:
#this is tokenizing the tweets and encoding them so that BERT can understand

# Per-batch embeddings are collected here and stacked after the loop.
# Previously `features` was overwritten on every iteration, so only the last
# batch's embeddings survived.
feature_chunks = []

# NOTE(review): the recorded run of this loop ended in an out-of-memory error;
# if that persists, use smaller batches so each forward pass needs less RAM.
for batch in batches:
    tokenized = batch['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

    #add padding
    # Right-pad each id list with 0 up to the longest sequence in this batch.
    max_len = max(len(li) for li in tokenized.values)
    padded = np.array([li + [0]*(max_len-len(li)) for li in tokenized.values])

    #masking
    # 1 where the id is non-zero, 0 elsewhere; assumes id 0 is only ever
    # padding — TODO confirm for the chosen tokenizer.
    attention_mask = np.where(padded != 0, 1, 0)

    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask)

    # Forward pass with gradient tracking switched off.
    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)

    # First model output, position 0 of every sequence (presumably the [CLS]
    # token): one vector per tweet.
    batch_features = last_hidden_states[0][:,0,:].numpy()
    feature_chunks.append(batch_features)
    # Print the shape as a progress marker rather than the whole array.
    print(batch_features.shape)

# One row per tweet, in the order the tweets appear across `batches`.
features = np.concatenate(feature_chunks)
    


RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:75] data. DefaultCPUAllocator: not enough memory: you tried to allocate 251904000 bytes. Buy new RAM!

In [None]:
# Work on the first 1,500 rows only.
batch = df.iloc[:1500]

In [None]:

# Encode the tweets in `batch` to token-id lists (special tokens included).
tokenized = batch['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

#add padding
# Right-pad every id list with 0 so all rows share the longest length.
max_len = max(len(ids) for ids in tokenized.values)
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

#masking
# 1 where the id is non-zero, 0 elsewhere; assumes id 0 is only ever
# padding — TODO confirm for the chosen tokenizer.
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(np.where(padded != 0, 1, 0))

# Forward pass with gradient tracking switched off.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

# First model output, position 0 of every sequence: one vector per tweet.
features = last_hidden_states[0][:, 0, :].numpy()
print(features)


[[ 0.08996668  0.00164525 -0.08337827 ... -0.08960109  0.27462918
   0.30040133]
 [-0.04538557 -0.00406603  0.07527261 ... -0.17595594  0.39300975
   0.17493433]
 [ 0.14887248  0.07762916 -0.0452455  ... -0.10213505  0.400743
   0.37653345]
 ...
 [ 0.07920368  0.4093396   0.18753985 ...  0.05454692  0.2608588
   0.30745062]
 [-0.00567827 -0.06137022  0.08889808 ... -0.1516546   0.2996365
   0.11575194]
 [-0.15403602 -0.04427864  0.25128025 ... -0.14146762  0.59053
   0.0090637 ]]


In [None]:
# The target is the sentiment label. This previously used batch['text'] (the
# tweets themselves) as labels; the recorded test accuracy of ~0.003 is
# consistent with that mistake.
labels = batch['polarity']

# Seeded so the split, and therefore the reported score, is reproducible.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

In [None]:
# Fit a classifier on the training split of the embedding features.
# Alternative model kept for quick swapping:
# clf = LinearSVC(loss='squared_hinge')
clf = LogisticRegression(solver='newton-cg')
clf.fit(train_features, train_labels)



LogisticRegression(solver='newton-cg')

In [None]:
# Score the fitted classifier on the held-out split.
clf.score(test_features, test_labels)

0.0026666666666666666

#### Ready for training

In [None]:
# Both tensors are rebuilt from `padded`, so the cell no longer depends on
# whatever an earlier cell left in `attention_mask`. Re-wrapping an existing
# tensor with torch.tensor() also triggers a copy-construct warning.
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(np.where(padded != 0, 1, 0))

# NOTE(review): the recorded run of this cell failed with an out-of-memory
# error (about 15.7 GB requested). `padded` is presumably too large for a
# single forward pass; feed it through the model in smaller slices.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:75] data. DefaultCPUAllocator: not enough memory: you tried to allocate 15666677760 bytes. Buy new RAM!