In [51]:
import pandas as pd

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

# nltk
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB


#tensorflow
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds

# Utility
import pandas as pd
import numpy as np 
import warnings
warnings.filterwarnings('ignore')
import re
import string
import pickle

#df = pd.read_csv("sentiment140-subset.csv", nrows=100000)
df = pd.read_csv('training.1600000.processed.noemoticon.csv',encoding='latin', names = ['polarity','id','date','query','user','text'])
df = df.drop(index=90) #this tweet's formatting is just really messed up
df.head(10)

Unnamed: 0,polarity,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?


In [53]:
positives = df['polarity'][df.polarity == 4 ]
negatives = df['polarity'][df.polarity == 0 ]

print('Total length of the data is:         {}'.format(df.shape[0]))
print('No. of positve tagged sentences is:  {}'.format(len(positives)))
print('No. of negative tagged sentences is: {}'.format(len(negatives)))

Total length of the data is:         1599999
No. of positve tagged sentences is:  800000
No. of negative tagged sentences is: 799999


In [26]:
import re
import string
import unidecode

def word_count(words):
    return len(words.split())

def remove_punct(s):
    punct = string.punctuation + 'Ã§º¯³|¡¿'
    for char in punct:
        s = s.replace(char, '')
    return s

def remove_accents(accented_string):
    unaccented_string = unidecode.unidecode(accented_string)
    return unaccented_string

def clean(df):
    df['text'] = df['text'].map(lambda x:re.sub(r'\w*@\w*', '', x) ) #remove twitter handles
    df['text'] = df['text'].map(lambda x:re.sub(r'http\S+', '', x) ) #remove urls
    df['text'] = df['text'].map(lambda x:re.sub(r'[0-9]', '', x) ) #remove numbers
    df['text'] = df['text'].map(lambda x:remove_punct(x)) #remove punctuation marks
    df['text'] = df['text'].map(lambda x:remove_accents(x)) #remove accents
    df['text'] = df['text'].map(lambda x: x.lower()) #lower case

In [54]:
clean(df)

In [28]:
nltk.download('stopwords')
stopword = set(stopwords.words('english'))
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vidursinha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vidursinha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vidursinha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [55]:
urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
userPattern = '@[^\s]+'
def process_tweets(tweet):
    #tokenizing words
    tokens = word_tokenize(tweet)
    #Removing Stop Words
    final_tokens = [w for w in tokens if w not in stopword]
    #reducing a word to its word stem 
    wordLemm = WordNetLemmatizer()
    finalwords=[]
    for w in final_tokens:
        if len(w)>1:
            word = wordLemm.lemmatize(w)
            finalwords.append(word)
    return ' '.join(finalwords)

In [56]:
df['text'] = df['text'].apply(lambda x: process_tweets(x))

In [57]:
df.head(10)

Unnamed: 0,polarity,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,awww thats bummer shoulda got david carr third...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset cant update facebook texting might cry r...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,dived many time ball managed save rest go bound
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,whole body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,behaving im mad cant see
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,need hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,hey long time see yes rain bit bit lol im fine...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,nope didnt
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,que muera


In [59]:
X = df['text'].values
y = df['polarity'].values

In [74]:
vector = TfidfVectorizer(sublinear_tf=True)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

AttributeError: lower not found

In [70]:
from sklearn.model_selection import cross_val_score
lr_xval_model = LogisticRegression(random_state=3, solver='liblinear', verbose=True)
lr_xval_scores = cross_val_score(lr_xval_model, X, y, cv=4)
lr_xval_scores.mean()

[LibLinear][LibLinear][LibLinear][LibLinear]

0.7782486124777812

In [65]:
from sklearn.model_selection import cross_val_score
lr_xval_model = LogisticRegression(random_state=3, solver='newton-cg', verbose=True, n_jobs=-1)
lr_xval_scores = cross_val_score(lr_xval_model, X, y, cv=4)
lr_xval_scores.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.1min finished


0.7782486124793437

### Final model to be used on covid dataset

In [71]:
lr_xval_model = LogisticRegression(random_state=3, solver='liblinear', verbose=True)
lr_xval_model.fit(X, y)

[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=3, solver='liblinear', tol=0.0001, verbose=True,
                   warm_start=False)

In [72]:
import pickle

file = open('logisticRegressionModel.pickle','wb')
pickle.dump(lr_xval_model, file)
file.close()

### All testing below was done on a 100,000 samples subset of sentiment 140

In [43]:
rf = RandomForestClassifier(n_estimators = 20, criterion = 'entropy', max_depth=50)
rf_xval_scores = cross_val_score(rf, X, y, cv=10)
rf_xval_scores.mean()

0.7142671957195719

In [44]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
import keras

Using TensorFlow backend.


In [46]:
max_words = 5000
max_len = 200

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df.text)
sequences = tokenizer.texts_to_sequences(df.text)
tweets = pad_sequences(sequences, maxlen=max_len)
print(tweets)

[[   0    0    0 ...    0   91   92]
 [   0    0    0 ...   10 4488 1158]
 [   0    0    0 ...   43 4489  459]
 ...
 [   0    0    0 ...    0   88 3469]
 [   0    0    0 ...  188   19   58]
 [   0    0    0 ... 1121 3287 1385]]


In [48]:
X_train, X_test, y_train, y_test = train_test_split(tweets, df.polarity.values, test_size=0.2, random_state=101)

### Interrupted the training early due to the validation accuracy dropping while the training accuracy was rising (overfitting)
### Still didn't outperform logistic regression trained on the 100,000 samples subset and takes much longer to train

In [50]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint
model2 = Sequential()
model2.add(layers.Embedding(max_words, 128))
model2.add(layers.LSTM(64,dropout=0.5))
model2.add(layers.Dense(16, activation='relu'))
model2.add(layers.Dense(8, activation='relu'))
model2.add(layers.Dense(1,activation='sigmoid'))
model2.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])
checkpoint2 = ModelCheckpoint("rnn_model.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
history = model2.fit(X_train, y_train, epochs=10,validation_data=(X_test, y_test),callbacks=[checkpoint2])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.76385, saving model to rnn_model.hdf5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.76385 to 0.76430, saving model to rnn_model.hdf5
Epoch 3/10

Epoch 00003: val_accuracy improved from 0.76430 to 0.76500, saving model to rnn_model.hdf5
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.76500
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.76500
Epoch 6/10
 100/2500 [>.............................] - ETA: 5:23 - loss: 0.3781 - accuracy: 0.8238

KeyboardInterrupt: 