In [1]:
# Python 3+

# 3rd party imports (not present in the standard python library)
# To install, pip install numpy pandas

import numpy as np
import pandas as pd

# Standard python library imports

import glob

In [2]:
# A large dataset with 1.6 million tweets are being used to train the model
# Due to its size, the file is not included in this repository
# The dataset can be downloaded from https://www.kaggle.com/kazanova/sentiment140

# File in current workspace
glob.glob('*.csv')

['training.1600000.processed.noemoticon.csv']

In [6]:
# Import data

df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding = 'ISO-8859-1', names = ["Score", "Id", "Date", "Flag", "User", "Tweet"])

In [7]:
# First 5 records

df.head()

Unnamed: 0,Score,Id,Date,Flag,User,Tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [8]:
# Last 5 records

df.tail()

Unnamed: 0,Score,Id,Date,Flag,User,Tweet
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [16]:
# To train the model, our primary data points are the tweet and the score assoociated with the score
# Score here is the sentiment where 0 = negative, 4 = positive
# Columns that are not required are removed and the score is normalized to be in the 0 - 1 range

df.drop(["Id", "Date", "Flag", "User"], axis = 1, inplace = True)
df['Score'] = df['Score'].apply(lambda i : i / 4)

In [17]:
df.head()

Unnamed: 0,Score,Tweet
0,0.0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0.0,is upset that he can't update his Facebook by ...
2,0.0,@Kenichan I dived many times for the ball. Man...
3,0.0,my whole body feels itchy and like its on fire
4,0.0,"@nationwideclass no, it's not behaving at all...."


In [21]:
df['Tweet'][0]

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"

In [9]:
## Tweet cleanup (this process takes a significant amount of time)
# Use the df-cleaned.pickle to load a cleaned up dataframe
# Removing stop words, @ mentions, webpages and special characters

from nltk.corpus import stopwords # nltk.download('stopwords') before importing
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def clean(tweet):
    stage1 = [word for word in tweet.lower().split() if word not in stopwords.words('english')] # stopword removal
    stage2 = [word[1:] if word.startswith('#') else word for word in stage1] # Hashtag symbol removal
    stage3 = [stemmer.stem(word) for word in stage2 if not any([word.startswith('@'), word.startswith('http'), word.startswith('www')])] # @ mentions and websites removal and stemming
    return ' '.join(stage3)

In [22]:
%%time

df['TweetStripped'] = df['Tweet'].apply(clean)

CPU times: user 33min 5s, sys: 7min 23s, total: 40min 28s
Wall time: 40min 32s


In [24]:
df.head()

Unnamed: 0,Score,Tweet,TweetStripped
0,0.0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that' bummer. shoulda got david carr t..."
1,0.0,is upset that he can't update his Facebook by ...,upset can't updat facebook text it... might cr...
2,0.0,@Kenichan I dived many times for the ball. Man...,dive mani time ball. manag save 50% rest go bound
3,0.0,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0.0,"@nationwideclass no, it's not behaving at all....","no, behav all. i'm mad. here? can't see there."


In [1]:
# Import dataframe from pickle

import pickle

with open('Pickled data/df-cleaned-final.pickle', 'rb') as f:
    df = pickle.load(f)

In [3]:
df.head()

Unnamed: 0,Score,Tweet,TweetStripped
0,0.0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that' bummer. shoulda got david carr t..."
1,0.0,is upset that he can't update his Facebook by ...,upset can't updat facebook text it... might cr...
2,0.0,@Kenichan I dived many times for the ball. Man...,dive mani time ball. manag save 50% rest go bound
3,0.0,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0.0,"@nationwideclass no, it's not behaving at all....","no, behav all. i'm mad. here? can't see there."


In [3]:
## Analysis

from sklearn.feature_extraction.text import TfidfVectorizer # Perfoms the TF-IDF
from sklearn.model_selection import train_test_split # Used to split the data into training and testing

# Data is split in the ratio of 0.9 (train) : 0.1 (test)
train_x, test_x, train_y, test_y = train_test_split(df['TweetStripped'], df['Score'], test_size = 0.1, shuffle = True)

# To compare the accuracy when the raw tweet is used to train the model, the original data is split as well
train_x2, test_x2, train_y2, test_y2 = train_test_split(df['Tweet'], df['Score'], test_size = 0.1, shuffle = True)


In [4]:
# Initialize and fit the TfTfidfVectorizer
vector = TfidfVectorizer(max_features = 10000, ngram_range = (1,2), stop_words='english')
%time vector.fit(train_x)

CPU times: user 36.5 s, sys: 816 ms, total: 37.3 s
Wall time: 36.6 s


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=10000,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [5]:
# Transform the data to pass it into various classifiers
train_x_transformed = vector.transform(train_x)

In [6]:
# The data will be trained on several models to find the one with the highest accuracy

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

models = []
models.append(('LR', LogisticRegression()))
models.append(('NB', MultinomialNB()))

### Models below for this dataset take significantly longer
#models.append(('LDA', LinearDiscriminantAnalysis()))
#models.append(('KNN', KNeighborsClassifier()))
#models.append(('CART', DecisionTreeClassifier()))
#models.append(('SVM', SVC()))

In [31]:
%%time

# Train the models

results = dict()
scoring = 'accuracy'

for name, model in models:
    kfold = model_selection.KFold(n_splits = 10, random_state = 9)
    cv_results = model_selection.cross_val_score(model, train_x_transformed, train_y, cv = kfold, scoring = 'accuracy', n_jobs = -1, verbose = 1)
    results[name] = cv_results
    print('{}: Average: {}, std: {}'.format(name, cv_results.mean(), cv_results.std()))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   39.8s finished


LR: Average: 0.7687840277777778, std: 0.0010597752371332022


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


NB: Average: 0.7535305555555556, std: 0.001792321155222122
CPU times: user 1.96 s, sys: 1.02 s, total: 2.98 s
Wall time: 43.5 s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.2s finished


In [7]:
from sklearn.neural_network import MLPClassifier

In [6]:
# Unpacking trained models

with open('Pickled data/nn.pickle', 'rb') as f3:
    NN = pickle.load(f3)

with open('Pickled data/LR.pickle', 'rb') as f3:
    LR = pickle.load(f3)

with open('Pickled data/naive-bayes.pickle', 'rb') as f3:
    NB = pickle.load(f3)

In [20]:
### Models to train
# Neural Network (Single layer with 100 units)
# Logistic Regression
# Multinomial Naive-Bayes

In [10]:
## Neural Network 
# (Note - training is suspended after seeing diminishing gain at around the 43rd iteration)

NN = MLPClassifier(verbose=2)
NN.fit(train_x_transformed, train_y)

Iteration 1, loss = 0.48831997
Iteration 2, loss = 0.47386068
Iteration 3, loss = 0.46782249
Iteration 4, loss = 0.46039490
Iteration 5, loss = 0.45133318
Iteration 6, loss = 0.44118853
Iteration 7, loss = 0.43009612
Iteration 8, loss = 0.41738729
Iteration 9, loss = 0.40312573
Iteration 10, loss = 0.38755965
Iteration 11, loss = 0.37132863
Iteration 12, loss = 0.35526305
Iteration 13, loss = 0.34024330
Iteration 14, loss = 0.32619042
Iteration 15, loss = 0.31359916
Iteration 16, loss = 0.30232044
Iteration 17, loss = 0.29245417
Iteration 18, loss = 0.28393783
Iteration 19, loss = 0.27643297
Iteration 20, loss = 0.26972821
Iteration 21, loss = 0.26398571
Iteration 22, loss = 0.25880516
Iteration 23, loss = 0.25422614
Iteration 24, loss = 0.25004071
Iteration 25, loss = 0.24643164
Iteration 26, loss = 0.24311043
Iteration 27, loss = 0.24015910
Iteration 28, loss = 0.23728538
Iteration 29, loss = 0.23484182
Iteration 30, loss = 0.23259847
Iteration 31, loss = 0.23025172
Iteration 32, los



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=2, warm_start=False)

In [8]:
# Logistic Regression

LR = LogisticRegression()
LR.fit(train_x_transformed, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
# Multinomial Naive-Bayes

NB = MultinomialNB()
NB.fit(train_x_transformed, train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
### Predictions from various models

predNN = NN.predict(vector.transform(test_x))
predLR = LR.predict(vector.transform(test_x))
predNB = NB.predict(vector.transform(test_x))

In [12]:
# Calculate accuracy and confusion matrix

from sklearn.metrics import confusion_matrix, accuracy_score

In [16]:
for model, prediction in zip(['Neural Network', 'Logistic Regression', 'Naive Bayes'], [predNN, predLR, predNB]):
    print('Model: {}'.format(model))
    print('Accuracy - {}'.format(accuracy_score(test_y, prediction)))
    print('Confusion matrix - {}\n'.format(confusion_matrix(test_y, prediction)))

Model: Neural Network
Accuracy - 0.5812875
Confusion matrix - [[46587 33319]
 [33675 46419]]

Model: Logistic Regression
Accuracy - 0.77554375
Confusion matrix - [[59887 20019]
 [15894 64200]]

Model: Naive Bayes
Accuracy - 0.76004375
Confusion matrix - [[59788 20118]
 [18275 61819]]



In [7]:
## Function to test a tweet, defaults to LR due to its higher accuracy

def predict(tweet, model = LR):
    return model.predict(vector.transform([clean(tweet)]))

In [10]:
# 1: positive, 0: negative

print('NN: {}'.format(predict('I love math!', model = NN)))
print('LR: {}'.format(predict('I love math!', model = LR)))
print('NB: {}'.format(predict('I love math!', model = NB)))

NN: [1.]
LR: [1.]
NB: [1.]


In [55]:
## Shelve the model, vector and predict objects

import shelve

with shelve.open('shelve.model', 'c') as shelf:
    shelf['model'] = LR
    shelf['vector'] = vector

### Model 4 (Experimental)

In [2]:
## Using a BERT model to train
# Experimenting with BERT, training took over 8 hours on a GPU boosted system. 
# pip install BertLibrary

from BertLibrary import BertFTModel





In [30]:
# Download and unzip a pre-trained BERT model with 12-layer, 768-hidden, 12-heads, 110M parameters

!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip

--2019-11-11 19:43:11--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.8.176
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.8.176|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 407727028 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip’


2019-11-11 19:45:22 (2.97 MB/s) - ‘uncased_L-12_H-768_A-12.zip’ saved [407727028/407727028]

Archive:  uncased_L-12_H-768_A-12.zip
   creating: uncased_L-12_H-768_A-12/
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: uncased_L-12_H-768_A-12/vocab.txt  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: uncased_L-12_H-768_A-12/bert_config.json  


In [27]:
df_train, df_test = train_test_split(df.drop('Tweet', axis=1), test_size = 0.2, shuffle = True)
df_dev, df_dev2 = train_test_split(df_test, test_size=0.9, shuffle=True)

In [28]:
df_dev.head()

Unnamed: 0,Score,TweetStripped
442917,0.0,miss church today... need prayer &amp; encoura...
239920,0.0,sun shine i'm tire enjoy
911843,1.0,long weekend. thank god sleep monday
316239,0.0,enjoy time off...back work
520169,0.0,actual spot leed


In [29]:
# Create the dataset for the BERT model

!mkdir dataset

df_train.reset_index(drop=True).to_csv('dataset/train.csv', sep='\t', index=None, header=None)
df_test.to_csv('dataset/test.csv', sep='\t', index=None, header=None)
df_dev.to_csv('dataset/dev.csv', sep='\t', index=None, header=None)

In [4]:
bert_model = BertFTModel(model_dir='uncased_L-12_H-768_A-12',
                        ckpt_name='bert_model.ckpt',
                        labels = ['0', '1'],
                        ckpt_output_dir='output',
                        num_train_steps=20000,
                        num_warmup_steps=500,
                        save_check_steps=500,
                        do_lower_case=False,
                        max_seq_len=50,
                        batch_size=32)

bert_trainer = bert_model.get_trainer()
bert_evaluator = bert_model.get_evaluator()



INFO:tensorflow:Using config: {'_model_dir': 'output', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': device_count {
  key: "GPU"
  value: 1
}
gpu_options {
  per_process_gpu_memory_fraction: 0.5
  allow_growth: true
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe960ea1ef0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [56]:
TRAIN_SIZE = 0.75
VAL_SIZE = 0.05
dataset_count = len(df)

df_train_val, df_test = train_test_split(temp, test_size=1-TRAIN_SIZE-VAL_SIZE, random_state=42)
df_train, df_val = train_test_split(df_train_val, test_size=VAL_SIZE / (VAL_SIZE + TRAIN_SIZE), random_state=42)

In [53]:
temp = df.drop('Tweet', axis=1)

In [54]:
temp['Score'] = temp['Score'].apply(lambda x : int(x))

In [55]:
temp.head()

Unnamed: 0,Score,TweetStripped
0,0,"- awww, that' bummer. shoulda got david carr t..."
1,0,upset can't updat facebook text it... might cr...
2,0,dive mani time ball. manag save 50% rest go bound
3,0,whole bodi feel itchi like fire
4,0,"no, behav all. i'm mad. here? can't see there."


In [57]:
!mkdir dataset

df_train.sample(frac=1.0).reset_index(drop=True).to_csv('dataset/train.tsv', sep='\t', index=None, header=None)
df_val.to_csv('dataset/dev.tsv', sep='\t', index=None, header=None)
df_test.to_csv('dataset/test.tsv', sep='\t', index=None, header=None)

In [59]:
#bert_trainer.train_from_file('dataset', 35000)

In [60]:
# ft_evaluator.evaluate_from_file('dataset', checkpoint="output/model.ckpt-35000") 

In [35]:
a = predict('Ad')

In [38]:
a[0] == 1

True