This project was heavily inspired by work by XiaoFan LEI on Medium

# Split Train / Test

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split

# create input and output directories
import os
inputpath = 'input'
outputpath = 'output'
if os.path.exists(inputpath) is False:
    os.mkdir(inputpath)
if os.path.exists(outputpath) is False:
    os.mkdir(outputpath)

#input file path
sentiment140_file = 'input/training.1600000.processed.noemoticon.csv'

# read csv
colnames = ['polarity', 'id', 'date', 'query', 'user', 'tweet']
df_tweets = pd.read_csv(sentiment140_file, encoding='UTF', names=colnames, encoding_errors='ignore')

# get 1600 tweets
df = df_tweets[['polarity','tweet']].sample(n=1600, random_state=0)
df.to_csv("output/selected_tweets.csv", index=False)

# ---------

# X is the list of tweets
x = df.tweet.values
# Y is their polarity
y = df.polarity.replace(4, 1) # Positive is 1, 0 is negative

# split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

print('x train size', len(x_train), 'y train size', len(y_train))
print('test size', len(x_test))

x train size 1120 y train size 1120
test size 480


# Text Cleaning

In [4]:
import re
import nltk
from autocorrect import Speller
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

spell = Speller(lang='en')
# nltk.download('punkt')
# nltk.download('wordnet')
lemm = WordNetLemmatizer()

# Fixing Word Lengthening: hiiiiii -> hii; helllllooo -> hellloo
def reduce_length(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

def text_preprocess(doc):
    #Lowercasing all the letters
    temp = doc.lower()
    #Removing hashtags and mentions
    temp = re.sub("@[A-Za-z0-9_]+","", temp)
    temp = re.sub("#[A-Za-z0-9_]+","", temp)
    #Removing links
    temp = re.sub(r"http\S+", "", temp) # \S stops at whitespace
    temp = re.sub(r"www.\S+", "", temp) # \S stops at whitespace
    #removing numbers
    temp = re.sub("[0-9]","", temp)
    #Removing '
    temp = re.sub("'"," ",temp)

    #Tokenization
    temp = word_tokenize(temp)
    #Fixing Word Lengthening
    temp = [reduce_length(w) for w in temp]
    #spell corrector
    temp = [spell(w) for w in temp]
    #stem
    temp = [lemm.lemmatize(w) for w in temp]
    #Removing short words
    temp = [w for w in temp if len(w)>2]
    temp = " ".join(w for w in temp) # back to a string
    
    return temp

# Naive Bayes

In [5]:
from nltk import FreqDist
from nltk.classify import NaiveBayesClassifier, accuracy

# build the dataset (list of tuples)
# dataset = [(['this', 'is', 'a', 'tweet'], 0), (...), ... ]
def build_dataset(x, y):
    words = [text_preprocess(word).split(" ") for word in x]
    dataset = list(zip(words, y)) # list of tuples
    
    return dataset

all_words = FreqDist(sum([w.split(" ") for w in x_train],[]))
word_features = list(all_words)[:2000] # list of the 2000 most common words

# Says whether or not the most commonly used words are within the given list of words
# So the dimension of the feature vector is as long as 2000 most common words
# All features are false if none of the most common words are in that tweet
def document_features(words):
    features = {}
    for word in word_features:
        features[word] = (word in set(words))

    return features

trainset = build_dataset(x_train, y_train)
testset = build_dataset(x_test, y_test)
train_set = [(document_features(d), y) for (d,y) in trainset]
test_set = [(document_features(d), y) for (d,y) in testset]


nb_classifier = nltk.NaiveBayesClassifier.train(train_set)

# Test the classifier
print("accuracy score on test set:", accuracy(nb_classifier, test_set))

accuracy score on test set: 0.6895833333333333


## Naive Bayes Accuracy: ~69%
Decent.

In [6]:
nb_classifier.show_most_informative_features()

Most Informative Features
                    look = True                1 : 0      =      6.5 : 1.0
                    miss = True                0 : 1      =      6.2 : 1.0
                   those = True                1 : 0      =      5.9 : 1.0
                  thanks = True                1 : 0      =      5.4 : 1.0
                    dont = True                0 : 1      =      5.2 : 1.0
                    wish = True                0 : 1      =      5.2 : 1.0
                  follow = True                1 : 0      =      5.2 : 1.0
                   later = True                1 : 0      =      5.2 : 1.0
                   lunch = True                1 : 0      =      5.2 : 1.0
                     yes = True                1 : 0      =      5.2 : 1.0


Let's take a look at trying our classifier on a brand new sentence

In [7]:
test_sentence = "This is the best band I've ever heard!"
test_sent_features = {word: (word in text_preprocess(test_sentence).split(" ")) for word in word_features}
print('feature length is:', len(test_sent_features))

feature length is: 2000


In [8]:
nb_classifier.classify(test_sent_features)

1

Good! It looks like our classifier correctly identifies the sentence as positive. Now, let's see if we can do better...

# RNN Classifier: LSTM

In [9]:
import numpy as np
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Dense, Dropout, Bidirectional, LSTM

#################data preparation
cleaned_train = [text_preprocess(d) for d in x_train]
#building a dictionary
tk = Tokenizer(num_words=None) #the maximum number of words to keep, based on word frequency
tk.fit_on_texts(cleaned_train)
#1.1 get the size of the dictionary
dico_size = len(tk.word_counts.items()) # This gets the number of unique words in our training set
print('dictionary size:', dico_size)
num_tokens = dico_size + 1
#2. building sequneces
seq_X = tk.texts_to_sequences(cleaned_train) # This just converts each word to a single number that corresponds to a dictionary index in the Tokenizer
print('text_to_sequences:', seq_X[:2])
#2.1 calculate maxi length of tweets
max_len = np.max(np.array([len(d) for d in seq_X]))
marg_len=10
print(f'longest tweet contains {max_len} words')
maxlen = max_len + marg_len
print(f'pad_sequences will produce an array that is ({len(seq_X)} x {maxlen})')
#3. padding the sequences
Xtrain = pad_sequences(seq_X,maxlen=maxlen,padding='post') # padding='post' means padding goes after the tweet is over
print(f'Xtrain is {len(Xtrain)} x {len(Xtrain[0])}')

dictionary size: 2653
text_to_sequences: [[685, 45, 5, 6, 686, 76, 337, 31, 165, 1069, 1070], [687, 295, 19, 12, 1071, 41, 77, 256, 179, 504, 17]]
longest tweet contains 27 words
pad_sequences will produce an array that is (1120 x 37)
Xtrain is 1120 x 37


Now we make a function that maps each word to a vector (np array) of it's embedding (can think of it as coordinates of how similar the words are).

This one is pre-trained from wiki-news. So we just load it. It returns a dictionary.


EX: embedding_model['my_word'] -> [123, 13, 3 , 23, 1, ... , 123]

In [10]:
###########loading glove and fasttext embedding vectors
def load_embedding_model(file):
    embedding_model = {}
    with open(file,'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            embedding_model[word] = embedding
    return embedding_model

embedding_index_fasttext = load_embedding_model('input/wiki-news-300d-1M.vec')
print('found %s word vectors in loaded fasttext model.' % len(embedding_index_fasttext))

found 999995 word vectors in loaded fasttext model.


In [11]:
##########Preparing a corresponding embedding matrix
def embedding_matrix(num_tokens,embedding_dim,embedding_index):
    hits=0
    misses=[]

    # Prepare embedding matrix
    embedding_matrix = np.zeros((num_tokens, embedding_dim)) # Each word in train set will be mapped to a vector of dimension 300

    for word, i in tk.word_index.items(): # Going through all the words and their indecies in the training set
        embedding_vector = embedding_index.get(word) # Get the word vector of that SAME WORD in the wiki embeddings
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            # This includes the representation for "padding" and "OOV"
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses.append(word) # If the word in the tweet is not in the wiki dictionary add it to a list and it will not be included in the model
    print("Converted %d words (%d misses)" % (hits, len(misses)))
    print("words not included in pretrained model:",misses)
    return embedding_matrix

embedding_dim=300
embedding_matrix_fasttext = embedding_matrix(num_tokens,embedding_dim,embedding_index_fasttext)

Converted 2579 words (74 misses)
words not included in pretrained model: ['loveyouu', 'lmaz', 'gottabesomebody', 'auctionsniper', 'funcionou', 'ncis', 'spartak', 'twitterberry', 'copiedandpasted', 'omgosh', 'catal', 'cookiedough', 'tagaytay', 'arrgghh', 'krystinas', 'kutnerr', 'camila', 'obnoxciously', 'rushmore', 'honeytint', 'cahntilli', 'fuckingtastic', 'allyssas', 'huhuhu', '…needed', 'bassotti', 'citrixcloud', 'wolfmother', 'kirstie', 'cheol', 'vilmarie', 'xaviermedia', 'heartburny', 'pnas', 'doliviawilder', 'techhelp', 'grimshaw', 'gyokoro', 'reblipping', 'followfriday', 'mbb', 'organization�', 'twitpics', 'lastlatter', 'pawngame', 'arangurens', 'konstantino', 'goodmorning', 'beeteedubs', 'pahonorsocietyst', 'farewellness', 'bellarlly', 'saymyspacetwitters', 'gottwitter', 'wwd', 'fianc�', 'triginometery', 'atikah', 'ilovejb', 'lismore', 'bodypump', 'groundctrl', 'lemmiin', 'tysonritteraar', 'anacecii', 'btnreply', 'wesseltof', 'maryanne', 'maltesers', 'elyshia', 'muhhwahh', 'haah

## Building the LSTM NN
- Embedding layer
- Dropout
- Bidirectional LSTM
- 64 Dense ReLU
- 16 Dense ReLU
- Sigmoid

In [12]:
##########Building the model with the embedding layer non trainable
embedding_layer_fasttext = Embedding( # Turns positive integers (indexes) into dense vectors of fixed size.
    input_dim=num_tokens, # Num words in train set
    output_dim=embedding_dim, # Each word represented by a 300 dim vector
    input_length=maxlen, # the length of input sequences. it takes each word for each input at the same time as one sequence
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix_fasttext), # embedding layer is set to pre-trained embeddings (?)
    trainable=False, # word embeddings are pre-trained so no need to adjust them
)
lstm_model = Sequential() # Makes a new sequential model with one input tensor and one output tensor(LSTM)
# add embedding lyaer
lstm_model.add(embedding_layer_fasttext)
lstm_model.add(Dropout(0.5))
#LSTM
lstm_model.add(Bidirectional(LSTM(8,dropout=0.5,recurrent_dropout=0.2)))
# add a vanilla hidden layer:
lstm_model.add(Dense(64, activation='relu'))
lstm_model.add(Dense(16, activation='relu'))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(units=1, activation='sigmoid',name='predictions'))

#compiling the model
lstm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-3, epsilon=1e-08, clipnorm=1.0), 
              loss="binary_crossentropy",
              metrics=['accuracy'])

print(lstm_model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 37, 300)           796200    
                                                                 
 dropout_38 (Dropout)        (None, 37, 300)           0         
                                                                 
 bidirectional (Bidirection  (None, 16)                19776     
 al)                                                             
                                                                 
 dense (Dense)               (None, 64)                1088      
                                                                 
 dense_1 (Dense)             (None, 16)                1040      
                                                                 
 dropout_39 (Dropout)        (None, 16)                0         
                                                        

## Training

In [13]:
###########Trainning and evoluating the model
parameterization = {
    'batch_size': 32,
    # Add other parameters as needed
}
NUM_EPOCHS = 15

history = lstm_model.fit(Xtrain, y_train, 
                            batch_size=parameterization.get('batch_size'), 
                            epochs=NUM_EPOCHS,
                             validation_split=0.2, verbose=2)
#test
cleaned_test = [text_preprocess(d) for d in x_test]
Qtest = tk.texts_to_sequences(cleaned_test)
Ptest = pad_sequences(Qtest,maxlen=maxlen,padding='post' )
print("LSTM model evaluation with fasttext 300d embedding:")
print(lstm_model.evaluate(Ptest, y_test))

Epoch 1/15


28/28 - 5s - loss: 0.6915 - accuracy: 0.5078 - val_loss: 0.6915 - val_accuracy: 0.5000 - 5s/epoch - 166ms/step
Epoch 2/15
28/28 - 1s - loss: 0.6887 - accuracy: 0.5391 - val_loss: 0.6717 - val_accuracy: 0.6161 - 677ms/epoch - 24ms/step
Epoch 3/15
28/28 - 1s - loss: 0.6767 - accuracy: 0.5625 - val_loss: 0.6534 - val_accuracy: 0.6161 - 663ms/epoch - 24ms/step
Epoch 4/15
28/28 - 1s - loss: 0.6279 - accuracy: 0.6719 - val_loss: 0.6035 - val_accuracy: 0.6562 - 616ms/epoch - 22ms/step
Epoch 5/15
28/28 - 1s - loss: 0.6108 - accuracy: 0.6853 - val_loss: 0.6480 - val_accuracy: 0.6518 - 631ms/epoch - 23ms/step
Epoch 6/15
28/28 - 1s - loss: 0.6345 - accuracy: 0.6395 - val_loss: 0.6076 - val_accuracy: 0.6473 - 668ms/epoch - 24ms/step
Epoch 7/15
28/28 - 1s - loss: 0.6139 - accuracy: 0.6663 - val_loss: 0.5928 - val_accuracy: 0.6920 - 665ms/epoch - 24ms/step
Epoch 8/15
28/28 - 1s - loss: 0.6081 - accuracy: 0.6685 - val_loss: 0.5913 - val_accuracy: 0.6964 - 641ms/epoch - 23ms/step
Epoch 9/

After 15 epochs, it looks like we are achieving about the same accuracy as out naive bayes classifier. Next, we'll use BERT to perform better.

# Using BERT for Sentiment Analysis

In [10]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
# Hide GPU from visible devices
tf.config.set_visible_devices([], 'GPU')
from transformers import BertTokenizer, TFBertForSequenceClassification
#load the model
bert_model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
#################text cleaning
def preprocess(X):
    import re
    def text_clean(text):
        temp = text.lower()
        temp = re.sub("@[A-Za-z0-9_]+","", temp)
        temp = re.sub("#[A-Za-z0-9_]+","", temp)
        temp = re.sub(r"http\S+", "", temp)
        temp = re.sub(r"www.\S+", "", temp)
        temp = re.sub("[0-9]","", temp)
        return temp
    X_cleaned = [text_clean(text) for text in X]
    return X_cleaned

############transforming raw data to an appropriate format ready to feed into the BERT model
def convert_example_to_feature(text):
    return bert_tokenizer.encode_plus(text,
            add_special_tokens = True, # add [CLS], [SEP]
            max_length = 128, # max length of the text that can go to BERT
            pad_to_max_length = True, # add [PAD] tokens
            return_attention_mask = True, # add attention mask to not focus on pad tokens
          )

def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    return {
      "input_ids": input_ids,
      "token_type_ids": token_type_ids,
      "attention_mask": attention_masks,
    }, label

def encode_examples(X,y):
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    for text, label in zip(X, y):
        bert_input = convert_example_to_feature(text)
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])
    return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

## Encoding Train, Validation, and Test for BERT

Not really sure why, but the length of y_train changed from the beginning of file, so re run dataset code.

In [17]:
#input file path
sentiment140_file = 'input/training.1600000.processed.noemoticon.csv'

# read csv
colnames = ['polarity', 'id', 'date', 'query', 'user', 'tweet']
df_tweets = pd.read_csv(sentiment140_file, encoding='UTF', names=colnames, encoding_errors='ignore')

# get 1600 tweets
df = df_tweets[['polarity','tweet']].sample(n=3000, random_state=0)
df.to_csv("output/selected_tweets.csv", index=False)

# ---------

# X is the list of tweets
x = df.tweet.values
# Y is their polarity
y = df.polarity.replace(4, 1) # Positive is 1, 0 is negative

# split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

print('x train size', len(x_train), 'y train size', len(y_train))
print('test size', len(x_test))

x train size 2100 y train size 2100
test size 900


In [18]:
# train dataset
# Splitting train set into train and validation to help with training
X_train, X_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size=0.2, random_state=0)
ds_train_encoded = encode_examples(preprocess(X_train), y_train).shuffle(100).batch(32).repeat(2)
ds_val_encoded = encode_examples(preprocess(X_validation), y_validation).batch(32)
# test dataset
ds_test_encoded = encode_examples(preprocess(x_test), y_test).batch(32)



In [19]:
######### compiling the model
learning_rate = 3e-5
# choosing Adam optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
# we do not have one-hot vectors, we can use sparce categorical cross entropy and accuracy
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
bert_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [20]:
#############training and evaluating
bert_model.fit(ds_train_encoded, epochs=4, validation_data=ds_val_encoded)

loss, acc = bert_model.evaluate(ds_test_encoded, verbose=0)
print("accuracy: {:5.2f}%".format(100 * acc))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
accuracy: 83.78%


In [22]:
##################Saving the model
bert_model.save_pretrained("outputs/bert_model", saved_model=True)

INFO:tensorflow:Assets written to: outputs/bert_model\saved_model\1\assets


INFO:tensorflow:Assets written to: outputs/bert_model\saved_model\1\assets
