Borrowed Heavily from https://www.kaggle.com/nitin194/twitter-sentiment-analysis-word2vec-doc2vec

In [None]:
import re
import nltk
import string
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gensim
from nltk.stem.porter import *


%matplotlib inline

First off let's access some of the data files for this task

In [None]:
# Column names are supplied explicitly through ``names=`` when reading the
# tab-separated files.
names = ['TweetID', 'Sentiment', 'Tweet']

train = pd.read_csv(
    '../Datasets/dataset/train/twitter-2016train-A.txt',
    sep='\t',
    names=names,
)
# NOTE(review): the test file is read from the train/ directory and is given an
# extra 'tidy_tweet' column name - confirm both are intentional.
test = pd.read_csv(
    '../Datasets/dataset/train/twitter-2016test-A.txt',
    sep='\t',
    names=names + ['tidy_tweet'],
)

Now let's examine what the content looks like for positive, negative, and neutral tweets

In [None]:
# First few training tweets labelled 'positive'.
is_positive = train['Sentiment'] == 'positive'
train.loc[is_positive].head()

In [None]:
# First few training tweets labelled 'negative'.
is_negative = train['Sentiment'] == 'negative'
train.loc[is_negative].head()

In [None]:
# First few training tweets labelled 'neutral'.
is_neutral = train['Sentiment'] == 'neutral'
train.loc[is_neutral].head()

Basic statistics on data

In [None]:
# Class balance: number of training tweets per sentiment label.
train.Sentiment.value_counts()

In [None]:
# Compare the tweet-length distributions of the two splits. Both histograms
# share the same bins and are drawn semi-transparent so the second one does
# not hide the first.
fig, ax = plt.subplots()
hist_kwargs = dict(bins=14, range=(0, 140), alpha=0.5)
ax.hist(train['Tweet'].str.len(), label='train', **hist_kwargs)
ax.hist(test['Tweet'].str.len(), label='test', **hist_kwargs)
ax.set(
    title='Tweet length by split',
    xlabel='Characters per tweet',
    ylabel='Number of tweets',
)
ax.legend()
plt.show()

So now we want to do some data cleaning to get rid of unwanted stuff not relevant to sentiment classification

In [None]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Args:
        input_txt: the string to clean.
        pattern: a regular expression; each non-overlapping match is deleted.

    Returns:
        The cleaned string (unchanged when nothing matches).
    """
    # Substitute with the pattern itself. Re-using each matched *text* as a new
    # regex (as a findall + sub loop does) breaks when a match contains regex
    # metacharacters and leaves residue when one match is a prefix of another
    # (e.g. "@ab" and "@abc").
    return re.sub(pattern, '', input_txt)

We want to clean several things out of the raw tweets:
1. Remove Twitter handles, due to privacy concerns
2. Remove punctuation, numbers, and special characters
3. Remove short words
4. Normalize the text (stemming) so that different forms of a word share the same base

1. Removing Twitter Handles

In [None]:
# Strip @handles from every tweet. The pattern is a raw string so that ``\w``
# reaches the regex engine as written instead of being parsed as an invalid
# string escape (which triggers a warning on recent Python versions).
train['tidy_tweet'] = np.vectorize(remove_pattern)(train['Tweet'], r"@[\w]*")
train.head()

2. Removing Punctuations, Numbers, and Special Characters

In [None]:
# Replace everything except ASCII letters and '#' with a space.
# ``regex=True`` is required: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the tweets unchanged.
train['tidy_tweet'] = train['tidy_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)
train.head()

3. Removing Short Words

In [None]:
# Drop every word of three characters or fewer; the surviving words are
# re-joined with single spaces.
train['tidy_tweet'] = train['tidy_tweet'].map(
    lambda text: ' '.join(word for word in text.split() if len(word) > 3)
)
train.head()

4. Text Normalization

In [None]:
# Split each cleaned tweet on whitespace into a list of tokens.
tokenized_tweet = train['tidy_tweet'].map(str.split)
tokenized_tweet.head()

In [None]:
# Reduce every token to its Porter stem, tweet by tweet.
stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.map(
    lambda tokens: list(map(stemmer.stem, tokens))
)
tokenized_tweet.head()

In [None]:
# Re-join the stemmed tokens into one string per tweet. The column is built in
# a single pass instead of overwriting ``tokenized_tweet`` element by element:
# that keeps this cell safe to re-run (re-joining already-joined strings would
# insert a space between every character) and independent of the index labels.
train['tidy_tweet'] = tokenized_tweet.apply(' '.join)
train.head()

For convenience, I'll wrap all of the cleaning steps above in a single function

In [None]:
# Look at the test split before it is cleaned.
test.head(n=5)

In [None]:
# NOTE(review): this cell re-defines ``remove_pattern``, which an earlier cell
# also defines; keep the two definitions in sync or drop one of them.
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Args:
        input_txt: the string to clean.
        pattern: a regular expression; each non-overlapping match is deleted.

    Returns:
        The cleaned string (unchanged when nothing matches).
    """
    # Substitute with the pattern itself. Re-using each matched *text* as a new
    # regex breaks when a match contains regex metacharacters and leaves
    # residue when one match is a prefix of another (e.g. "@ab" and "@abc").
    return re.sub(pattern, '', input_txt)

def tidy_tweet(dataset, min_len=3):
    """Clean the ``Tweet`` column of ``dataset`` into a ``tidy_tweet`` column.

    The steps are: strip @handles, replace every character other than ASCII
    letters and '#' with a space, drop short words, Porter-stem what is left,
    and join the stems with single spaces.

    Args:
        dataset: DataFrame with a ``Tweet`` column of strings. It is modified
            in place (the ``tidy_tweet`` column is added or overwritten).
        min_len: words with ``len(word) <= min_len`` are discarded, so the
            default of 3 keeps only words of four or more characters.

    Returns:
        The same ``dataset`` object that was passed in.
    """
    stemmer = PorterStemmer()

    def _filter_and_stem(text):
        # Filtering before stemming matches the step-by-step pipeline: the
        # length test is applied to the unstemmed word.
        return ' '.join(
            stemmer.stem(word) for word in text.split() if len(word) > min_len
        )

    # Series.apply (rather than np.vectorize) also works on an empty frame.
    # The handle pattern is a raw string so ``\w`` is not read as a string escape.
    without_handles = dataset['Tweet'].apply(remove_pattern, args=(r"@[\w]*",))
    # regex=True is required: pandas >= 2.0 replaces literally by default.
    letters_only = without_handles.str.replace("[^a-zA-Z#]", " ", regex=True)
    # Building the column with apply keeps the result aligned with the frame's
    # own index, whatever its labels are.
    dataset['tidy_tweet'] = letters_only.apply(_filter_and_stem)
    return dataset

# Run the whole cleaning pipeline on the test split; the frame is updated in
# place and also returned.
test = tidy_tweet(dataset=test)
test.head()

The next step is to train Word2Vec on our corpus

In [None]:
%%time

tokenized_tweet = train['tidy_tweet'].apply(lambda x: x.split())

model_w2v = gensim.models.Word2Vec(
            tokenized_tweet,
            size = 200, #desired number of features, 200 seems to be a common width, no idea why
            window = 5, #context window size
            min_count =2, #ignores all words with total freq lower than 2
            sg = 1, #encoding for skip-gram model
            hs = 0,
            negative = 10, #for negative sampling
            workers = 2, #no. of cores
            seed = 34
)

model_w2v.train(tokenized_tweet, total_examples = len(train['tidy_tweet']), epochs=20)

In [None]:
# Nearest neighbours of "trump" in the learned embedding space.
model_w2v.wv.most_similar(positive="trump", topn=10)

The dimensionality of our embedding space is 200, that means that each word is represented by a vector of 200 floating point numbers, isn't that convenient :).

But in order to make this dataset directly applicable to our neural networks we will need to produce a feature vector for all of the tweets.

Now, since we drop every word of 3 characters or fewer, each remaining word is at least 4 characters long and, together with the space that follows it, takes up at least 5 characters. A 140-character tweet can therefore hold at most 140/5 = 28 such words; because the last word needs no trailing space (there are only n-1 spaces), we add one to be safe and make the vector length 29.

In [None]:
# Look the word up through ``.wv``: indexing the model object directly is
# deprecated in gensim 3.x and no longer supported in gensim 4.
model_w2v.wv['trump']

In [None]:
# Length of one word vector, read through ``.wv`` because indexing the model
# object directly is deprecated in gensim 3.x and unsupported in gensim 4.
len(model_w2v.wv['trump'])

Here we need to define a function that we can map onto our Tweets, the only reason being we'll get an error if the model can't find the word embedding. My solution to this is just to return a 0 vector if the word doesn't exist in our corpus.

In [None]:
def apply_model(model, token, dim=200):
    """Look up the embedding of ``token``, falling back to a zero vector.

    Args:
        model: a trained Word2Vec-style model that exposes its vectors as
            ``model.wv``, or any mapping from token to vector.
        token: the word to look up.
        dim: length of the zero vector returned for unknown tokens.

    Returns:
        The stored vector for ``token``, or ``np.zeros(dim)`` when the lookup
        raises ``KeyError``.
    """
    # Index the keyed vectors rather than the model object itself: direct
    # ``model[token]`` is deprecated in gensim 3.x and unsupported in gensim 4.
    # Objects without a ``wv`` attribute are indexed as they are.
    vectors = getattr(model, 'wv', model)
    try:
        return vectors[token]
    except KeyError:
        return np.zeros(dim)
    
apply_model(model_w2v, 'schmup') # Sanity check: 'schmup' is not expected to be in the vocabulary, so this should show the zero-vector fallback
        

In [None]:
MAX_TOKENS = 29   # number of rows every tweet is padded (or cut) to
EMBED_DIM = 200   # width of one word vector; must match the Word2Vec model

# Turn each tweet into a fixed-size (MAX_TOKENS, EMBED_DIM) matrix: one row per
# token, remaining rows left as zeros.
inputs = []
for tokens in tokenized_tweet:
    feature_map = np.zeros((MAX_TOKENS, EMBED_DIM))
    # Filling row by row handles the two cases that break a block assignment:
    # a tweet with no tokens stays all zeros, and a tweet with more than
    # MAX_TOKENS tokens is truncated instead of raising a shape error.
    for row, word in enumerate(tokens[:MAX_TOKENS]):
        feature_map[row] = apply_model(model_w2v, word, dim=EMBED_DIM)
    inputs.append(feature_map)

inputs[5]

In [None]:
# Tokens of the tweet with index label 0.
print(tokenized_tweet.loc[0])

In [None]:
# Positions of tweets that have more than 55 tokens.
# NOTE(review): 55 is larger than the 29 rows used when padding the feature
# maps - confirm which limit this check is meant to use.
long_tweet_positions = [
    position
    for position, tokens in enumerate(tokenized_tweet)
    if len(tokens) > 55
]
print(long_tweet_positions)

In [None]:
# Inspect the tokens of the tweet with index label 1878.
tokenized_tweet.loc[1878]