In [184]:
import regex
import numpy as np
import pandas as pd
import fasttext
import sys
from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText
from sklearn.metrics import accuracy_score
from sklearn import linear_model

`Load Tweets`

In [114]:
neg_train = pd.read_fwf('data/tweets/train_neg.txt', header=None, names=['tweets'])
pos_train = pd.read_fwf('data/tweets/train_pos.txt', header=None, names=['tweets'])

`Clean Tweets`

In [115]:
# load stop words
f = open("data/stop_words.txt", 'r')
stop_words = list()
for word in f:
    if word != '\n':
        stop_words += [word.strip("\n").strip(" ").strip("\t")]
f.close()

stop_words = set(stop_words)

In [116]:
def cleanTweet(tweet):
    
    cleaned_tweet = str()
    for word in tweet.lower().split():
            
        # remove non-latin characters
        clean_word = regex.sub(u'[^\p{Latin}]', u'', word)
        
        # don't include stop words
        if clean_word not in stop_words:
            cleaned_tweet += clean_word + " "
    
    return cleaned_tweet

In [117]:
# clean tweets
neg_train_clean = neg_train['tweets'].apply(cleanTweet)
pos_train_clean = pos_train['tweets'].apply(cleanTweet)

`Load Word Embeddings`

`Note that length of word embeddings for Word2vec and Fasttext is 300 and 200 for Glove.`

`Word2Vec`

In [118]:
word2vec_model = KeyedVectors.load_word2vec_format("data/embeddings/word2vec.en.bin", binary=True)

`Fasttext`

In [290]:
fasttext_model = KeyedVectors.load_word2vec_format("data/embeddings/fasttext.en.vec",\
                                                   binary=False, limit=100000)
# fasttext_model = FastText.load_fasttext_format('data/embeddings/fasttext.en')
# fasttext_model = fasttext.load_model("data/embeddings/fasttext.1.en.vec")

`Glove`

In [318]:
def loadGloveModel(gloveFile):
    f = open(gloveFile,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    return model

glove_model = loadGloveModel("data/embeddings/glove/glove.twitter.27B.200d.txt")

`Pre-processing`

In [319]:
pos_tweets = pos_train_clean.values
neg_tweets = neg_train_clean.values

num_samples = 100000
pos_labels = num_samples * [1] 
neg_labels = num_samples * [0] 

train_sentences = np.array(list(pos_tweets) + list(neg_tweets))
train_labels = np.array(pos_labels + neg_labels)

In [320]:
# average all word-embeddings within a sentence
def average_embeddings(sentences, model, vector_length):
    return_matrix = []
    for sentence in sentences:
        words = sentence.split()
        word_count = float(len(words))
        temp_list = np.zeros(vector_length)
        for word in words:
            try:
                temp_list += model[word][:vector_length]
            except:
                word_count -= 1
        if word_count == 0:
            avrg_list = np.zeros(vector_length)
        else:
            avrg_list = temp_list/word_count
        return_matrix.append(avrg_list)
    return return_matrix

In [321]:
# convert tweets into bag-of-words representations
vector_length = 200
train_features = average_embeddings(train_sentences, glove_model, vector_length)

`Predict Categories with Scikit-learn`

`Logistic Regression Model`

In [322]:
lr = linear_model.LogisticRegression(C=1e5)
lr.fit(train_features, train_labels)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [323]:
accuracy_score(train_labels, lr.predict(train_features))

0.75675000000000003

| `Model` | `Accuracy`   |
|------|------|
| *`Sampled`* `Fasttext` | `0.72694`|
| `Word2vec` | `0.72697`|
| `Glove` | `0.75675`|