In [None]:
import sys
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from gensim.models import KeyedVectors
import warnings
# NOTE(review): with only the "ignore" action and no category/module filter,
# this suppresses every warning raised after this cell runs. Consider
# narrowing it to the specific warning categories that are known to be noisy.
warnings.filterwarnings("ignore")

In [None]:
# Read both tab-separated, header-less splits, then give each frame the
# same two column names.
train, test = (
    pd.read_csv(split_path, header=None, sep='\t')
    for split_path in (
        '../Data/r8-train-all-terms.txt',
        '../Data/r8-test-all-terms.txt',
    )
)
for frame in (train, test):
    frame.columns = ['label', 'content']

In [None]:
class GloveVectorizer:
    """Represent each document as the mean of its tokens' GloVe vectors.

    Attributes
    ----------
    word2vec : dict
        Maps each word in the vector file to its float32 vector.
    embedding : numpy.ndarray
        All vectors stacked in file order, shape ``(V, D)``.
    word2idx : dict
        Maps each word to its row index in ``embedding``.
    V, D : int
        Number of rows and number of columns of ``embedding``.
    """

    def __init__(self, path='../Data/glove.6B/glove.6B.50d.txt'):
        """Load word vectors from a whitespace-separated text file.

        Parameters
        ----------
        path : str, optional
            Text file with one ``word v1 v2 ... vD`` entry per line.
            Defaults to the location the notebook originally hard-coded.

        Raises
        ------
        ValueError
            If the file contains no vectors.
        """
        word2vec = {}
        embedding = []
        idx2word = []
        # Decode explicitly as UTF-8: without an encoding, open() falls back
        # to the platform default, which fails on non-ASCII tokens when that
        # default is not UTF-8.
        with open(path, encoding='utf-8') as f:
            for line in f:
                values = line.split()
                if not values:
                    # Skip blank lines instead of failing on values[0].
                    continue
                word = values[0]
                vec = np.asarray(values[1:], dtype='float32')
                word2vec[word] = vec
                embedding.append(vec)
                idx2word.append(word)
        print('Found %s word vectors.' % len(word2vec))

        if not embedding:
            raise ValueError('no word vectors found in %r' % (path,))

        self.word2vec = word2vec
        self.embedding = np.array(embedding)
        self.word2idx = {v: k for k, v in enumerate(idx2word)}
        self.V, self.D = self.embedding.shape

    def fit(self, data):
        """Do nothing; the vectors are fixed at construction time."""
        pass

    def transform(self, data):
        """Return one mean word vector per document.

        Parameters
        ----------
        data : iterable of str with a length
            Documents; each is lower-cased and split on whitespace.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(len(data), D)``. A document with no token in
            the vocabulary keeps an all-zero row.
        """
        X = np.zeros((len(data), self.D))
        emptycount = 0

        for n, sentence in enumerate(data):
            vecs = [
                self.word2vec[word]
                for word in sentence.lower().split()
                if word in self.word2vec
            ]
            if vecs:
                X[n] = np.array(vecs).mean(axis=0)
            else:
                emptycount += 1
        print("number of samples with no words found in glove: %s/%s" % (emptycount, len(data)))
        return X

    def fit_transform(self, data):
        """Call ``fit`` and then return ``transform(data)``."""
        self.fit(data)
        return self.transform(data)

In [None]:
class Word2VecVectorizer:
    """Represent each document as the mean of its tokens' word2vec vectors.

    Attributes
    ----------
    word_vectors : gensim.models.KeyedVectors
        The loaded vectors.
    D : int
        Vector dimensionality; set on the first call to ``transform``.
    """

    def __init__(self, path='../Data/GoogleNews-vectors-negative300.bin'):
        """Load vectors stored in the binary word2vec format.

        Parameters
        ----------
        path : str, optional
            Binary word2vec file. Defaults to the location the notebook
            originally hard-coded.
        """
        self.word_vectors = KeyedVectors.load_word2vec_format(
            path,
            binary=True
        )

    def fit(self, data):
        """Do nothing; the vectors are fixed at construction time."""
        pass

    def transform(self, data):
        """Return one mean word vector per document.

        Parameters
        ----------
        data : iterable of str with a length
            Documents; each is split on whitespace without changing case.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(len(data), D)``. A document with no token in
            the vocabulary keeps an all-zero row.
        """
        # Read the dimensionality from the model rather than probing a
        # specific word, which would fail if that word were missing.
        self.D = self.word_vectors.vector_size

        # The shape must be a single tuple: a second positional argument to
        # np.zeros is interpreted as the dtype and raises TypeError.
        X = np.zeros((len(data), self.D))
        emptycount = 0

        for n, sentence in enumerate(data):
            vecs = []
            for word in sentence.split():
                try:
                    vecs.append(self.word_vectors.get_vector(word))
                except KeyError:
                    # Out-of-vocabulary token: leave it out of the mean.
                    pass
            if vecs:
                X[n] = np.array(vecs).mean(axis=0)
            else:
                emptycount += 1

        print("number of samples with not words found: %s / %s" % (emptycount, len(data)))
        return X

    def fit_transform(self, data):
        """Call ``fit`` and then return ``transform(data)``."""
        self.fit(data)
        return self.transform(data)


In [None]:
# Build the document features from averaged GloVe vectors.
vectorizer = GloveVectorizer()
Xtrain = vectorizer.fit_transform(train['content'])
Ytrain = train.label
# Only transform the test split: fitting belongs to the training data, so
# the test set must never go through fit/fit_transform.
Xtest = vectorizer.transform(test['content'])
Ytest = test.label

In [None]:
# Fix the seed so the fitted forest, its scores and its feature importances
# are the same on every Restart & Run All.
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(Xtrain, Ytrain)
# Last expression of the cell, so the importances are displayed.
model.feature_importances_

In [None]:
# Report the model's score on the training split, then on the test split.
for caption, features, targets in (
    ("train score:", Xtrain, Ytrain),
    ("test score:", Xtest, Ytest),
):
    print(caption, model.score(features, targets))