In [35]:
# Loading the data file from local download
path_fastText = 'wiki-news-300d-1M.vec'
dictionary = open(path_fastText, 'r', encoding='utf-8',
                  newline='\n', errors='ignore')
embeds = {}
for line in dictionary:
    tokens = line.rstrip().split(' ')
    embeds[tokens[0]] = [float(x) for x in tokens[1:]]
    
    if len(embeds) == 100000:
        break

In [36]:
import pandas as pd
import numpy as np

path_to_text = 'tweet.csv'
data = pd.read_csv(path_to_text, encoding='latin-1')[['v1', 'v2']]
text = data['v2']
label = data['v1']


In [37]:
data[2:6]

Unnamed: 0,v1,v2
2,skip,Brady is showing no signs of slowing down on t...
3,trump,Want to thank you. And I'll tell you what we'r...
4,trump,We took on the corrupt media a lot of it is co...
5,trump,we need to get change and it will get done


In [38]:
from keras.preprocessing.text import text_to_word_sequence
array_length = 20 * 300
embedding_features = pd.DataFrame()
for document in text:
    # Saving the first 20 words of the document as a sequence
    words = text_to_word_sequence(document)[0:20] 
    
    # Retrieving the vector representation of each word and 
    # appending it to the feature vector 
    feature_vector = []
    for word in words:
        try:
            feature_vector = np.append(feature_vector, 
                                       np.array(embeds[word]))
        except KeyError:
            # In the event that a word is not included in our 
            # dictionary skip that word
            pass
    # If the text has less then 20 words, fill remaining vector with
    # zeros
    zeroes_to_add = array_length - len(feature_vector)
    feature_vector = np.append(feature_vector, 
                               np.zeros(zeroes_to_add)
                               ).reshape((1,-1))
    
    # Append the document feature vector to the feature table
    embedding_features = embedding_features.append( 
                                     pd.DataFrame(feature_vector))

In [39]:
print(embedding_features.shape)

(10, 6000)


In [40]:
train_percent = 0.7
train_cutoff = int(np.floor(train_percent*len(text) ) )


from sklearn.svm import LinearSVC

embeded_model = LinearSVC()
embeded_model.fit(embedding_features[0 : train_cutoff], 
                  label[0 : train_cutoff])
embeded_prediction = embeded_model.predict(
                   embedding_features[train_cutoff + 1 : len(text)])


In [41]:
embeded_prediction

array(['trump', 'trump'], dtype=object)