In [30]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk.data
from nltk.tokenize import word_tokenize
import time
import numpy as np
from __future__ import absolute_import

In [32]:
from gensim.models.wrappers import FastText

# Load the fastText model stored in the binary file below; the functions in
# later cells read it through the notebook-level name `model`.
model = FastText.load_fasttext_format(
    'model_1_skipgram.bin',
    encoding='latin1',
)


In [35]:
# Read the labelled tweets; later cells use its 'text' and 'label' columns.
tweets = pd.read_csv(
    '../tweets_labelled',
    skipinitialspace=True,
    skip_blank_lines=True,
    encoding='latin1',
)

In [36]:
# Copy the loaded model's vocabulary (model.wv.index2word) into a set;
# makeFeatureVec uses it for its `word in index2word_set` membership check.
index2word_set = set(model.wv.index2word)

In [38]:
# English stop words that are deliberately kept in the cleaned tweets.
_KEPT_STOP_WORDS = frozenset(['and', 'or', 'not'])
# Filled on the first clean_tweets() call so the NLTK stop-word corpus is read
# once instead of once per tweet.
_STOP_WORDS_CACHE = None


def clean_tweets(text):
    """Turn one raw tweet into a list of lower-case word tokens.

    Every character outside ``a-zA-Z`` is replaced by a space, the result is
    lower-cased and stripped, tokenised with ``word_tokenize`` and filtered
    against the English stop-word list (minus 'and', 'or' and 'not', which
    are kept).

    Parameters
    ----------
    text : str
        Raw tweet text. A missing value (``None`` / ``NaN``) is accepted.

    Returns
    -------
    list of str
        The surviving tokens in their original order; ``[]`` when ``text``
        is a missing value.
    """
    global _STOP_WORDS_CACHE

    # A missing cell arrives from pandas as NaN/None, which re.sub cannot
    # process; treat it as a tweet without words.
    if pd.isnull(text):
        return []

    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = set(stopwords.words("english")) - _KEPT_STOP_WORDS

    letters_only = re.sub("[^a-zA-Z]", " ", text).lower().strip()
    return [w for w in word_tokenize(letters_only) if w not in _STOP_WORDS_CACHE]


In [41]:
# Length of each averaged tweet vector. makeFeatureVec adds model[word] into a
# zeros((num_features,)) array, so this presumably has to equal the vector size
# of the loaded model - TODO confirm against model_1_skipgram.bin.
num_features = 100

def makeFeatureVec(words, num_features, embeddings=None, vocab=None):
    """Average the embedding vectors of the known words of one tweet.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single tweet.
    num_features : int
        Length of the returned vector; the vectors returned by
        ``embeddings[word]`` must have this length.
    embeddings : optional
        Any object supporting ``embeddings[word] -> vector``. When omitted,
        the notebook-level ``model`` is used.
    vocab : optional
        Container of the words that may be looked up. When omitted, the
        notebook-level ``index2word_set`` is used.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(num_features,)`` holding the mean vector of
        the words found in ``vocab``, or all zeros when none is found.
    """
    if embeddings is None:
        embeddings = model
    if vocab is None:
        vocab = index2word_set

    total = np.zeros((num_features,), dtype="float32")
    n_known = 0
    for word in words:
        # Only words present in the vocabulary contribute to the average.
        if word in vocab:
            total += embeddings[word]
            n_known += 1

    # Handle the "no known word" case explicitly instead of dividing by a
    # tiny epsilon: the zero vector is returned unchanged.
    if n_known:
        total /= n_known
    return total

def get_average_feature_vectors(tweets, num_feature):
    """Build the feature matrix: one averaged word vector per tweet.

    Parameters
    ----------
    tweets : sequence of list of str
        Tokenised tweets; must support ``len()``.
    num_feature : int
        Length of each tweet vector (number of matrix columns).

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(tweets), num_feature)`` whose i-th row
        is ``makeFeatureVec(tweets[i], num_feature)``.
    """
    # Size the matrix from the argument rather than from the notebook-level
    # `num_features`, so the function honours what the caller passes in.
    tweetFeatureVecs = np.zeros((len(tweets), num_feature), dtype="float32")

    for row, tweet in enumerate(tweets):
        tweetFeatureVecs[row] = makeFeatureVec(tweet, num_feature)

    return tweetFeatureVecs

In [42]:
# Tell NumPy to ignore divide-by-zero and invalid-value floating-point errors.
np.seterr(divide='ignore', invalid='ignore')
start = time.time()

# Tokenise every tweet, then turn each token list into one averaged vector.
cleaned_tweets = [clean_tweets(text) for text in tweets['text']]
DataVecs = get_average_feature_vectors(cleaned_tweets, num_features)

end = time.time()

# Seconds spent on cleaning plus vectorising.
elapsed_cleaning = end - start

In [43]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    DataVecs,
    tweets['label'],
    test_size=0.3,
    random_state=10,
)

In [44]:
from sklearn.ensemble import RandomForestClassifier
start = time.time()
# Seed the forest so that training, and therefore the scores computed from it,
# is repeatable across runs; 10 is the same seed the train/test split uses.
model_ml = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=10)
print('starting mmodel')
model_ml = model_ml.fit(X_train, y_train)

end = time.time()

# Seconds spent fitting the forest.
elapsed_training = end - start
print('end..')


starting mmodel
end..


In [45]:
# Predict a label for every held-out row with the fitted forest.
result = model_ml.predict( X_test )

In [46]:
from sklearn.metrics import confusion_matrix, classification_report

# Print the confusion matrix first, then the per-class report, both computed
# from the true labels (y_test) and the predictions (result).
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, result))

[[17436   155   239   141]
 [  852  7055   467   345]
 [  659   189  9682   598]
 [  723   207   940  8354]]
             precision    recall  f1-score   support

          1       0.89      0.97      0.93     17971
          2       0.93      0.81      0.86      8719
          3       0.85      0.87      0.86     11128
          4       0.89      0.82      0.85     10224

avg / total       0.89      0.89      0.88     48042



In [47]:
from gensim.models.wrappers import FastText

# Rebind `model` to the second fastText binary; makeFeatureVec reads the
# notebook-level `model`, so from here on it averages vectors from this file.
model = FastText.load_fasttext_format(
    'model_2_skipgram.bin',
    encoding='latin1',
)


In [48]:
# Read the labelled tweets; the 'text' and 'label' columns are used below.
tweets = pd.read_csv(
    '../tweets_labelled',
    skipinitialspace=True,
    skip_blank_lines=True,
    encoding='latin1',
)

# Vocabulary of the currently loaded model, as a set for membership checks.
index2word_set = set(model.wv.index2word)

# English stop words that are deliberately kept in the cleaned tweets.
_KEPT_STOP_WORDS = frozenset(['and', 'or', 'not'])
# Filled on the first clean_tweets() call so the NLTK stop-word corpus is read
# once instead of once per tweet.
_STOP_WORDS_CACHE = None


def clean_tweets(text):
    """Turn one raw tweet into a list of lower-case word tokens.

    Every character outside ``a-zA-Z`` is replaced by a space, the result is
    lower-cased and stripped, tokenised with ``word_tokenize`` and filtered
    against the English stop-word list (minus 'and', 'or' and 'not', which
    are kept).

    Parameters
    ----------
    text : str
        Raw tweet text. A missing value (``None`` / ``NaN``) is accepted.

    Returns
    -------
    list of str
        The surviving tokens in their original order; ``[]`` when ``text``
        is a missing value.
    """
    global _STOP_WORDS_CACHE

    # A missing cell arrives from pandas as NaN/None, which re.sub cannot
    # process; treat it as a tweet without words.
    if pd.isnull(text):
        return []

    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = set(stopwords.words("english")) - _KEPT_STOP_WORDS

    letters_only = re.sub("[^a-zA-Z]", " ", text).lower().strip()
    return [w for w in word_tokenize(letters_only) if w not in _STOP_WORDS_CACHE]

# Length of each averaged tweet vector. makeFeatureVec adds model[word] into a
# zeros((num_features,)) array, so this presumably has to equal the vector size
# of the loaded model - TODO confirm against model_2_skipgram.bin.
num_features = 400

def makeFeatureVec(words, num_features, embeddings=None, vocab=None):
    """Average the embedding vectors of the known words of one tweet.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single tweet.
    num_features : int
        Length of the returned vector; the vectors returned by
        ``embeddings[word]`` must have this length.
    embeddings : optional
        Any object supporting ``embeddings[word] -> vector``. When omitted,
        the notebook-level ``model`` is used.
    vocab : optional
        Container of the words that may be looked up. When omitted, the
        notebook-level ``index2word_set`` is used.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(num_features,)`` holding the mean vector of
        the words found in ``vocab``, or all zeros when none is found.
    """
    if embeddings is None:
        embeddings = model
    if vocab is None:
        vocab = index2word_set

    total = np.zeros((num_features,), dtype="float32")
    n_known = 0
    for word in words:
        # Only words present in the vocabulary contribute to the average.
        if word in vocab:
            total += embeddings[word]
            n_known += 1

    # Handle the "no known word" case explicitly instead of dividing by a
    # tiny epsilon: the zero vector is returned unchanged.
    if n_known:
        total /= n_known
    return total

def get_average_feature_vectors(tweets, num_feature):
    """Build the feature matrix: one averaged word vector per tweet.

    Parameters
    ----------
    tweets : sequence of list of str
        Tokenised tweets; must support ``len()``.
    num_feature : int
        Length of each tweet vector (number of matrix columns).

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(tweets), num_feature)`` whose i-th row
        is ``makeFeatureVec(tweets[i], num_feature)``.
    """
    # Size the matrix from the argument rather than from the notebook-level
    # `num_features`, so the function honours what the caller passes in.
    tweetFeatureVecs = np.zeros((len(tweets), num_feature), dtype="float32")

    for row, tweet in enumerate(tweets):
        tweetFeatureVecs[row] = makeFeatureVec(tweet, num_feature)

    return tweetFeatureVecs

# Tell NumPy to ignore divide-by-zero and invalid-value floating-point errors.
np.seterr(divide='ignore', invalid='ignore')
start = time.time()

# Tokenise every tweet, then turn each token list into one averaged vector.
cleaned_tweets = [clean_tweets(text) for text in tweets['text']]
DataVecs = get_average_feature_vectors(cleaned_tweets, num_features)

end = time.time()

# Seconds spent on cleaning plus vectorising.
elapsed_cleaning = end - start

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    DataVecs,
    tweets['label'],
    test_size=0.3,
    random_state=10,
)

In [49]:
from sklearn.ensemble import RandomForestClassifier
start = time.time()
# Seed the forest so that training, and therefore the scores printed below,
# is repeatable across runs; 10 is the same seed the train/test split uses.
model_ml = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=10)
print('starting mmodel')
model_ml = model_ml.fit(X_train, y_train)

end = time.time()

# Seconds spent fitting the forest.
elapsed_training = end - start
print('end..')

# Predict a label for every held-out row.
result = model_ml.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report

# Compare the true labels (y_test) with the predictions (result).
print(confusion_matrix(y_test, result))
print(classification_report(y_test, result))

starting mmodel
end..
[[17600    70   213    88]
 [  797  7043   596   283]
 [  572   107 10061   388]
 [  617   108   954  8545]]
             precision    recall  f1-score   support

          1       0.90      0.98      0.94     17971
          2       0.96      0.81      0.88      8719
          3       0.85      0.90      0.88     11128
          4       0.92      0.84      0.88     10224

avg / total       0.90      0.90      0.90     48042

