In [1]:
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk.data
from nltk.tokenize import word_tokenize
import time
import numpy as np

In [2]:
model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)  
model.init_sims(replace=True)

In [3]:
tweets = pd.read_csv('tweets_labelled',skipinitialspace=True, skip_blank_lines=True,encoding='latin1')

In [4]:
index2word_set = set(model.wv.index2word)

In [5]:
def clean_tweets( text ):
    # clean_1 = BeautifulSoup(text,"lxml").get_text()
    clean_2 = re.sub("[^a-zA-Z]"," ", text).lower().strip()
    clean_3 = word_tokenize(clean_2)
    stop_words = set(stopwords.words("english")) - set(['and','or','not'])
    words = [w for w in clean_3 if not w in stop_words]
    return words



In [9]:
num_features = 300

def makeFeatureVec(words, num_features):
    global index2word_set
    featureVec = np.zeros((num_features,),dtype="float32")
    nwords = 0.00000000001
    for word in words:
        # if the word is in wordset then add to feature vec
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    
    featureVec = np.divide(featureVec,nwords)
    return featureVec

def get_average_feature_vectors( tweets, num_feature ):
    counter = 0
    tweetFeatureVecs = np.zeros((len(tweets),num_features),dtype="float32")
    
    for tweet in tweets:
        tweetFeatureVecs[counter] = makeFeatureVec(tweet, num_features)
        counter += 1
    
    return tweetFeatureVecs


In [10]:
np.seterr(divide='ignore', invalid='ignore')
start = time.time()

cleaned_tweets = []
for text in tweets['text']:
    cleaned_tweets.append( clean_tweets(text))
    
DataVecs = get_average_feature_vectors( cleaned_tweets, num_features )

end = time.time()

elapsed_cleaning = end - start

In [10]:
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [14]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(DataVecs)
scaled_features = scaler.transform(DataVecs)
data = pd.DataFrame(scaled_features)



In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(DataVecs, tweets['label'],random_state=10,test_size = 0.3 )

In [25]:
from sklearn.ensemble import RandomForestClassifier
start = time.time() 
model_ml = RandomForestClassifier(n_estimators=200, n_jobs=-1)
print('starting mmodel')
model_ml = model_ml.fit( X_train ,y_train)

end = time.time()

elapsed_training = end - start
print('end..')



starting mmodel
end..


In [26]:
result = model_ml.predict( X_test )

In [27]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, result))
print(classification_report(y_test, result))

[[17509   143   183   136]
 [  966  6896   401   456]
 [  677   190  9506   755]
 [  815   235   832  8342]]
             precision    recall  f1-score   support

          1       0.88      0.97      0.92     17971
          2       0.92      0.79      0.85      8719
          3       0.87      0.85      0.86     11128
          4       0.86      0.82      0.84     10224

avg / total       0.88      0.88      0.88     48042



In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
start = time.time() 
model_ml = RandomForestClassifier(n_estimators=200, n_jobs=-1)
print('starting mmodel...without lower')
model_ml = model_ml.fit( X_train ,y_train)

end = time.time()

elapsed_training = end - start
print('end..')
result = model_ml.predict( X_test )

print(confusion_matrix(y_test, result))
print(classification_report(y_test, result))

starting mmodel...without lower
end..
[[17301   225   215   230]
 [ 1908  5867   433   511]
 [ 1482   221  8689   736]
 [ 1451   267   910  7596]]
             precision    recall  f1-score   support

          1       0.78      0.96      0.86     17971
          2       0.89      0.67      0.77      8719
          3       0.85      0.78      0.81     11128
          4       0.84      0.74      0.79     10224

avg / total       0.83      0.82      0.82     48042



In [39]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report
start = time.time() 
model_ml = GradientBoostingClassifier(n_estimators=100)
print('starting mmodel...without lower')
model_ml = model_ml.fit( X_train ,y_train)

end = time.time()

elapsed_training = end - start
print('end..')
result = model_ml.predict( X_test )

print(confusion_matrix(y_test, result))
print(classification_report(y_test, result))

starting mmodel...without lower
end..
[[16614   504   414   439]
 [ 1061  6753   405   500]
 [  800   287  9184   857]
 [  948   324   907  8045]]
             precision    recall  f1-score   support

          1       0.86      0.92      0.89     17971
          2       0.86      0.77      0.81      8719
          3       0.84      0.83      0.83     11128
          4       0.82      0.79      0.80     10224

avg / total       0.84      0.85      0.84     48042



In [40]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix, classification_report
start = time.time() 
model_ml = BaggingClassifier(n_estimators=100,n_jobs = -1)
print('starting mmodel.. lower')
model_ml = model_ml.fit( X_train ,y_train)

end = time.time()

elapsed_training = end - start
print('end..')
result = model_ml.predict( X_test )

print(confusion_matrix(y_test, result))
print(classification_report(y_test, result))

starting mmodel.. lower
end..
[[17198   246   252   275]
 [  992  6725   463   539]
 [  833   274  9075   946]
 [  909   301   996  8018]]
             precision    recall  f1-score   support

          1       0.86      0.96      0.91     17971
          2       0.89      0.77      0.83      8719
          3       0.84      0.82      0.83     11128
          4       0.82      0.78      0.80     10224

avg / total       0.85      0.85      0.85     48042



In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
start = time.time() 
model_ml = RandomForestClassifier(n_estimators=100, n_jobs=-1)
print('starting mmodel...without normalization')
model_ml = model_ml.fit( X_train ,y_train)

end = time.time()

elapsed_training = end - start
print('end..')
result = model_ml.predict( X_test )

print(confusion_matrix(y_test, result))
print(classification_report(y_test, result))

starting mmodel...without normalization
end..
[[17492   122   199   158]
 [  928  6881   542   368]
 [  701   182  9655   590]
 [  817   170  1065  8172]]
             precision    recall  f1-score   support

          1       0.88      0.97      0.92     17971
          2       0.94      0.79      0.86      8719
          3       0.84      0.87      0.85     11128
          4       0.88      0.80      0.84     10224

avg / total       0.88      0.88      0.88     48042



In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
start = time.time() 
model_ml = RandomForestClassifier(n_estimators=100, n_jobs=-1)
print('starting mmodel...with normalization')
model_ml = model_ml.fit( X_train ,y_train)

end = time.time()

elapsed_training = end - start
print('end..')
result = model_ml.predict( X_test )

print(confusion_matrix(y_test, result))
print(classification_report(y_test, result))

starting mmodel...with normalization
end..
[[17484   114   204   169]
 [  902  6891   528   398]
 [  685   178  9650   615]
 [  835   194  1046  8149]]
             precision    recall  f1-score   support

          1       0.88      0.97      0.92     17971
          2       0.93      0.79      0.86      8719
          3       0.84      0.87      0.86     11128
          4       0.87      0.80      0.83     10224

avg / total       0.88      0.88      0.88     48042

