In [19]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize

from time import time
from pprint import pprint 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib

import gensim

In [45]:
df_full = pd.read_csv("C:/Users/Ung Lik Teng/Desktop/CodenData/Machine Learning/NLP/cleaned.csv")
sanders_df = pd.read_csv("C:/Users/Ung Lik Teng/Desktop/CodenData/Machine Learning/NLP/sanders_cleaned.csv")

In [46]:
df_full = df_full.drop('clean_count_words', 1)
sanders_df = sanders_df.drop('clean_count_words', 1)

In [4]:
df_full.shape

(99989, 8)

In [47]:
sanders_df.shape

(3424, 8)

In [48]:
#Here we are using a mini sample of 200 datapoints from the original datsets
df_model = df_full.sample(200)
df_model = df_model.reset_index(drop = True)
sanders_df_model = sanders_df.sample(200)
sanders_df_model = sanders_df_model.reset_index(drop=True)

In [49]:
class ColumnExtractor(TransformerMixin, BaseEstimator):
    def __init__(self, cols):
        self.cols = cols

    def transform(self, X, **transform_params):
        return X[self.cols]

    def fit(self, X, y=None, **fit_params):
        return self

In [50]:
X_train, X_test, y_train, y_test = train_test_split(df_model.drop('Sentiment', axis=1), df_model.Sentiment, test_size=0.2, random_state=37, stratify = df_model.Sentiment )

In [51]:
X_train.shape

(160, 7)

In [52]:
# Based on https://www.kaggle.com/bertcarremans/predicting-sentiment-with-text-features?scriptVersionId=2139318/code
def grid_vect(clf, parameters_clf, X_train, X_test, parameters_text=None, vect=None, is_w2v=False):
    
    textcountscols = ['count_capital_words','count_excl_quest_marks','count_hashtags'
                      ,'count_mentions','count_urls','count_words']
    
    if is_w2v:
        w2vcols = []
        for i in range(SIZE):
            w2vcols.append(i)
        features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols))
                                 , ('w2v', ColumnExtractor(cols=w2vcols).fit(X_train))])
    else:
         features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols))
                                 , ('pipe', Pipeline([('cleantext', ColumnExtractor(cols='clean_text').fit(X_train)), ('vect', vect)]))])
    
    pipeline = Pipeline([
        ('features', features)
        , ('clf', clf)
    ])
    
    # Join the parameters dictionaries together
    parameters = dict()
    if parameters_text:
        parameters.update(parameters_text)
    parameters.update(parameters_clf)

    # Make sure you have scikit-learn version 0.19 or higher to use multiple scoring metrics
    grid_search = GridSearchCV(pipeline, parameters, verbose=1, cv=5)
    
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    t0 = time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best CV score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
    print("Test score with best_estimator_: %0.3f" % grid_search.best_estimator_.score(X_test, y_test))
    print("\n")
    print("Classification Report Test Data")
    print(classification_report(y_test, grid_search.best_estimator_.predict(X_test)))
                        
    return grid_search

In [53]:
# Parameter grid settings for the vectorizers (Count and TFIDF)
parameters_vect = {
    'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
    'features__pipe__vect__ngram_range': ((1, 1), (1, 2)),
    'features__pipe__vect__min_df': (1,2)
}


# Parameter grid settings for MultinomialNB
parameters_mnb = {
    'clf__alpha': (0.25, 0.5, 0.75)
}


# Parameter grid settings for LogisticRegression
parameters_logreg = {
    'clf__C': (0.25, 0.5, 1.0),
    'clf__penalty': ('l1', 'l2')
}

In [54]:
mnb = MultinomialNB()
logisticsreg = LogisticRegression()

In [55]:
countVec = CountVectorizer()
tfidfVec= TfidfVectorizer()

In [56]:
mnb_countVec = grid_vect(mnb,parameters_mnb,X_train, X_test, parameters_text = parameters_vect, vect = countVec)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__alpha': (0.25, 0.5, 0.75),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 36 candidates, totalling 180 fits
done in 3.423s

Best CV score: 0.675
Best parameters set:
	clf__alpha: 0.25
	features__pipe__vect__max_df: 0.25
	features__pipe__vect__min_df: 2
	features__pipe__vect__ngram_range: (1, 1)
Test score with best_estimator_: 0.625


Classification Report Test Data
             precision    recall  f1-score   support

          0       0.55      0.65      0.59        17
          1       0.70      0.61      0.65        23

avg / total       0.64      0.62      0.63        40



[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:    3.3s finished


In [57]:
mnb_tfidfVec = grid_vect(mnb, parameters_mnb, X_train, X_test, parameters_vect, vect = tfidfVec)
    

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__alpha': (0.25, 0.5, 0.75),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 36 candidates, totalling 180 fits
done in 5.085s

Best CV score: 0.688
Best parameters set:
	clf__alpha: 0.5
	features__pipe__vect__max_df: 0.25
	features__pipe__vect__min_df: 2
	features__pipe__vect__ngram_range: (1, 2)
Test score with best_estimator_: 0.650


Classification Report Test Data
             precision    recall  f1-score   support

          0       0.59      0.59      0.59        17
          1       0.70      0.70      0.70        23

avg / total       0.65      0.65      0.65        40



[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:    4.9s finished


In [58]:
lr_countVec = grid_vect(logisticsreg,parameters_logreg,X_train, X_test,parameters_text = parameters_vect, vect = countVec)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__C': (0.25, 0.5, 1.0),
 'clf__penalty': ('l1', 'l2'),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 72 candidates, totalling 360 fits
done in 6.196s

Best CV score: 0.681
Best parameters set:
	clf__C: 0.25
	clf__penalty: 'l2'
	features__pipe__vect__max_df: 0.25
	features__pipe__vect__min_df: 2
	features__pipe__vect__ngram_range: (1, 1)
Test score with best_estimator_: 0.625


Classification Report Test Data
             precision    recall  f1-score   support

          0       0.60      0.35      0.44        17
          1       0.63      0.83      0.72        23

avg / total       0.62      0.62      0.60        40



[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:    6.1s finished


In [59]:
lr_tfidfVec = grid_vect(logisticsreg,parameters_logreg,X_train, X_test,parameters_text = parameters_vect, vect = tfidfVec)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__C': (0.25, 0.5, 1.0),
 'clf__penalty': ('l1', 'l2'),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 72 candidates, totalling 360 fits
done in 5.472s

Best CV score: 0.675
Best parameters set:
	clf__C: 1.0
	clf__penalty: 'l2'
	features__pipe__vect__max_df: 0.25
	features__pipe__vect__min_df: 2
	features__pipe__vect__ngram_range: (1, 2)
Test score with best_estimator_: 0.700


Classification Report Test Data
             precision    recall  f1-score   support

          0       0.65      0.65      0.65        17
          1       0.74      0.74      0.74        23

avg / total       0.70      0.70      0.70        40



[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:    5.4s finished


# Word2Vec

In [60]:
SIZE = 50

X_train['wordlist'] = X_train.clean_text.apply(lambda x: word_tokenize(x))
X_test['wordlist'] = X_test.clean_text.apply(lambda x: word_tokenize(x))

w2v_model = gensim.models.Word2Vec(X_train.wordlist
                                  , min_count= 1
                                  , size = SIZE
                                  , window = 5
                                  , workers = 4)

In [61]:
def compute_avg_w2v_vector(w2v_dict, tweet):
    list_of_word_vectors = [w2v_dict[w] for w in tweet if w in w2v_dict.vocab.keys()]
    
    if len(list_of_word_vectors) == 0:
        result = [0.0]*SIZE
    else:
        result = np.sum(list_of_word_vectors, axis=0) / len(list_of_word_vectors)
        
    return result

In [62]:
X_train_w2v = X_train['wordlist'].apply(lambda x: compute_avg_w2v_vector(w2v_model.wv,x))
X_test_w2v = X_test['wordlist'].apply(lambda x: compute_avg_w2v_vector(w2v_model.wv,x))


In [63]:
X_train_w2v = pd.DataFrame(X_train_w2v.values.tolist(), index= X_train.index)
X_test_w2v = pd.DataFrame(X_test_w2v.values.tolist(), index= X_test.index)

# Concatenate the TextCounts variables with w2v and drop the clean text and wordlist columns
X_train_w2v = pd.concat([X_train_w2v, X_train.drop(['clean_text', 'wordlist'], axis=1)], axis=1)
X_test_w2v = pd.concat([X_test_w2v, X_test.drop(['clean_text', 'wordlist'], axis=1)], axis=1)

In [64]:
logreg_w2v = grid_vect(logisticsreg, parameters_logreg, X_train_w2v, X_test_w2v, is_w2v=True)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__C': (0.25, 0.5, 1.0), 'clf__penalty': ('l1', 'l2')}
Fitting 5 folds for each of 6 candidates, totalling 30 fits
done in 0.337s

Best CV score: 0.562
Best parameters set:
	clf__C: 0.25
	clf__penalty: 'l1'
Test score with best_estimator_: 0.675


Classification Report Test Data
             precision    recall  f1-score   support

          0       0.83      0.29      0.43        17
          1       0.65      0.96      0.77        23

avg / total       0.73      0.68      0.63        40



[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.2s finished


# Sander Corpus

In [65]:
X_train, X_test, y_train, y_test = train_test_split(sanders_df_model.drop('Sentiment', axis=1), sanders_df_model.Sentiment, test_size=0.3, random_state=37, stratify = sanders_df_model.Sentiment )

In [66]:
#Naive Bayes with Count Vectorizer 
mnb_countVec = grid_vect(mnb,parameters_mnb,X_train, X_test, parameters_text = parameters_vect, vect = countVec)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__alpha': (0.25, 0.5, 0.75),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 36 candidates, totalling 180 fits
done in 5.901s

Best CV score: 0.679
Best parameters set:
	clf__alpha: 0.75
	features__pipe__vect__max_df: 0.25
	features__pipe__vect__min_df: 2
	features__pipe__vect__ngram_range: (1, 1)
Test score with best_estimator_: 0.583


Classification Report Test Data
             precision    recall  f1-score   support

         -1       0.20      0.08      0.12        12
          0       0.63      0.87      0.73        39
          1       0.00      0.00      0.00         9

avg / total       0.45      0.58      0.50        60



[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:    5.8s finished


In [67]:
#Naive Bayes with Tfidf Vectorizer
mnb_tfidfVec = grid_vect(mnb, parameters_mnb, X_train, X_test, parameters_vect, vect = tfidfVec)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__alpha': (0.25, 0.5, 0.75),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 36 candidates, totalling 180 fits
done in 5.098s

Best CV score: 0.664
Best parameters set:
	clf__alpha: 0.25
	features__pipe__vect__max_df: 0.25
	features__pipe__vect__min_df: 2
	features__pipe__vect__ngram_range: (1, 2)
Test score with best_estimator_: 0.700


Classification Report Test Data
             precision    recall  f1-score   support

         -1       0.62      0.42      0.50        12
          0       0.73      0.95      0.82        39
          1       0.00      0.00      0.00         9

avg / total       0.60      0.70      0.63        60



[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:    5.0s finished


In [68]:
#Logistic Regression with Count Vectorizer 
lr_countVec = grid_vect(logisticsreg,parameters_logreg,X_train, X_test,parameters_text = parameters_vect, vect = countVec)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__C': (0.25, 0.5, 1.0),
 'clf__penalty': ('l1', 'l2'),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 72 candidates, totalling 360 fits
done in 14.077s

Best CV score: 0.657
Best parameters set:
	clf__C: 0.5
	clf__penalty: 'l2'
	features__pipe__vect__max_df: 0.25
	features__pipe__vect__min_df: 2
	features__pipe__vect__ngram_range: (1, 1)
Test score with best_estimator_: 0.617


Classification Report Test Data
             precision    recall  f1-score   support

         -1       0.25      0.08      0.12        12
          0       0.64      0.92      0.76        39
          1       0.00      0.00      0.00         9

avg / total       0.47      0.62      0.52        60



[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:   14.0s finished
  'precision', 'predicted', average, warn_for)


In [69]:
#Logistic Regression with Tfidf Vectorizer
lr_tfidfVec = grid_vect(logisticsreg,parameters_logreg,X_train, X_test,parameters_text = parameters_vect, vect = tfidfVec)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__C': (0.25, 0.5, 1.0),
 'clf__penalty': ('l1', 'l2'),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 72 candidates, totalling 360 fits
done in 9.158s

Best CV score: 0.621
Best parameters set:
	clf__C: 0.25
	clf__penalty: 'l1'
	features__pipe__vect__max_df: 0.25
	features__pipe__vect__min_df: 1
	features__pipe__vect__ngram_range: (1, 1)
Test score with best_estimator_: 0.650


Classification Report Test Data
             precision    recall  f1-score   support

         -1       0.00      0.00      0.00        12
          0       0.65      1.00      0.79        39
          1       0.00      0.00      0.00         9

avg / total       0.42      0.65      0.51        60



[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:    9.0s finished
  'precision', 'predicted', average, warn_for)
