# Part 0: Preprocessing

In [1]:
# Import modules
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import random

In [2]:
# Silence warnings for the rest of the session.
# NOTE(review): the intent was to hide sklearn deprecation warnings, but this filter
# has no category or module restriction, so every warning is suppressed.

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Data and fitted models live in sibling folders of the notebook's working directory,
# i.e. directly beneath the parent of the current directory.
project_root = os.path.dirname(os.getcwd())
data_dir = os.path.join(project_root, 'Data')
model_dir = os.path.join(project_root, 'Model')

In [4]:
# Input files: the raw training CSV sits directly in the data directory,
# the two preprocessed artefacts sit in its 'interim' sub-folder.
interim_dir = os.path.join(data_dir, 'interim')

train_path = os.path.join(data_dir, 'train.csv')
train_processed_path = os.path.join(interim_dir, 'train_preprocessed.txt')
meta_feat_path = os.path.join(interim_dir, 'meta_feat.txt')

In [5]:
# Load the raw training CSV and the two preprocessed artefacts.
# The artefacts carry a .txt extension but are parsed as JSON.
train = pd.read_csv(train_path)
train_processed = pd.read_json(train_processed_path)
meta_feat = pd.read_json(meta_feat_path)

In [6]:
meta_feat.head()

Unnamed: 0,sentiment,n_upper,word_count,char_count,avg_wlen,adj_drug_count,n_stop,n_num,drug_category
0,2,1,76,404,5.315789,0.026316,28,0,38
1,2,5,206,1184,5.747573,0.0,85,0,38
2,2,4,135,780,5.777778,0.007407,50,3,35
3,2,1,20,124,6.2,0.1,5,0,64
4,1,8,115,612,5.321739,0.0,44,0,38


# Part 1: Machine Learning

In [7]:
# Import scikit-learn components, project helpers and pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from custom_function.ml import create_classifier, evaluate_classifier, get_model_results
import pickle

# <font color='Blue'>Baseline Model</font>

# Baseline design matrix: every meta-feature column except the target
y = meta_feat.sentiment
X = meta_feat.drop("sentiment", axis = 1)
indices = train_processed.index

# 80/20 hold-out split with a fixed seed; the row indices are split alongside
Xtrain, Xtest, ytrain, ytest, itrain, itest = train_test_split(
    X, y, indices,
    train_size = 0.8,
    random_state = 42,
)

%%time

filename = 'baseline_lr.sav'

# Only the regularisation strength is searched for the baseline
parameter = {'model__C':[0.01, 0.1, 1, 10, 100]}

# Pass a managed file handle so it is flushed and closed as soon as the search
# returns; the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            LogisticRegression(solver = 'lbfgs', class_weight = 'balanced'),
                            parameter,
                            5,
                            'f1_macro',
                            model_file,
                            preprocess = StandardScaler())

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

# How often each class is predicted on the hold-out set
unique, counts = np.unique(clf.predict(Xtest), return_counts=True)
dict(zip(unique, counts))

# <font color='Blue'>Bag of Words models</font>

In [8]:
# Text models take the preprocessed document text as their only input
y = train_processed.sentiment
X = train_processed.text
indices = train_processed.index

# 80/20 hold-out split with a fixed seed; the row indices are split alongside
Xtrain, Xtest, ytrain, ytest, itrain, itest = train_test_split(
    X, y, indices,
    train_size = 0.8,
    random_state = 42,
)

## <font color='green'>Naive Bayes - BoW</font>

%%time

filename = 'nb_bow.sav'

# 'pre__*' keys tune the vectoriser step, 'model__*' keys tune the classifier step.
# min_df mixes float values (share of documents) with int values (document counts).
parameter = {'pre__min_df':[0.01, 0.1, 1, 10],
             'pre__ngram_range':[(1,1),(1,2)],
             'model__alpha':[0.01, 0.1, 1, 10]}

# Pass a managed file handle so it is flushed and closed as soon as the search
# returns; the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            MultinomialNB(),
                            parameter,
                            5,
                            'f1_macro',
                            model_file,
                            preprocess = CountVectorizer())

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

## <font color='green'>Naive Bayes - Tfidf</font>

%%time

filename = 'nb_tfidf.sav'

# 'pre__*' keys tune the vectoriser step, 'model__*' keys tune the classifier step
parameter = {'pre__min_df':[0.01, 0.1, 1, 10],
             'pre__ngram_range':[(1,1),(1,2)],
             'model__alpha':[0.01, 0.1, 1, 10]}

# Pass a managed file handle so it is flushed and closed as soon as the search
# returns; the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            MultinomialNB(),
                            parameter,
                            5,
                            'f1_macro',
                            model_file,
                            preprocess = TfidfVectorizer())

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

##  <font color='green'>Random Forest - BoW</font>

%%time

filename = 'rf_bow.sav'

# 'auto' was dropped from max_features: for a classifier it means sqrt(n_features),
# so it only re-fitted the 'sqrt' candidates a second time.
parameter = {'pre__min_df':[0.01, 0.1, 1, 10],
             'pre__ngram_range':[(1,1),(1,2)],
             'model__n_estimators':[10 , 20, 40],
             'model__max_features': ['sqrt', 'log2']}

# The forest is seeded so the search is reproducible across runs.
# The managed file handle is closed as soon as the search returns;
# the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            RandomForestClassifier(random_state = 42),
                            parameter,
                            5,
                            'f1_macro',
                            model_file,
                            preprocess = CountVectorizer())

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

##  <font color='green'>Random Forest - TfIdf</font>

%%time

filename = 'rf_tfidf.sav'

# 'auto' was dropped from max_features: for a classifier it means sqrt(n_features),
# so it only re-fitted the 'sqrt' candidates a second time.
parameter = {'pre__min_df':[0.01, 0.1, 1, 10],
             'pre__ngram_range':[(1,1),(1,2)],
             'model__n_estimators':[10 , 20, 40],
             'model__max_features': ['sqrt', 'log2']}

# The forest is seeded so the search is reproducible across runs.
# The managed file handle is closed as soon as the search returns;
# the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            RandomForestClassifier(random_state = 42),
                            parameter,
                            5,
                            'f1_macro',
                            model_file,
                            preprocess = TfidfVectorizer())

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

##  <font color='green'>SVC - BoW</font>

%%time

filename = 'svc_bow.sav'

# NOTE(review): gamma has no effect when kernel='linear', so the linear candidates
# are fitted once per gamma value with identical results.
parameter = {'pre__min_df':[0.01, 0.1, 1, 10],
             'pre__ngram_range':[(1,1),(1,2)],
             'model__C':[0.01, 0.1, 1],
             'model__kernel':['linear', 'poly', 'rbf'],
             'model__gamma':['auto', 'scale']}

# 3-fold CV here (the other text models use 5).
# The managed file handle is closed as soon as the search returns;
# the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            SVC(),
                            parameter,
                            3,
                            'f1_macro',
                            model_file,
                            preprocess = CountVectorizer())

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

##  <font color='green'>SVC - TfIdf</font>

%%time

filename = 'svc_tfidf.sav'

# NOTE(review): gamma has no effect when kernel='linear', so the linear candidates
# are fitted once per gamma value with identical results.
parameter = {'pre__min_df':[0.01, 0.1, 1, 10],
             'pre__ngram_range':[(1,1),(1,2)],
             'model__C':[0.01, 0.1, 1],
             'model__kernel':['linear', 'poly', 'rbf'],
             'model__gamma':['auto', 'scale']}

# 3-fold CV here (the other text models use 5).
# The managed file handle is closed as soon as the search returns;
# the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            SVC(),
                            parameter,
                            3,
                            'f1_macro',
                            model_file,
                            preprocess = TfidfVectorizer())

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

##  <font color='green'>Logistic Regression - BoW</font>

In [9]:
%%time

filename = 'lr_bow.sav'

# 'pre__*' keys tune the vectoriser step, 'model__*' keys tune the classifier step
parameter = {'pre__min_df':[0.01, 0.1, 1, 10],
             'pre__ngram_range':[(1,1),(1,2)],
             'model__C':[0.01, 0.1, 1, 10, 100]}

# Pass a managed file handle so it is flushed and closed as soon as the search
# returns; the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            LogisticRegression(solver = 'lbfgs', class_weight = 'balanced'),
                            parameter,
                            5,
                            'f1_macro',
                            model_file,
                            preprocess = CountVectorizer())

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   43.2s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   54.5s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 10

0.48998528133196806
[[ 37  19  55]
 [ 15  75  85]
 [ 89 105 576]]
{'model__C': 0.01, 'pre__min_df': 1, 'pre__ngram_range': (1, 2)}
Wall time: 12min 41s


##  <font color='green'>Logistic Regression - TfIdf</font>

%%time

filename = 'lr_tfidf.sav'

# 'pre__*' keys tune the vectoriser step, 'model__*' keys tune the classifier step
parameter = {'pre__min_df':[0.01, 0.1, 1, 10],
             'pre__ngram_range':[(1,1),(1,2)],
             'model__C':[0.01, 0.1, 1, 10, 100]}

# Pass a managed file handle so it is flushed and closed as soon as the search
# returns; the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            LogisticRegression(solver = 'lbfgs', class_weight = 'balanced'),
                            parameter,
                            5,
                            'f1_macro',
                            model_file,
                            preprocess = TfidfVectorizer())

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

##  <font color='green'>SGD - BoW</font>

%%time

filename = 'sgd_bow.sav'

# 'pre__*' keys tune the vectoriser step, 'model__*' keys tune the classifier step
parameter = {'pre__min_df':[0.01, 0.1, 1, 10],
             'pre__ngram_range':[(1,1),(1,2)],
             'model__alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]}

# SGD shuffles the training data, so it is seeded for reproducible scores.
# The managed file handle is closed as soon as the search returns;
# the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            SGDClassifier(max_iter = 1000, loss = 'log', penalty = 'l2', random_state = 42),
                            parameter,
                            5,
                            'f1_macro',
                            model_file,
                            preprocess = CountVectorizer())

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

##  <font color='green'>SGD - TfIdf</font>

%%time

filename = 'sgd_tfidf.sav'

# 'pre__*' keys tune the vectoriser step, 'model__*' keys tune the classifier step
parameter = {'pre__min_df':[0.01, 0.1, 1, 10],
             'pre__ngram_range':[(1,1),(1,2)],
             'model__alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]}

# SGD shuffles the training data, so it is seeded for reproducible scores.
# The managed file handle is closed as soon as the search returns;
# the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            SGDClassifier(max_iter = 1000, loss = 'log', penalty = 'l2', random_state = 42),
                            parameter,
                            5,
                            'f1_macro',
                            model_file,
                            preprocess = TfidfVectorizer())

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

In [10]:
train_processed.text[3]

'interesting grand merci wonder lemtrada ocrevus sale would go prove anti-cd20 induction'

In [11]:
# File names of the persisted bag-of-words / TF-IDF models, one pair per algorithm.
# NOTE(review): get_model_results presumably loads each file from model_dir and scores
# it on the current Xtest — confirm. A file saved from an earlier dataset may have
# been trained on rows that are now in Xtest, which would inflate its score.
models = ['nb_bow.sav', 'nb_tfidf.sav', 
          'rf_bow.sav', 'rf_tfidf.sav', 
          'svc_bow.sav', 'svc_tfidf.sav', 
          'lr_bow.sav', 'lr_tfidf.sav',
          'sgd_bow.sav', 'sgd_tfidf.sav']

results = get_model_results(models, model_dir, Xtest, ytest)
results

Unnamed: 0,model,f1_macro
0,nb_bow.sav,0.852339
1,nb_tfidf.sav,0.775414
2,rf_bow.sav,0.893952
3,rf_tfidf.sav,0.900964
4,svc_bow.sav,0.85694
5,svc_tfidf.sav,0.487064
6,lr_bow.sav,0.489985
7,lr_tfidf.sav,0.747727
8,sgd_bow.sav,0.816405
9,sgd_tfidf.sav,0.550691


<div class="alert alert-block alert-warning">
<b>Note:</b> The Logistic Regression model trained on the previous dataset performed much better than the one trained on this dataset.
</div>

In [12]:
# Reload the persisted TF-IDF logistic-regression search object.
# The context manager closes the handle; the bare open(...) used before never did.
# NOTE: pickle.load can execute arbitrary code — only load model files you created yourself.
with open(os.path.join(model_dir, 'lr_tfidf.sav'), 'rb') as model_file:
    clf = pickle.load(model_file)

In [13]:
clf

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('pre', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...enalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'pre__min_df': [0.01, 0.1, 1, 10], 'pre__ngram_range': [(1, 1), (1, 2)], 'model__C': [0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='f1_macro', verbose=10)

In [14]:
# Predict once and reuse the result for both metrics (it was computed twice before)
ytest_pred = clf.predict(Xtest)
print(confusion_matrix(ytest, ytest_pred))
print(f1_score(ytest, ytest_pred, average = 'macro'))

[[ 94   6  11]
 [  5 149  21]
 [ 77  82 611]]
0.7477266152438963


In [15]:
clf

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('pre', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...enalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'pre__min_df': [0.01, 0.1, 1, 10], 'pre__ngram_range': [(1, 1), (1, 2)], 'model__C': [0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='f1_macro', verbose=10)

In [None]:
# Predict once and reuse the result for both metrics (it was computed twice before)
ytest_pred = clf.predict(Xtest)
print(confusion_matrix(ytest, ytest_pred))
print(f1_score(ytest, ytest_pred, average = 'macro'))

# Stacked Model

## Stacking Meta and BoW Features

In [None]:
# Build the stacked design matrix: TF-IDF n-grams side by side with scaled meta features.
# The rows are split BEFORE the vectoriser and scaler are fitted, so neither sees the
# held-out 20%; fitting them on the full corpus leaked test-set statistics.
# NOTE(review): assumes meta_feat and train_processed are row-aligned, as the original
# positional hstack already did — confirm.

from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import train_test_split

# Derive y and the raw inputs

y = train_processed.sentiment
indices = train_processed.index
meta_raw = meta_feat.drop("sentiment", axis = 1)

# Derive train/test set (same ratio and seed as the other sections)

(text_train, text_test,
 meta_train, meta_test,
 ytrain, ytest,
 itrain, itest) = train_test_split(train_processed.text, meta_raw, y, indices,
                                   train_size = 0.8, random_state = 42)

# Vectorizing the sparse features: vocabulary and IDF come from training rows only

vec = TfidfVectorizer(min_df = 15, ngram_range=(1, 2))
sparse_train = vec.fit_transform(text_train)
sparse_test = vec.transform(text_test)

# Scaling meta features: min/max come from training rows only. Test values are
# clipped to [0, 1] so the matrix stays non-negative for MultinomialNB.

scaler = MinMaxScaler()
meta_train_scaled = scaler.fit_transform(meta_train)
meta_test_scaled = np.clip(scaler.transform(meta_test), 0, 1)

# Combine the features; CSR keeps the result row-indexable for cross-validation

Xtrain = hstack([sparse_train, csr_matrix(meta_train_scaled)]).tocsr()
Xtest = hstack([sparse_test, csr_matrix(meta_test_scaled)]).tocsr()

filename = 'lr_stacked.sav'

# No preprocess step is passed here, so only 'model__*' keys are searched
parameter = {'model__C':[0.01, 0.1, 1, 10, 100, 1000]}

# Pass a managed file handle so it is flushed and closed as soon as the search
# returns; the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            LogisticRegression(solver = 'lbfgs', class_weight = 'balanced'),
                            parameter,
                            5,
                            'f1_macro',
                            model_file)

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

### Logistic Regression

### Naive Bayes

filename = 'nb_stacked.sav'

# No preprocess step is passed here, so only 'model__*' keys are searched
parameter = {'model__alpha':[0.01, 0.1, 1, 10, 100]}

# Pass a managed file handle so it is flushed and closed as soon as the search
# returns; the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            MultinomialNB(),
                            parameter,
                            5,
                            'f1_macro',
                            model_file)

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

### SVC

filename = 'svc_stacked.sav'

# NOTE(review): gamma has no effect when kernel='linear', so the linear candidates
# are fitted once per gamma value with identical results.
parameter = {'model__C':[0.01, 0.1, 1],
             'model__kernel':['linear', 'poly', 'rbf'],
             'model__gamma':['auto', 'scale']}

# 3-fold CV here (the LR and NB stacked models use 5).
# The managed file handle is closed as soon as the search returns;
# the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            SVC(),
                            parameter,
                            3,
                            'f1_macro',
                            model_file)

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

### SGD

filename = 'sgd_stacked.sav'

# No preprocess step is passed here, so only 'model__*' keys are searched
parameter = {'model__alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]}

# SGD shuffles the training data, so it is seeded for reproducible scores.
# 3-fold CV here (the LR and NB stacked models use 5).
# The managed file handle is closed as soon as the search returns;
# the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            SGDClassifier(random_state = 42),
                            parameter,
                            3,
                            'f1_macro',
                            model_file)

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

In [None]:
# Compare the four persisted stacked models on the stacked hold-out matrix.
# NOTE(review): get_model_results presumably loads each file from model_dir and
# scores it on Xtest — confirm against custom_function.ml.
models = ['nb_stacked.sav', 'lr_stacked.sav', 'svc_stacked.sav', 'sgd_stacked.sav']
results = get_model_results(models, model_dir, Xtest, ytest)
results

# Embedding models

## Word2vec custom trained

In [None]:
from gensim.models import Word2Vec, KeyedVectors
import multiprocessing

In [None]:
def get_mean_vector(word2vec_model, n_dim, doc):
    """
    Average the embeddings of a document's in-vocabulary tokens.

    Parameters
    ----------
    word2vec_model : object
        A trained embedding model that keeps its vectors under ``.wv``, or a bare
        keyed-vectors object. The vectors object must expose a ``vocab`` container
        and support lookup with a list of words.
    n_dim : int
        Length of the zero vector returned when no token is in the vocabulary.
    doc : iterable of str
        Tokens of a single document.

    Returns
    -------
    numpy.ndarray
        Column-wise mean of the looked-up word vectors, or ``np.zeros(n_dim)`` when
        the document has no in-vocabulary token. Both branches now return an
        ndarray; the fallback used to be a plain list.
    """
    # Full models keep their vectors under .wv; anything without that attribute is
    # treated as the vectors object itself. Looking words up on the vectors object
    # avoids the deprecated model[...] access on the full model.
    keyed_vectors = getattr(word2vec_model, 'wv', word2vec_model)

    # remove out-of-vocabulary words
    word_list = [word for word in doc if word in keyed_vectors.vocab]

    # Nothing to average: fall back to a zero vector of the requested size
    if not word_list:
        return np.zeros(n_dim)

    return np.mean(keyed_vectors[word_list], axis=0)

In [None]:
%%time
#### Create Word2vec word embeddings
# Search over the Word2Vec context window (odd sizes 1..19). Each embedding is scored
# by a logistic regression trained on mean document vectors.
n_dim = 400
w2v_window_grid = {}

# Loop-invariant setup, hoisted out of the loop: tokenised corpus and core count
sentences = [row.split() for row in train_processed.text]
cores = multiprocessing.cpu_count()

for n_window in np.arange(1,20,2):

    w2v_model = Word2Vec(min_count=10,
                         window=n_window,
                         size=n_dim,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         workers=max(1, cores-1))  # never request 0 workers on a single-core machine

    w2v_model.build_vocab(sentences)

    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30)

    #### Derive word2vec features

    # Create a dictionary with the vocabs and its embeddings
    w2v = dict(zip(w2v_model.wv.index2word, w2v_model.wv.syn0))

    # Initialize a vector features dictionary
    vec_features = {}

    # Document embedding = mean of its in-vocabulary word vectors.
    # Series.items() replaces the deprecated iteritems() and yields the same pairs.
    for index, doc in train_processed.text.items():
        vec_features[index] = get_mean_vector(w2v_model, n_dim, doc.split(' '))

    # Create a document feature dataframe (one row per document)
    vec_features_df = pd.DataFrame.from_dict(vec_features, 'index')

    #### Evaluate classifier with given word2vec embeddings

    # Define X and y

    X = vec_features_df
    y = train_processed.sentiment
    indices = train_processed.index

    # Split train and test set

    Xtrain, Xtest, ytrain, ytest, itrain, itest = train_test_split(X, y, indices, train_size = 0.8, random_state = 42)

    # Specify save file for model
    # NOTE(review): the same file is rewritten on every iteration, so only the model
    # for the last window size remains on disk.
    filename = 'lr_w2v.sav'

    # Specify parameter grid
    parameter = {'model__C':[0.01, 0.1, 1, 10, 100]}

    # Run grid search to create best classifier of type.
    # The managed file handle is closed as soon as the search returns;
    # the bare open(...) used before was never closed.
    with open(os.path.join(model_dir, filename), 'wb') as model_file:
        clf = create_classifier(Xtrain,
                                ytrain,
                                LogisticRegression(solver = 'lbfgs', class_weight = 'balanced'),
                                parameter,
                                5,
                                'f1_macro',
                                model_file)

    # Extract classifier score
    f1macro_score, cm = evaluate_classifier(clf,
                                           Xtest,
                                           ytest)

    # Save score
    w2v_window_grid[n_window] = f1macro_score

    # Print score; the window size is printed too because it is what varies per iteration
    print("The word embedding dimension number is", n_dim)
    print("The word2vec window size is", n_window)
    print(f1macro_score)
    print(cm)
    print(clf.best_params_)

In [None]:
# One row per window size, with the macro-F1 score as the single column
w2v_window_df = pd.DataFrame.from_dict(w2v_window_grid, orient = 'index' )

In [None]:
# Persist the window-search scores as JSON (in a .txt file) under Data/interim
w2v_window_df.to_json(os.path.join(data_dir, 'interim', 'w2v_window_search_results.txt'))

In [None]:
# Reload the saved scores so plotting does not require re-running the search
w2v_window_df = pd.read_json(os.path.join(data_dir, 'interim', 'w2v_window_search_results.txt'))

In [None]:
# Macro-F1 as a function of the Word2Vec context-window size.
# The legend is switched off through the plot call instead of plt.legend(''),
# and the x axis is labelled with the parameter that is actually varied.
fig, ax = plt.subplots()
w2v_window_df.sort_index().plot(ax=ax, legend=False)
ax.set_title('Word2Vec window Search')
ax.set_xlabel('Window size')
ax.set_ylabel('F1 Macro Score')
plt.show()

## Word2Vec Google Pre-Trained Model

In [None]:
# Load Google's pre-trained Word2Vec model.
# The file is read in binary word2vec format from <model_dir>/embeddings.

w2v_google_model = KeyedVectors.load_word2vec_format(os.path.join(model_dir, 'embeddings', 'GoogleNews-vectors-negative300.bin'), binary=True)

In [None]:
# Create a dictionary with the vocabs and its embeddings
w2v = dict(zip(w2v_google_model.wv.index2word, w2v_google_model.wv.syn0))

# Initialize a vector features dictionary
vec_features = {}

# Document embedding = mean of its in-vocabulary word vectors (300 dimensions passed
# for the zero-vector fallback).
# Series.items() replaces the deprecated iteritems() and yields the same pairs.
for index, doc in train_processed.text.items():
    vec_features[index] = get_mean_vector(w2v_google_model, 300, doc.split(' '))

# Create a document feature dataframe (one row per document)
vec_features_df = pd.DataFrame.from_dict(vec_features, 'index')

#### Evaluate classifier with given word2vec embeddings

# Define X and y

X = vec_features_df
y = train_processed.sentiment
indices = train_processed.index

# Split train and test set

Xtrain, Xtest, ytrain, ytest, itrain, itest = train_test_split(X, y, indices, train_size = 0.8, random_state = 42)

# Specify save file for model
filename = 'lr_w2v_pretrained_google.sav'

# Specify parameter grid
parameter = {'model__C':[0.01, 0.1, 1, 10, 100]}

# Run grid search to create best classifier of type.
# The managed file handle is closed as soon as the search returns;
# the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            LogisticRegression(solver = 'lbfgs', class_weight = 'balanced'),
                            parameter,
                            5,
                            'f1_macro',
                            model_file)

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

## GloVe Stanford Pre-trained

import spacy

# NOTE(review): within this notebook `nlp` is only used to count the available vectors;
# the GloVe features below are built from the gensim-loaded file, not from spaCy.
nlp = spacy.load('en_vectors_web_lg')

total_vectors = len(nlp.vocab.vectors)
print('Total word vectors:', total_vectors)

# Convert the GloVe text file to word2vec text format so gensim can load it.
# The converted file is written next to the input file.
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = os.path.join(model_dir, 'embeddings', 'glove.6B.300d.txt')
word2vec_output_file = os.path.join(model_dir, 'embeddings','glove.6B.300d.txt.word2vec')
glove2word2vec(glove_input_file, word2vec_output_file)

In [None]:
# Load the converted GloVe vectors (text word2vec format, hence binary=False).
# The path is rebuilt here so this cell can run without repeating the conversion.
from gensim.scripts.glove2word2vec import glove2word2vec
word2vec_output_file = os.path.join(model_dir, 'embeddings','glove.6B.300d.txt.word2vec')
w2v_glove_stanford_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [None]:
# Create a dictionary with the vocabs and its embeddings
w2v = dict(zip(w2v_glove_stanford_model.wv.index2word, w2v_glove_stanford_model.wv.syn0))

# Initialize a vector features dictionary
vec_features = {}

# Document embedding = mean of its in-vocabulary word vectors (300 dimensions passed
# for the zero-vector fallback).
# Series.items() replaces the deprecated iteritems() and yields the same pairs.
for index, doc in train_processed.text.items():
    vec_features[index] = get_mean_vector(w2v_glove_stanford_model, 300, doc.split(' '))

# Create a document feature dataframe (one row per document)
vec_features_df = pd.DataFrame.from_dict(vec_features, 'index')

#### Evaluate classifier with given word2vec embeddings

# Define X and y

X = vec_features_df
y = train_processed.sentiment
indices = train_processed.index

# Split train and test set

Xtrain, Xtest, ytrain, ytest, itrain, itest = train_test_split(X, y, indices, train_size = 0.8, random_state = 42)

# Specify save file for model
filename = 'lr_w2v_pretrained_glove_stanford.sav'

# Specify parameter grid
parameter = {'model__C':[0.01, 0.1, 1, 10, 100]}

# Run grid search to create best classifier of type.
# The managed file handle is closed as soon as the search returns;
# the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            LogisticRegression(solver = 'lbfgs', class_weight = 'balanced'),
                            parameter,
                            5,
                            'f1_macro',
                            model_file)

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

## FastText

In [None]:
import nltk
from gensim.models.fasttext import FastText

# Tokenise every document of the corpus.
# The comprehension now tokenises its own loop variable. Before, it passed `doc`,
# a leftover from an earlier loop, so every entry was the same document's tokens.
wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in train_processed.text]

In [None]:
# FastText hyper-parameters
feature_size = 300      # word vector dimensionality
window_context = 50     # context window size
min_word_count = 5      # minimum word count
sample = 1e-3           # downsample setting for frequent words

# Gather the keyword arguments first, then construct the model from the tokenised corpus
ft_kwargs = dict(size=feature_size,
                 window=window_context,
                 min_count=min_word_count,
                 sample=sample,
                 sg=1,
                 iter=50)

ft_model = FastText(tokenized_corpus, **ft_kwargs)

In [None]:
# Create a dictionary with the vocabs and its embeddings
w2v = dict(zip(ft_model.wv.index2word, ft_model.wv.syn0))

# Initialize a vector features dictionary
vec_features = {}

# Document embedding = mean of its in-vocabulary word vectors (300 dimensions passed
# for the zero-vector fallback).
# Series.items() replaces the deprecated iteritems() and yields the same pairs.
for index, doc in train_processed.text.items():
    vec_features[index] = get_mean_vector(ft_model, 300, doc.split(' '))

# Create a document feature dataframe (one row per document)
vec_features_df = pd.DataFrame.from_dict(vec_features, 'index')

#### Evaluate classifier with given word2vec embeddings

# Define X and y

X = vec_features_df
y = train_processed.sentiment
indices = train_processed.index

# Split train and test set

Xtrain, Xtest, ytrain, ytest, itrain, itest = train_test_split(X, y, indices, train_size = 0.8, random_state = 42)

# Specify save file for model
filename = 'lr_w2v_fasttext.sav'

# Specify parameter grid
parameter = {'model__C':[0.01, 0.1, 1, 10, 100]}

# Run grid search to create best classifier of type.
# The managed file handle is closed as soon as the search returns;
# the bare open(...) used before was never closed.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    clf = create_classifier(Xtrain,
                            ytrain,
                            LogisticRegression(solver = 'lbfgs', class_weight = 'balanced'),
                            parameter,
                            5,
                            'f1_macro',
                            model_file)

# Extract classifier score
f1macro_score, cm = evaluate_classifier(clf,
                                       Xtest,
                                       ytest)

# Print score
print(f1macro_score)
print(cm)
print(clf.best_params_)

## Results

# One row per vocabulary word, one column per embedding dimension.
# NOTE(review): `w2v` is whichever embedding dictionary was built most recently (the
# FastText one when the notebook runs top to bottom), while the neighbours below are
# queried from `w2v_model`, the custom-trained Word2Vec model — confirm this is intended.
weights = pd.DataFrame(w2v).transpose()

# For each probe term, keep the words of its five most similar vocabulary entries
similar_words = {search_term: [item[0] for item in w2v_model.wv.most_similar([search_term], topn=5)]
                  for search_term in ['patient', 'drug', 'therapy', 'treatment', 'vaccine', 'health', 'public','effective']}
similar_words

from sklearn.manifold import TSNE

# Flatten {term: [neighbours]} into one list, each probe term followed by its neighbours.
# The comprehension replaces sum(..., []), which re-copies the list on every addition.
words = [word
         for term, neighbours in similar_words.items()
         for word in [term] + neighbours]
wvs = w2v_model.wv[words]

# Project the selected word vectors to two dimensions
tsne = TSNE(n_components=2, random_state=0, n_iter=10000, perplexity=2)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(wvs)
labels = words

plt.figure(figsize=(14, 8))
plt.scatter(T[:, 0], T[:, 1], c='orange', edgecolors='r')
# The loop variables are deliberately not called x / y: at cell level that would
# overwrite the notebook's target variable `y`.
for label, x_pos, y_pos in zip(labels, T[:, 0], T[:, 1]):
    plt.annotate(label, xy=(x_pos+1, y_pos+1), xytext=(0, 0), textcoords='offset points')
plt.show()