# Creating Doc2Vec Vectors

In [1]:
import pandas as pd
import numpy as np

np.random.seed(0)

#function reads data file
def read_text_file(f):
    df_complete = pd.read_csv(f)
    df = df_complete.loc[:,["sentiment","comment"]]  #add label to comments
    df.dropna(how="any", inplace=True) #drops columns that are not used
    return df

In [2]:
df = read_text_file("500/cleaned.csv")
print (df)

     sentiment                                            comment
0            1  She present class materials with powerpoint wh...
1            1  The instructor was generally quite good at exp...
2            0  I cant really tell how effective the instructi...
3            1  She did a good job of explaining the logic beh...
4            1  The activities we did in classed where explain...
5            1              It was very informative and thourough
6            0  Sometimes hard to understand and catch up with...
7            1  Teaching was good with lecture and few example...
8            1  The teaching was excellent as a whole there wa...
9            1  Prof Koufakou did a good job teaching the clas...
10           1  The teaching seemed well thought out and the i...
11           1  Your teaching style is good the mixture of lec...
12           1  The information was presented very well with i...
13           1  The instructor taught the lecture to accompany...
14        

In [3]:
#Train Doc2Vec - considering each comment a document
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re # for regular expressions

lmtzr = WordNetLemmatizer()
w = re.compile("\w+",re.I) #Matches Unicode word characters; this includes most characters that can be part of a word in any language

#Doc2vec only receive labeled sentences so the following method creates a label for each comment
def label_sentences(df):
    labeled_sentences = []
    for index, datapoint in df.iterrows():
        tokenized_words = re.findall(w,datapoint["comment"].lower())
        labeled_sentences.append(LabeledSentence(words=tokenized_words, tags=['SENT_%s' %index]))
    return labeled_sentences


def train_doc2vec_model(labeled_sentences):
    model = Doc2Vec(min_count=1, window=8, size=100, alpha=0.025, min_alpha=0.025)
    
    #The following line creates a vocabulary table, digesting all the words and filtering out the unique words, and doing some basic counts on them
    model.build_vocab(labeled_sentences)
    for epoch in range(10):
        #trains Doc2Vec on variable learning rate sequentially decreasing.
        model.train(labeled_sentences,total_examples=model.corpus_count, epochs=model.iter)
        model.alpha -= 0.002 
        model.min_alpha = model.alpha
    
    return model

sen = label_sentences(df)
%time model = train_doc2vec_model(sen) #calls to train the model, and gives the time it takes

Using TensorFlow backend.


Wall time: 4.8 s


## Inferring a Vector¶
One important thing to note is that you can now infer a vector for any piece of text without having to re-train the model by passing a list of words to the model.infer_vector function. This vector can then be compared with other vectors via cosine similarity.

In [4]:
model.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])

array([ 0.01613971, -0.04091503,  0.00876203, -0.01167556, -0.01995205,
       -0.00318355,  0.0410048 ,  0.02143336, -0.01548649, -0.0450041 ,
        0.00122282,  0.01447786, -0.04755984,  0.00333679, -0.00152568,
        0.02429867, -0.01075239,  0.0137948 ,  0.01199499, -0.04730212,
       -0.03136966, -0.03100823, -0.04372935, -0.01686253,  0.03164188,
        0.01725915, -0.00215421, -0.01616173,  0.01221084, -0.01466749,
       -0.02699424,  0.0463641 ,  0.04878873, -0.01369095, -0.01290173,
       -0.02132636, -0.03562954, -0.08159862, -0.01800677,  0.05490265,
        0.03383458,  0.02048385, -0.01209165, -0.02024756,  0.04544707,
        0.05536291, -0.03576288, -0.02892365, -0.03904415,  0.00911469,
       -0.00288884,  0.03717517,  0.00210875,  0.01011741,  0.00641064,
       -0.00916962,  0.05623824, -0.01612606, -0.00252074, -0.00154954,
        0.03942744,  0.00510509, -0.00520389,  0.03616713,  0.00219202,
        0.09004503,  0.01241444,  0.01579709, -0.0244659 ,  0.01

In [5]:
#This shows the vector for the first comment labeled as SENT_0
model.docvecs['SENT_0']

array([  3.68987322e-02,  -1.34596065e-01,   1.20176189e-02,
        -5.41564040e-02,  -1.38594434e-01,   6.60225004e-02,
         3.47515084e-02,   4.00989205e-02,  -2.24182624e-02,
        -3.63116749e-02,  -5.61324768e-02,   6.80981427e-02,
        -8.86763260e-02,   2.62290295e-02,   1.25036672e-01,
         7.40186572e-02,   2.35537700e-02,   5.90865910e-02,
         3.97029631e-02,  -5.40015623e-02,  -2.97947600e-02,
        -5.16041405e-02,  -9.29372013e-02,  -8.47185850e-02,
         1.45436883e-01,  -2.78214320e-05,  -3.22634429e-02,
        -9.27570462e-02,  -4.28357068e-03,   4.60200273e-02,
        -8.40533674e-02,   6.03971891e-02,   1.19356319e-01,
        -1.08101949e-01,  -3.44229974e-02,  -5.53229526e-02,
        -1.48277357e-02,  -6.87017590e-02,  -7.39986151e-02,
         1.01892285e-01,   1.30582452e-01,   1.03756793e-01,
        -4.45642024e-02,   2.01259479e-02,   5.93720600e-02,
         1.30347580e-01,  -3.20442840e-02,   4.18784702e-03,
        -1.10545151e-01,

In [6]:
model.most_similar('good')

[('teaching', 0.8877487778663635),
 ('alright', 0.8867093920707703),
 ('keeping', 0.8856614828109741),
 ('instructors', 0.8766233921051025),
 ('pretty', 0.8694695234298706),
 ('overall', 0.8670663833618164),
 ('job', 0.8608330488204956),
 ('jo', 0.8493382334709167),
 ('great', 0.8398537635803223),
 ('hat', 0.8341473937034607)]

In [7]:
#Saving and Loading Models
model.save('./surveyVectors.d2v')

In [8]:
#The following method stores the vectorized comments in the array comments[]
#and stores its label in the array y. This is done so that we can do
#the classification using X and y values.
def vectorize_comments(df,d2v_model):
    y = []
    comments = []
    for i in range(0,df.shape[0]):
        label = 'SENT_%s' %i
        comments.append(d2v_model.docvecs[label])
    df['vectorized_comments'] = comments
    
    return df

df = vectorize_comments(df,model)
print (df.head(1))

   sentiment                                            comment  \
0          1  She present class materials with powerpoint wh...   

                                 vectorized_comments  
0  [0.0368987, -0.134596, 0.0120176, -0.0541564, ...  


In [9]:
#Save vectorized datafrane to pckl file
df.to_pickle('vectorComments')

In [None]:
#Cross validation
from sklearn import cross_validation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold


import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pickle

#metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

# classifiers

from keras.models import Sequential
from keras.layers import Dense, Activation
from sklearn.model_selection import train_test_split
from keras.wrappers.scikit_learn import KerasClassifier
import keras.backend as K
from sklearn.metrics import fbeta_score
import numpy



In [None]:
def precision(y_true, y_pred):
    # Calculates the precision
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def recall(y_true, y_pred):
    # Calculates the recall
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def fmeasure(y_true, y_pred):
    # Calculates the f-measure, the harmonic mean of precision and recall.
    f = 2*(precision(y_true, y_pred)*recall(y_true, y_pred))/(precision(y_true, y_pred)+recall(y_true, y_pred))
    #return fbeta_score(y_true, y_pred, beta=1)
    return f

In [None]:
# Function to create model, required for KerasClassifier
def create_model(optimizer='rmsprop', init='glorot_uniform'):
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy', recall, precision,fmeasure])
    return model

In [None]:
kerasModel = KerasClassifier(build_fn=create_model, verbose=2)

# grid search epochs, batch size and optimizer
optimizers = ['rmsprop', 'adam']
init = ['glorot_uniform', 'normal', 'uniform']
epochs = [10]
batches = [100]
param_grid = dict(optimizer=optimizers, epochs=epochs, batch_size=batches, init=init)

Kgrid = GridSearchCV(estimator=kerasModel, param_grid=param_grid, cv=10)

In [None]:
#Convert data arrays to lists
X=df["vectorized_comments"].T.tolist()
y=df["sentiment"].T.tolist()

In [None]:
Kgrid.fit(numpy.asarray(X), numpy.asarray(y))

In [None]:
len(X)

In [None]:
# create a list of the mean scores only
Kgrid.grid_scores_

In [None]:
# examine the best model
print(Kgrid.best_score_)
print(Kgrid.best_params_)
print(Kgrid.best_estimator_)

In [None]:
from sklearn.metrics import confusion_matrix
y1_pred = Kgrid.best_estimator_.predict(numpy.asarray(X))
print(confusion_matrix(y, y1_pred))