In [1]:
import pandas as pd
import numpy as np

In [2]:
np.random.seed(0)

In [3]:
#function reads data file
def read_text_file(f):
    df_complete = pd.read_csv(f)
    df = df_complete.loc[:,["sentiment","comment"]] 
    df.dropna(how="any", inplace=True) #drops columns that are not used
    return df


In [4]:
df = read_text_file("500/cleaned.csv")
print (df.head())

   sentiment                                            comment
0          1  She present class materials with powerpoint wh...
1          1  The instructor was generally quite good at exp...
2          0  I cant really tell how effective the instructi...
3          1  She did a good job of explaining the logic beh...
4          1  The activities we did in classed where explain...


In [5]:
#Train Doc2Vec - considering each comment a document
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re

lmtzr = WordNetLemmatizer()
w = re.compile("\w+",re.I)

#Doc2vec only receive labeled sentences so the following method creates a label for each comment
def label_sentences(df):
    labeled_sentences = []
    for index, datapoint in df.iterrows():
        tokenized_words = re.findall(w,datapoint["comment"].lower())
        labeled_sentences.append(LabeledSentence(words=tokenized_words, tags=['SENT_%s' %index]))
    return labeled_sentences


def train_doc2vec_model(labeled_sentences):
    model = Doc2Vec(min_count=1, window=8, size=100, alpha=0.025, min_alpha=0.025)
    
    #The following line creates a vocabulary table, digesting all the words and filtering out the unique words, and doing some basic counts on them
    model.build_vocab(labeled_sentences)
    for epoch in range(10):
        #trains Doc2Vec on variable learning rate sequentially decreasing.
        model.train(labeled_sentences,total_examples=model.corpus_count, epochs=model.iter)
        model.alpha -= 0.002 
        model.min_alpha = model.alpha
    
    return model

sen = label_sentences(df)
%time model = train_doc2vec_model(sen) #calls to train the model, and gives the time it takes

Using TensorFlow backend.


Wall time: 2.39 s


### Inferring a Vector¶ 

One important thing to note is that you can now infer a vector for any piece of text without having to re-train the model by passing a list of words to the model.infer_vector function. This vector can then be compared with other vectors via cosine similarity.

In [6]:
model.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])

array([ -2.55407393e-03,  -1.47841424e-02,  -6.65727956e-03,
         1.87597498e-02,   2.86649447e-02,  -2.10007075e-02,
         1.91084221e-02,   2.29063965e-02,  -8.31268355e-03,
        -1.05696451e-02,   2.13856634e-04,   2.78418362e-02,
        -5.16085653e-03,  -2.68914662e-02,   6.06097281e-02,
         2.81181969e-02,  -1.31330760e-02,   2.18424015e-02,
         1.78850722e-02,  -5.04343696e-02,  -7.78315868e-03,
         2.84297112e-02,   1.19949570e-02,   8.95569660e-03,
        -2.67488360e-02,  -1.70973979e-03,   2.58434750e-02,
         1.17072044e-02,   3.95860150e-02,  -2.06501712e-03,
         1.10748690e-02,  -5.20406477e-03,   7.09436983e-02,
        -7.98506662e-03,  -1.11838719e-02,   3.27165686e-02,
         6.36243029e-03,  -5.16321696e-03,   4.32418510e-02,
        -2.49316841e-02,   2.70249862e-02,  -3.07538658e-02,
         2.26640031e-02,   2.73975134e-02,  -4.12394218e-02,
        -3.85291167e-02,  -4.14932147e-02,   4.14330652e-03,
         1.77087775e-03,

In [7]:
#This shows the vector for the first comment labeled as SENT_0
model.docvecs['SENT_0']

array([ 0.0159505 , -0.03893923,  0.02340249,  0.01307224,  0.03451093,
       -0.00656774, -0.02457786,  0.06272283, -0.0298181 , -0.08020693,
       -0.0162821 ,  0.07188087,  0.04145225, -0.15585591,  0.14139356,
        0.0873617 , -0.02294507,  0.0879023 , -0.03360162, -0.17169152,
       -0.05919168,  0.07944062,  0.05829292,  0.0142904 ,  0.06803995,
       -0.02852449,  0.14204425,  0.0545768 ,  0.05104154,  0.00896609,
       -0.04531134, -0.07706524,  0.09572662, -0.08720192, -0.01756951,
        0.11524361, -0.01322339, -0.0378295 ,  0.01805082,  0.08674865,
        0.08608117,  0.04490526,  0.03913374,  0.15469056, -0.05246277,
       -0.04330213, -0.09590889, -0.03762541,  0.0629676 ,  0.01090413,
       -0.06410495, -0.00426013,  0.0222648 ,  0.07095952, -0.02104527,
       -0.02962356,  0.12879348, -0.11376718, -0.12841707,  0.03012384,
       -0.06069612,  0.05788385, -0.12934239, -0.01060066, -0.04596388,
       -0.01550295, -0.14706556,  0.09882553, -0.02576829, -0.09

In [9]:
model.most_similar('good')

[('hat', 0.8923630118370056),
 ('teaching', 0.890877366065979),
 ('keeping', 0.8794277906417847),
 ('instructors', 0.8748044967651367),
 ('generally', 0.8627312183380127),
 ('overall', 0.856669008731842),
 ('job', 0.8497782945632935),
 ('jo', 0.8494387865066528),
 ('logic', 0.8493380546569824),
 ('reasons', 0.839232325553894)]

In [10]:
#The following method stores the vectorized comments in the array comments[]
#and stores its label in the array y. This is done so that we can do
#the classification using X and y values.
def vectorize_comments(df,d2v_model):
    y = []
    comments = []
    for i in range(0,df.shape[0]):
        label = 'SENT_%s' %i
        comments.append(d2v_model.docvecs[label])
    df['vectorized_comments'] = comments
    
    return df

df = vectorize_comments(df,model)
print (df.head(1))

   sentiment                                            comment  \
0          1  She present class materials with powerpoint wh...   

                                 vectorized_comments  
0  [0.0159505, -0.0389392, 0.0234025, 0.0130722, ...  


In [11]:
#Cross validation
from sklearn import cross_validation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold


import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pickle

#metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score

# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier




## Model with knn:
- doc2vec vectors
- k values from 1 to 10
- Cross validation fold=10

In [12]:
#This stores a list from 1 to 10, which will be used as k values from 1 to 10
k_range=list(range(1,10))
print(k_range)

[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [13]:
#Creates a dictionary, tuple of the key of nn and its values
param_grid = dict(n_neighbors=k_range)
print(param_grid)

{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9]}


In [14]:
#Create knn classifier
knn=KNeighborsClassifier()

In [15]:
#Use knn classifier, parameters from param_grid, k-fold = 10, and precision as the metrics score
grid=GridSearchCV(knn,param_grid,cv=10,scoring='precision')

In [16]:
#Convert data arrays to lists
X=df["vectorized_comments"].T.tolist()
y=df["sentiment"].T.tolist()

In [17]:
#use the classifier 'grid' created in line 15 to fit our data
#X - represents the comments
#y - represents the labels (1 or 0)
grid.fit(X, y)

GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='precision', verbose=0)

In [18]:
# view the complete results (list of named tuples)
#Shows the mean value for each k value, as well as the standard deviation, and its parameters
grid.grid_scores_

[mean: 0.87862, std: 0.02478, params: {'n_neighbors': 1},
 mean: 0.89861, std: 0.01473, params: {'n_neighbors': 2},
 mean: 0.86594, std: 0.02212, params: {'n_neighbors': 3},
 mean: 0.87992, std: 0.02213, params: {'n_neighbors': 4},
 mean: 0.85583, std: 0.01840, params: {'n_neighbors': 5},
 mean: 0.86595, std: 0.02338, params: {'n_neighbors': 6},
 mean: 0.85371, std: 0.01354, params: {'n_neighbors': 7},
 mean: 0.86547, std: 0.01970, params: {'n_neighbors': 8},
 mean: 0.84930, std: 0.01689, params: {'n_neighbors': 9}]

In [19]:
# examine the first tuple, when k=1
print(grid.grid_scores_[0].parameters)
print(grid.grid_scores_[0].cv_validation_scores)
print(grid.grid_scores_[0].mean_validation_score)

{'n_neighbors': 1}
[ 0.86956522  0.875       0.85714286  0.89795918  0.89795918  0.86666667
  0.83333333  0.87234043  0.88888889  0.92857143]
0.878622769453


In [20]:
# create a list of the mean scores only
grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
print(grid_mean_scores)

[0.87862276945266327, 0.8986053253891505, 0.86593861770171099, 0.87992042636769996, 0.85582703007190575, 0.86594582319351709, 0.85371127828722515, 0.86546869236432356, 0.84930065113152042]


In [21]:
# examine the best model
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

0.898605325389
{'n_neighbors': 2}
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=2, p=2,
           weights='uniform')


## Model with Random Forest Classifier (RFC)
A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and use averaging to improve the predictive accuracy and control over-fitting.

bootstrap = [True] ---> samples are drawn with replacement if bootstrap=True (default).

In [22]:
parameters = {
    'n_estimators': [200,400],
    'min_samples_leaf': [1],
    'min_samples_split': [2]
}

clf = GridSearchCV(RFC(verbose=1,n_jobs=1), param_grid=parameters, cv=10, scoring='precision')
clf.fit(X, y)

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=1, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_samples_leaf': [1], 'n_estimators': [200, 400], 'min_samples_split': [2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='precision', verbose=0)

In [23]:
# create a list of the mean scores only
grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
print(grid_mean_scores,"Mean of all scores for each cv fold")

[0.87862276945266327, 0.8986053253891505, 0.86593861770171099, 0.87992042636769996, 0.85582703007190575, 0.86594582319351709, 0.85371127828722515, 0.86546869236432356, 0.84930065113152042] Mean of all scores for each cv fold


In [24]:
# examine the best model
print(clf.best_score_)
print(clf.best_params_)
print(clf.best_estimator_)

0.870728383742
{'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
            verbose=1, warm_start=False)


## Stochastic Gradient Descent Classifier (SGDC)


In [25]:
# Setup 5-fold stratified cross validation
cross_v = StratifiedKFold(n_splits=10) #Takes group information into account to avoid building folds with imbalanced class distributions (for binary or multiclass classification tasks).

In [26]:
# Define a parameter grid to search over
param_grid = {
    'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1],
    'loss': ('log', 'hinge'), #function that gives the amount of error rate
    'penalty': ['l1', 'l2', 'elasticnet']
    }

In [27]:
#Create SGDC classifier using logistic regression and hinge
grid_search = GridSearchCV(SGDClassifier(), param_grid=param_grid, cv=cross_v, scoring='precision')

In [28]:
#Fit model to the data
grid_search.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
       error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'loss': ('log', 'hinge'), 'penalty': ['l1', 'l2', 'elasticnet'], 'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='precision', verbose=0)

In [29]:
# create a list of the mean scores only
grid_search.grid_scores_

[mean: 0.87760, std: 0.02873, params: {'loss': 'log', 'penalty': 'l1', 'alpha': 0.0001},
 mean: 0.85281, std: 0.02104, params: {'loss': 'log', 'penalty': 'l2', 'alpha': 0.0001},
 mean: 0.85953, std: 0.03016, params: {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.0001},
 mean: 0.88579, std: 0.04406, params: {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.0001},
 mean: 0.86118, std: 0.03669, params: {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.0001},
 mean: 0.88110, std: 0.03847, params: {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.0001},
 mean: 0.85875, std: 0.03645, params: {'loss': 'log', 'penalty': 'l1', 'alpha': 0.0005},
 mean: 0.85400, std: 0.02759, params: {'loss': 'log', 'penalty': 'l2', 'alpha': 0.0005},
 mean: 0.85268, std: 0.01985, params: {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.0005},
 mean: 0.86154, std: 0.03340, params: {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.0005},
 mean: 0.86546, std: 0.02766, params: {'loss': 'hinge', 'penalty': 'l2', 'alph

In [30]:
# examine the best model
print(grid_search.best_score_)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

0.885793439345
{'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.0001}
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l1', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)


## Neural Network Classifier using keras

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=200))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

X_train, X_test, y_train, y_test = cross_validation.train_test_split(np.concatenate(X), np.concatenate(y), test_size=0.02, random_state=17)

model.fit(X_train, y_train, epochs=9, batch_size=32, verbose=2)

In [None]:
#df["vectorized_comments"]
df["vectorized_comments"].T.tolist()[0]
#y=df["sentiment"]

In [None]:
#y_pred = classifier.predict(X_test)
#from sklearn.metrics import confusion_matrix
#print(y_test)
#print(y_pred)
#print(confusion_matrix(y_test, y_pred))