# Word2Vec 

In [1]:
# Read the two word2vec feature tables analysed in the sections below
# (the "addition" table first, then the "average" table).
import pandas as pd

v2w_add, v2w_avg = (
    pd.read_csv(csv_path)
    for csv_path in ("data/word2vec-data.csv", "data/word2vec-avg-data.csv")
)

### word2vec addition analysis 

In [4]:
# Use the word2vec "addition" table as the working frame for the cells below.
df = v2w_add

### Pre-processing before the analysis

In [5]:
# Remove the 'question' and 'FaceID' columns, which are not fed to the models.
df = df.drop(columns=['question', 'FaceID'])

# Integer-encode only the six label columns. Casting the *whole* frame to
# 'category' (as was done before) also converts every embedding column to a
# categorical and back again, which is slow and changes nothing in the result.
# Missing values, if any, are encoded as -1 by `.cat.codes`.
df = df.assign(**{
    name: df[name].astype('category').cat.codes
    for name in ['true_answer', 'gender', 'hair', 'eyes', 'ethnicity', 'age']
})

# Give the estimators a uniformly float-typed frame.
# Assumes the remaining (wv*) columns are already numeric in the CSV.
df = df.astype('float')

In [6]:
# Feature matrix: every column except the encoded answer.
X = df.drop(columns="true_answer")
# Target vector: the encoded answer.
y = df["true_answer"]

In [7]:
# PCA with a fractional n_components: keep as many components as are needed
# to reach 95% of the explained variance.
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(principalComponents)
# NOTE(review): the cells visible below split and train on X, not on
# principalDf, so this projection does not appear to feed any model —
# confirm whether that is intended.

In [8]:
# 80:20 train/test split of the raw feature matrix X, seeded for repeatability.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2345,
)

### Logistic Regression 

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Logistic regression with library defaults, trained on the training split.
logreg = LogisticRegression()
# Kept as the cell's final expression so the fitted estimator's repr is displayed.
logreg.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
# Held-out predictions; reused by the confusion-matrix cell that follows.
y_pred = logreg.predict(X_test)
# Print the estimator's score on the test split.
print(logreg.score(X=X_test, y=y_test))

0.5729440357330531


In [12]:
from sklearn.metrics import confusion_matrix

# Store the matrix under its own name. Rebinding `confusion_matrix` to the
# resulting array shadows the imported function, so any later call that is
# not preceded by a fresh import would fail with "ndarray is not callable".
# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[28032  5855]
 [20151  6858]]


In [13]:
import statsmodels.api as sm

# Fit a statsmodels logit of the target on all features, using the full
# (unsplit) X and y, and print its summary table.
logit_model = sm.Logit(endog=y, exog=X)
result = logit_model.fit()
print(result.summary2())
# NOTE(review): X is passed as-is and no constant column is added in this
# notebook, so the model presumably has no intercept — confirm this is intended.

Optimization terminated successfully.
         Current function value: 0.674025
         Iterations 5
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.018      
Dependent Variable: true_answer      AIC:              411064.0188
Date:               2018-12-27 16:13 BIC:              414305.0588
No. Observations:   304480           Log-Likelihood:   -2.0523e+05
Df Model:           304              LL-Null:          -2.0907e+05
Df Residuals:       304175           LLR p-value:      0.0000     
Converged:          1.0000           Scale:            1.0000     
No. Iterations:     5.0000                                        
-------------------------------------------------------------------
               Coef.   Std.Err.     z      P>|z|    [0.025   0.975]
-------------------------------------------------------------------
hair          -0.0296    0.0031   -9.4761  0.0000  -0.0357  -0.0235
eyes          -0.0406    0.0046   -8.8908  0.0000

### Naive Bayes 

In [9]:
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Bernoulli naive Bayes trained on the training split.
# NOTE(review): `binarize` is documented as a numeric threshold; `True` is
# presumably treated as 1.0 — confirm that this cut-off is the intended one.
BernNB = BernoulliNB(binarize=True).fit(X_train, y_train)

# Compare held-out predictions with the true test labels.
y_pred = BernNB.predict(X_test)
y_expect = y_test

print(accuracy_score(y_true=y_expect, y_pred=y_pred))

0.5555208880714662


### K-Nearest Neighbours

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# k-nearest-neighbours classifier with k = 5, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out rows and report the share classified correctly.
y_pred = knn.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))


Accuracy: 0.6462821860220704


In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# k-nearest-neighbours classifier with k = 7, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)

# Predict the held-out rows and report the share classified correctly.
y_pred = knn.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))


Accuracy: 0.6534747766684182


### Random Forest 

In [16]:
# Square root of 307; the random-forest cell below uses max_features = 17,
# i.e. this value rounded down.
# NOTE(review): 307 is presumably a column count — confirm against the data.
from math import sqrt
sqrt(307)

17.52141546793523

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Random forest that considers 17 candidate features at each split.
# random_state is pinned (same seed as the train/test split) so that the
# reported accuracy and feature importances are reproducible on re-run;
# without it every execution grows a different forest.
clf = RandomForestClassifier(max_features=17, random_state=2345)

# Train on the training split, then predict the held-out rows.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Share of held-out rows classified correctly.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6695185233841303


In [18]:
import pandas as pd

# Label the fitted forest's importances with the feature names and list
# them from most to least important.
feature_imp = (
    pd.Series(data=clf.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
feature_imp

eyes         0.125212
gender       0.123757
age          0.123254
hair         0.109224
ethnicity    0.104678
wv168        0.007017
wv243        0.004615
wv257        0.004117
wv142        0.003923
wv59         0.003722
wv75         0.003643
wv10         0.003471
wv144        0.003359
wv267        0.003304
wv55         0.003188
wv84         0.003010
wv77         0.002963
wv6          0.002932
wv162        0.002923
wv44         0.002813
wv250        0.002766
wv241        0.002738
wv181        0.002693
wv111        0.002641
wv47         0.002625
wv68         0.002596
wv19         0.002488
wv246        0.002487
wv273        0.002472
wv51         0.002449
               ...   
wv53         0.000865
wv120        0.000857
wv152        0.000851
wv116        0.000847
wv70         0.000846
wv217        0.000846
wv277        0.000832
wv80         0.000826
wv97         0.000824
wv81         0.000819
wv34         0.000813
wv176        0.000811
wv48         0.000803
wv107        0.000798
wv121     

### word2vec Average analysis 

In [19]:
# Switch the working frame to the word2vec "average" table; the cells below
# repeat the same pipeline on it.
df = v2w_avg 

In [20]:
# Prepare the dataset for the classifiers.

# Remove the 'question' and 'FaceID' columns, which are not fed to the models.
df = df.drop(columns=['question', 'FaceID'])

# Integer-encode only the six label columns. Casting the *whole* frame to
# 'category' (as was done before) also converts every embedding column to a
# categorical and back again, which is slow and changes nothing in the result.
# Missing values, if any, are encoded as -1 by `.cat.codes`.
df = df.assign(**{
    name: df[name].astype('category').cat.codes
    for name in ['true_answer', 'gender', 'hair', 'eyes', 'ethnicity', 'age']
})

# Give the estimators a uniformly float-typed frame.
# Assumes the remaining columns are already numeric in the CSV.
df = df.astype('float')

In [21]:
# Feature matrix: every column except the encoded answer.
X = df.drop(columns="true_answer")
# Target vector: the encoded answer.
y = df["true_answer"]

In [22]:
# PCA with a fractional n_components: keep as many components as are needed
# to reach 95% of the explained variance.
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(principalComponents)
# NOTE(review): the cells visible below split and train on X, not on
# principalDf, so this projection does not appear to feed any model —
# confirm whether that is intended.

In [23]:
# 80:20 train/test split of the raw feature matrix X, seeded for repeatability.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2345,
)

### Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Logistic regression with library defaults, trained on the training split.
logreg = LogisticRegression().fit(X_train, y_train)

# Held-out predictions (reused by the confusion-matrix cell) and test score.
y_pred = logreg.predict(X_test)
print(
    "Logistic regression accuracy score - word2vec average:",
    logreg.score(X=X_test, y=y_test),
)

Logistic regression accuracy score - word2vec average: 0.5671801103520757


In [25]:
from sklearn.metrics import confusion_matrix

# Store the matrix under its own name. Rebinding `confusion_matrix` to the
# resulting array shadows the imported function, so any later call that is
# not preceded by a fresh import would fail with "ndarray is not callable".
# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[27773  6114]
 [20243  6766]]


### Naive Bayes 

In [26]:
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Bernoulli naive Bayes trained on the training split.
# NOTE(review): `binarize` is documented as a numeric threshold; `True` is
# presumably treated as 1.0 — confirm that this cut-off is the intended one.
BernNB = BernoulliNB(binarize=True).fit(X_train, y_train)

# Compare held-out predictions with the true test labels.
y_pred = BernNB.predict(X_test)
y_expect = y_test

print(
    "Naive Bayes accuracy score - word2vec average",
    accuracy_score(y_true=y_expect, y_pred=y_pred),
)

Naive Bayes accuracy score - word2vec average 0.5564733315817131


### K-Nearest Neighbours 

In [27]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# k-nearest-neighbours classifier with k = 5, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out rows and report the share classified correctly.
y_pred = knn.predict(X_test)
print(
    "Accuracy KNN - word2vec average, k = 5:",
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
)


Accuracy KNN - word2vec average, k = 5: 0.6241789280084078


In [28]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# k-nearest-neighbours classifier with k = 7, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)

# Predict the held-out rows and report the share classified correctly.
y_pred = knn.predict(X_test)
print(
    "Accuracy KNN - word2vec average, k = 7:",
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
)


Accuracy KNN - word2vec average, k = 7: 0.6250164214398318


### Random Forest 

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Random forest that considers 17 candidate features at each split.
# random_state is pinned (same seed as the train/test split) so that the
# reported accuracy is reproducible on re-run; without it every execution
# grows a different forest.
clf = RandomForestClassifier(max_features=17, random_state=2345)

# Train on the training split, then predict the held-out rows.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Share of held-out rows classified correctly.
print("Accuracy Random Forest - word2vec average:", metrics.accuracy_score(y_test, y_pred))

Accuracy Random Forest - word2vec average: 0.6698797950604309


# Fast Text 

In [11]:
# Read the two fastText feature tables analysed in the sections below
# (the "addition" table first, then the "average" table).
import pandas as pd

ft_add, ft_avg = (
    pd.read_csv(csv_path)
    for csv_path in ("data/fasttext-data-add.csv", "data/fasttext-data-avg.csv")
)

### fast text addition analysis 

In [12]:
# Use the fastText "addition" table as the working frame for the cells below.
df = ft_add

In [13]:
# Prepare the dataset for the classifiers.

# Remove the 'question' and 'FaceID' columns, which are not fed to the models.
df = df.drop(columns=['question', 'FaceID'])

# Integer-encode only the six label columns. Casting the *whole* frame to
# 'category' (as was done before) also converts every embedding column to a
# categorical and back again, which is slow and changes nothing in the result.
# Missing values, if any, are encoded as -1 by `.cat.codes`.
df = df.assign(**{
    name: df[name].astype('category').cat.codes
    for name in ['true_answer', 'gender', 'hair', 'eyes', 'ethnicity', 'age']
})

# Give the estimators a uniformly float-typed frame.
# Assumes the remaining columns are already numeric in the CSV.
df = df.astype('float')

In [14]:
# Feature matrix: every column except the encoded answer.
X = df.drop(columns="true_answer")
# Target vector: the encoded answer.
y = df["true_answer"]

In [15]:
# PCA with a fractional n_components: keep as many components as are needed
# to reach 95% of the explained variance.
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(principalComponents)
# NOTE(review): the cells visible below split and train on X, not on
# principalDf, so this projection does not appear to feed any model —
# confirm whether that is intended.

In [16]:
# 80:20 train/test split of the raw feature matrix X, seeded for repeatability.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2345,
)

### Logistic Regression

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Logistic regression with library defaults, trained on the training split.
logreg = LogisticRegression().fit(X_train, y_train)

# Held-out predictions (reused by the confusion-matrix cell) and test score.
y_pred = logreg.predict(X_test)
print(
    "Logistic regression accuracy score - fast text addition:",
    logreg.score(X=X_test, y=y_test),
)

Logistic regression accuracy score - fast text addition: 0.5789827948515892


In [None]:
from sklearn.metrics import confusion_matrix

# Compute the matrix straight into `cm`, the name the plotting cell below
# reads. Previously the array was first bound to `confusion_matrix`, which
# shadowed the imported function, and `cm` was only an alias of that name.
# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# `plt` and `np` were used here without being imported anywhere above this
# cell, so a fresh-kernel run failed with NameError; import them locally.
import numpy as np
from matplotlib import pyplot as plt

# Draw the 2x2 confusion matrix `cm` (from the previous cell) as a heat map.
plt.clf()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['Negative', 'Positive']
plt.title('Predicted versus True Label')
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)

# Captions follow the matrix layout: row = true class, column = predicted class.
s = [['TN', 'FP'], ['FN', 'TP']]
for i in range(2):
    for j in range(2):
        # The x coordinate is the column index and y the row index, hence (j, i).
        plt.text(j, i, str(s[i][j]) + " = " + str(cm[i][j]))
plt.show()

### Naive Bayes

In [38]:
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Bernoulli naive Bayes trained on the training split.
# NOTE(review): `binarize` is documented as a numeric threshold; `True` is
# presumably treated as 1.0 — confirm that this cut-off is the intended one.
BernNB = BernoulliNB(binarize=True).fit(X_train, y_train)

# Compare held-out predictions with the true test labels.
y_pred = BernNB.predict(X_test)
y_expect = y_test

print(
    "Naive Bayes accuracy score - fast text addition",
    accuracy_score(y_true=y_expect, y_pred=y_pred),
)

Naive Bayes accuracy score - fast text addition 0.5526497241922774


### K-Nearest Neighbours 

In [39]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# k-nearest-neighbours classifier with k = 5, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out rows and report the share classified correctly.
y_pred = knn.predict(X_test)
print(
    "Accuracy KNN - fast text addition, k = 5:",
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
)

Accuracy KNN - fast text addition, k = 5: 0.648460073548726


In [62]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# k-nearest-neighbours classifier with k = 7, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)

# Predict the held-out rows and report the share classified correctly.
y_pred = knn.predict(X_test)
print(
    "Accuracy KNN - fast text addition, k = 7:",
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
)


Accuracy KNN - fast text addition, k = 7: 0.6533359600735488


In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# k-nearest-neighbours classifier with k = 10, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = knn.predict(X_test)

# Share of held-out rows classified correctly. The label previously read
# "k = 7" (copied from the cell above) although this model uses k = 10.
print("Accuracy KNN - fast text addition, k = 10:", metrics.accuracy_score(y_test, y_pred))

Accuracy KNN - fast text addition, k = 7: 0.6599356448647229


In [None]:
# Imports first, so the cell does not rely on another cell having imported
# KNeighborsClassifier and `plt` is available before it is used.
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

# Test-split accuracy of KNN for every k from 3 to 19.
# NOTE(review): k is compared on the same test split that is used for the
# reported accuracies; a validation split or cross-validation would avoid
# tuning on the test data.
k_range = range(3, 20)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    scores.append(knn.score(X_test, y_test))

plt.figure()
plt.xlabel('k values')
plt.ylabel('Accuracy ratios')
plt.scatter(list(k_range), scores)
# Derive the ticks from k_range so they cannot drift from the k values
# that were actually evaluated.
plt.xticks(list(k_range))
# Render the figure instead of echoing the repr returned by plt.xticks.
plt.show()

### Random Forest 

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Random forest that considers 17 candidate features at each split.
# random_state is pinned (same seed as the train/test split) so that the
# reported accuracy is reproducible on re-run; without it every execution
# grows a different forest.
clf = RandomForestClassifier(max_features=17, random_state=2345)

# Train on the training split, then predict the held-out rows.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Share of held-out rows classified correctly.
print("Accuracy Random Forest - fast text addition:", metrics.accuracy_score(y_test, y_pred))

Accuracy Random Forest - fast text addition: 0.6719858156028369


### fast text average analysis 

In [42]:
# Switch the working frame to the fastText "average" table; the cells below
# repeat the same pipeline on it.
df = ft_avg

In [43]:
# Prepare the dataset for the classifiers.

# Remove the 'question' and 'FaceID' columns, which are not fed to the models.
df = df.drop(columns=['question', 'FaceID'])

# Integer-encode only the six label columns. Casting the *whole* frame to
# 'category' (as was done before) also converts every embedding column to a
# categorical and back again, which is slow and changes nothing in the result.
# Missing values, if any, are encoded as -1 by `.cat.codes`.
df = df.assign(**{
    name: df[name].astype('category').cat.codes
    for name in ['true_answer', 'gender', 'hair', 'eyes', 'ethnicity', 'age']
})

# Give the estimators a uniformly float-typed frame.
# Assumes the remaining columns are already numeric in the CSV.
df = df.astype('float')

In [44]:
# Feature matrix: every column except the encoded answer.
X = df.drop(columns="true_answer")
# Target vector: the encoded answer.
y = df["true_answer"]

In [45]:
# PCA with a fractional n_components: keep as many components as are needed
# to reach 95% of the explained variance.
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(principalComponents)
# NOTE(review): the cells visible below split and train on X, not on
# principalDf, so this projection does not appear to feed any model —
# confirm whether that is intended.

In [46]:
# 80:20 train/test split of the raw feature matrix X, seeded for repeatability.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2345,
)

### Logistic Regression

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Logistic regression with library defaults, trained on the training split.
logreg = LogisticRegression().fit(X_train, y_train)

# Held-out predictions (reused by the confusion-matrix cell) and test score.
y_pred = logreg.predict(X_test)
print(
    "Logistic regression accuracy score - fast text average:",
    logreg.score(X=X_test, y=y_test),
)

Logistic regression accuracy score - fast text average: 0.5732532177567639


In [48]:
from sklearn.metrics import confusion_matrix

# Store the matrix under its own name. Rebinding `confusion_matrix` to the
# resulting array shadows the imported function, so any later call that is
# not preceded by a fresh import would fail with "ndarray is not callable".
# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[27970  5986]
 [20008  6948]]


### Naive Bayes 

In [49]:
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Bernoulli naive Bayes trained on the training split.
# NOTE(review): `binarize` is documented as a numeric threshold; `True` is
# presumably treated as 1.0 — confirm that this cut-off is the intended one.
BernNB = BernoulliNB(binarize=True).fit(X_train, y_train)

# Compare held-out predictions with the true test labels.
y_pred = BernNB.predict(X_test)
y_expect = y_test

print(
    "Naive Bayes accuracy score - fast text average",
    accuracy_score(y_true=y_expect, y_pred=y_pred),
)

Naive Bayes accuracy score - fast text average 0.5574599422117152


### K-Nearest Neighbours 

In [50]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# k-nearest-neighbours classifier with k = 5, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out rows and report the share classified correctly.
y_pred = knn.predict(X_test)
print(
    "Accuracy KNN - fast text average, k = 5:",
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
)

Accuracy KNN - fast text average, k = 5: 0.6152318098240084


In [51]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# k-nearest-neighbours classifier with k = 7, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)

# Predict the held-out rows and report the share classified correctly.
y_pred = knn.predict(X_test)
print(
    "Accuracy KNN - fast text average, k = 7:",
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
)


Accuracy KNN - fast text average, k = 7: 0.6210270554242185


### Random Forest 

In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Random forest that considers 17 candidate features at each split.
# random_state is pinned (same seed as the train/test split) so that the
# reported accuracy is reproducible on re-run; without it every execution
# grows a different forest.
clf = RandomForestClassifier(max_features=17, random_state=2345)

# Train on the training split, then predict the held-out rows.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Share of held-out rows classified correctly.
print("Accuracy Random Forest - fast text average:", metrics.accuracy_score(y_test, y_pred))

Accuracy Random Forest - fast text average: 0.675055818229577


In [53]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Same random-forest experiment with 15 candidate features per split.
# random_state is pinned (same seed as the train/test split) so that the
# accuracy is reproducible and a difference from another max_features
# setting is not just run-to-run noise.
# Note: the printed label is identical to the one used for max_features = 17.
clf = RandomForestClassifier(max_features=15, random_state=2345)

# Train on the training split, then predict the held-out rows.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Share of held-out rows classified correctly.
print("Accuracy Random Forest - fast text average:", metrics.accuracy_score(y_test, y_pred))

Accuracy Random Forest - fast text average: 0.6737752823745732
