### Import Libraries

In [6]:
import numpy as np
import pandas as pd
import gensim
import string

# Preprocessing

In [32]:
'''
Getting all the stopwords
'''
with open("stopwords.txt") as f: stopwords = f.read().split("\n")

'''
preprocess function takes a string
and returns a preprocessed string
- turn to lower
- removes punctuations
- removes stopwords
'''
def preprocess(x):
  x = x.translate(str.maketrans('', '', string.punctuation))
  x = x.lower().split()
  x = [ele for ele in x if ele not in stopwords]
  return x

'''
get_data takes the filename of the dataset
and returns standard X and y vectors. X is 
also preprocessed here
'''
def get_data(filename):
  with open(filename, encoding="utf8") as f: data = f.read().split("\n")
  data = [ele for ele in data if len(ele)>0]
  x, y = [], []
  for line in data:
    y += [line[0]]
    x += [preprocess(line[2:])]
  return x, y

In [33]:
print('The Total Number Of Stopwords are: ', len(stopwords))

The Total Number Of Stopwords are:  153


In [34]:
raw_X, y = get_data("train.txt")
test_X, test_y = get_data("test.txt")

# Word2Vec Embeddings

In [9]:
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [10]:
"UNK" in model

True

In [35]:
'''
get_embeddings
- vector of words as input
- gets embeddings for each word in the input
- averages out all the embeddings and returns a vector
'''
def get_embeddings(vec):
  embs = []
  for w in vec:
    if w in model: embs += [model[w]]
    else: embs += [model["UNK"]]
  return np.mean(embs, axis=0)

'''
transform gets the input data x and 
returns the x where each vector is now
transformed to word embeddings
'''
def transform(data): return [get_embeddings(ele) for ele in data]

X = transform(raw_X)

In [36]:
X

[array([ 0.03700765,  0.0983785 , -0.07155355,  0.0347697 , -0.00427246,
         0.0641276 ,  0.09266663, -0.08587646,  0.11321449,  0.01627096,
        -0.01796722, -0.09275564, -0.01201375,  0.00597127, -0.19329198,
         0.06387329,  0.00901286,  0.06037903, -0.0022494 , -0.11277262,
        -0.04414876,  0.12578583, -0.0625    ,  0.02821096,  0.0467631 ,
        -0.08151499, -0.10483805,  0.04084269,  0.02464358, -0.05314128,
        -0.04038493,  0.04361979, -0.0688502 ,  0.02342733,  0.04553223,
        -0.02492269,  0.0695165 ,  0.09542402,  0.09810384,  0.03055827,
         0.06807455, -0.10888672,  0.12571208,  0.00939941,  0.00681559,
        -0.11681112, -0.05707423,  0.05908203,  0.11873118,  0.08963648,
        -0.03512828, -0.01119995, -0.03957113, -0.0254008 ,  0.05132167,
        -0.04231771,  0.03515625, -0.05184047,  0.02309259, -0.12346903,
        -0.00538127,  0.1021773 , -0.04599508, -0.03042221,  0.02032725,
         0.02946472, -0.0301005 ,  0.03404744,  0.0

# Reporting setup

In [0]:

def get_true_positives(golds, coals):
  return sum([1 for x in zip(golds, coals) if (x[0] == 1 == x[1])])

def get_true_negatives(golds, coals):
  return sum([1 for x in zip(golds, coals) if (x[0] == 0 == x[1])])

def get_false_positives(golds, coals):
  return sum([1 for x in zip(golds, coals) if (x[0] == 0 and x[1] == 1)])

def get_false_negatives(golds, coals):
  return sum([1 for x in zip(golds, coals) if (x[0] == 1 and x[1] == 0)])
def make_report(golds, coals):
  tp = get_true_positives(golds, coals)
  fp = get_false_positives(golds, coals)
  tn = get_true_negatives(golds, coals)
  fn = get_false_negatives(golds, coals)
  print("True Positives: ", tp)
  print("False Positives: ", fp)
  print("True Negatives: ", tn)
  print("False Negatives: ", fn)

  precision = tp / (tp + fp)
  recall = tp / (tp + fn)
  accuracy = (tp + tn) / (tp + tn + fp + fn)
  f1 = (2 * precision * recall) / (precision + recall)

  print("Precision: ", precision)
  print("Recall: ", recall)
  print("Accuracy: ", accuracy)
  print("F1 Score: ", f1)
  return tp, fp, tn, fn

# Logistic Regression

In [41]:
g = [[3,4,5,6,7],[4,5,6,7,8]]
g = np.asarray(g)
np.insert(g,0, 1, axis=1)

array([[1, 3, 4, 5, 6, 7],
       [1, 4, 5, 6, 7, 8]])

In [0]:
def prepare_lr_data(X, y):
  X_lr = np.matrix(X, dtype='float64')
  Y_lr = np.matrix(y, dtype='float64').T
  X_lr = np.insert(X_lr, 0, 1, axis=1)
  print(X_lr.shape, Y_lr.shape)
  W = np.matrix(np.zeros(X_lr.shape[1]), dtype='float64').transpose()
  print(W.shape)
  return X_lr, Y_lr, W
X_lr, Y_lr, W = prepare_lr_data(X, y)

(22892, 301) (22892, 1)
(301, 1)


In [0]:

'''
Sigmoid function
is actually prediction function
'''
def sigmoid(w, x):
  z = np.dot(x, w)
  res = 1/(1+np.exp(-1 * z))
  return res
y_hat = sigmoid(W, X_lr)
print(y_hat.shape, X_lr.shape)

(22892, 1) (22892, 301)


In [0]:
'''
Cross Entropy Loss function
'''
def cross_entropy(y, w, x):
  pred = sigmoid(w, x)
  g = np.squeeze(np.asarray(y))
  p = np.squeeze(np.asarray(pred))
  a = (g*np.log(p)) + ((1-g)*np.log(1-p))
  entropy = -1 * np.sum(a)/x.shape[0]
  return entropy
error = cross_entropy(Y_lr, W, X_lr)

In [0]:
def mini_batch_gradient_descent(X, w, Y, n_epoch, alpha, batch):
  J = list()
  epoch = n_epoch
  # for epoch in range(n_epoch):
  while epoch:
    epoch -= 1
    ind = 0
    while 1:
      x = X[ind:min(ind+batch, X.shape[0]), :]
      y = Y[ind:min(ind+batch, Y.shape[0]), :]

      y_hat = sigmoid(w, x)
      grad = np.dot(np.transpose(x), y_hat-y)
      w = w - grad*(alpha/x.shape[0])

      ind += batch
      if ind >= X.shape[0]:
        break
    
    J += [cross_entropy(y, w, x)]
    print(J[-1])
  return w, J

bgd_weights, J = mini_batch_gradient_descent(X_lr, W, Y_lr, 1000, 0.01, 32)
# print(bgd_weights)

0.6948900522175676
0.6852760698295471
0.6775518820049551
0.671814178570953
0.6675912203688155
0.6645069065264729
0.6622797979803147
0.6606983006091279
0.6596020920880451
0.6588689163350742
0.6584050885993081
0.6581385674635923
0.658013832039301
0.6579880504128987
0.6580281861835537
0.6581087958625247
0.6582103410090551
0.6583178876405321
0.6584200993966393
0.6585084550328385
0.6585766381969776
0.6586200601506341
0.6586354854991461
0.6586207380198544
0.6585744689709511
0.658495974278052
0.6583850500581204
0.6582418782876153
0.6580669362280572
0.6578609246171064
0.6576247107137585
0.6573592831255605
0.6570657159993778
0.6567451406674462
0.6563987232396731
0.6560276469461942
0.6556330982802163
0.6552162561849465
0.6547782836813621
0.6543203214545549
0.6538434830123135
0.6533488511058165
0.6528374751629963
0.6523103695335852
0.6517685123835936
0.6512128451080372
0.6506442721557018
0.6500636611798299
0.6494718434448504
0.6488696144323897
0.6482577346004543
0.6476369302583121
0.6470078945266

In [0]:
'''
The predict function,
which thresholds on
output of sigmoid
and returns 0 or 1
'''
def predict(w, X_test):
  res = sigmoid(w, X_test).A1
  res = [int(x + 0.5) for x in res]
  return res

In [0]:
X_test_lr = transform(test_X)
x_test, y_test, discard = prepare_lr_data(X_test_lr, test_y)
y_hat = predict(bgd_weights, x_test)

(5724, 301) (5724, 1)
(301, 1)


In [0]:
linear_regression_report = make_report(y_test, y_hat)

True Positives:  1915
False Positives:  685
True Negatives:  2312
False Negatives:  812
Precision:  0.7365384615384616
Recall:  0.7022368903557022
Accuracy:  0.7384696016771488
F1 Score:  0.7189787873099306


# KNN

In [0]:
def prepare_knn_data():
  raw_X, y = get_data("train.txt")
  test_X, test_y = get_data("test.txt")
  train_X = transform(raw_X)
  test_X = transform(test_X)
  return train_X, y, test_X, test_y


In [0]:
train_x, train_y, test_x, test_y = prepare_knn_data()

In [0]:
import operator
'''
compute_cosine_sim computes
cosine similarity given
two points a and b
'''
def compute_cosine_sim(a, b):
  cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
  return cos_sim

def get_dist(distances, k):
  candidates = distances[:k]
  freq = {"0":0, "1":0}
  for cand in candidates:
    freq[str(cand[1])] += 1
  return freq
'''
predict_batch givees predictions for test data
given train data, test data and the parameter k
for KNN
'''
def predict_batch(train, test, train_labels, test_labels, k):
  predictions = []
  golds = []
  count0 = 500
  count1 = 0

  for test_ind, test_point in enumerate(test):
    distances = []
    for train_ind, train_point in enumerate(train):
      distances += [(1-compute_cosine_sim(test_point, train_point), train_labels[train_ind])]
    distances = sorted(distances, key=lambda x: x[0])
    freq = get_dist(distances, k)
    while k > 1:
      if freq["0"] == freq["1"]:
        freq = get_dist(distances, k-1)
      else:
        break
    res = max(freq.items(), key=operator.itemgetter(1))[0]
    predictions += [int(res)]
    golds += [int(test_labels[test_ind])]
    if count0 <= 0:
      count1 += 1
      print(count1)
      count0 = 500
    else: count0 -= 1
  return predictions, golds

In [0]:
coals, golds = predict_batch(train_x, test_x, train_y, test_y, 1)
make_report(golds, coals)

1
2
3
4
5
6
7
8
9
10
11
True Positives:  1852
False Positives:  775
True Negatives:  2222
False Negatives:  875
Precision:  0.7049866768176627
Recall:  0.6791345801246791
Accuracy:  0.7117400419287212
F1 Score:  0.691819200597684


(1852, 775, 2222, 875)

In [0]:
coals, golds = predict_batch(train_x, test_x, train_y, test_y, 3)
make_report(golds, coals)

1
2
3
4
5
6
7
8
9
10
11
True Positives:  1891
False Positives:  688
True Negatives:  2309
False Negatives:  836
Precision:  0.7332299340829779
Recall:  0.6934360102676934
Accuracy:  0.7337526205450734
F1 Score:  0.7127779871843195


(1891, 688, 2309, 836)

In [0]:
coals, golds = predict_batch(train_x, test_x, train_y, test_y, 5)
make_report(golds, coals)

1
2
3
4
5
6
7
8
9
10
11
True Positives:  1947
False Positives:  656
True Negatives:  2341
False Negatives:  780
Precision:  0.7479830964271994
Recall:  0.713971397139714
Accuracy:  0.7491264849755416
F1 Score:  0.7305816135084428


(1947, 656, 2341, 780)

In [0]:
coals, golds = predict_batch(train_x, test_x, train_y, test_y, 7)
make_report(golds, coals)

1
2
3
4
5
6
7
8
9
10
11
True Positives:  1915
False Positives:  626
True Negatives:  2371
False Negatives:  812
Precision:  0.7536402990948445
Recall:  0.7022368903557022
Accuracy:  0.7487770789657582
F1 Score:  0.7270311313591494


(1915, 626, 2371, 812)

In [0]:
coals, golds = predict_batch(train_x, test_x, train_y, test_y, 10)
make_report(golds, coals)

1
2
3
4
5
6
7
8
9
10
11
True Positives:  1932
False Positives:  601
True Negatives:  2396
False Negatives:  795
Precision:  0.7627319384129491
Recall:  0.7084708470847084
Accuracy:  0.756114605171209
F1 Score:  0.7346007604562738


(1932, 601, 2396, 795)

# Perceptron

In [0]:
def prepare_perceptron_data():
  raw_X, y = get_data("train.txt")
  test_X, test_y = get_data("test.txt")
  train_X = transform(raw_X)
  test_X = transform(test_X)
  return train_X, y, test_X, test_y

In [0]:
train_x, train_y, test_x, test_y = prepare_perceptron_data() #please note that tran_y or test_y are strings "0" and "1"