In [2]:

# Standard library
import ast
import multiprocessing
import re
import re  # For preprocessing
import sys

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import spacy  # For preprocessing
import xgboost as xgb
from gensim.models import Word2Vec
from keras import backend as K
from keras import optimizers
from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from xgboost import XGBClassifier
from xgboost import plot_importance




In [3]:
# Locations (relative to the notebook) of the three hygiene data files that the
# next cell reads and combines row by row.
hygiene_text_path = "../data/Hygiene/hygiene.dat"
hygiene_labels_path = "../data/Hygiene/hygiene.dat.labels"
hygiene_others_path = "../data/Hygiene/hygiene.dat.additional"

In [4]:
def _read_stripped_lines(path):
    """Return every line of the text file at `path` with trailing whitespace removed."""
    with open(path) as handle:
        return [line.rstrip() for line in handle]


arrText = _read_stripped_lines(hygiene_text_path)
arrLabels = _read_stripped_lines(hygiene_labels_path)

# One row per input line: the raw text next to its label string.
df = pd.DataFrame({'text': arrText, 'labels': arrLabels})

# The additional file is parsed as CSV with explicitly supplied column names,
# then attached to the text/label rows through an index join.
hygiene_others = pd.read_csv(
    hygiene_others_path,
    names=["cuisines", "zipcode", "reviews", "avg_ratings"],
)
df = df.join(hygiene_others)

In [5]:
# `cuisines` holds Python-literal strings; parse each one into the object it
# describes (presumably a list of cuisine names - confirm against the data).
df["cuisines"] = df["cuisines"].map(ast.literal_eval)

# Expand the parsed collections into one indicator column per distinct class.
mlb = MultiLabelBinarizer()
indicator_matrix = mlb.fit_transform(df["cuisines"])
res = pd.DataFrame(indicator_matrix, columns=mlb.classes_, index=df.index)

# Swap the original column for its indicator columns.
# NOTE: this rebinds `df`, so the cell cannot be re-run without reloading the data.
df = df.drop(columns="cuisines").join(res)

In [6]:
# Names of all columns that contain at least one missing value.
has_missing = df.isna().any()
df.columns[has_missing].tolist()

[]

### Baseline model without using NLP

In [198]:
# Rows whose label is the literal string "[None]" form the prediction (test)
# set; every other row is used for training.
is_unlabelled = df["labels"] == "[None]"
train_df = df[~is_unlabelled]
test_df = df[is_unlabelled]

# Columns that are not fed to the model as features.
non_feature_cols = ['text', 'labels', "zipcode"]
X_train, y_train = train_df.drop(columns=non_feature_cols), train_df["labels"]
X_test, y_test = test_df.drop(columns=non_feature_cols), test_df["labels"]

In [149]:
# import xgboost as xgb
# dtrain = xgb.DMatrix(np.array(X_train), label=np.array(y_train))
# dtest = xgb.DMatrix(np.array(X_test))
# param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
# param['nthread'] = 4
# param['eval_metric'] = 'auc'
# bst = xgb.train(param, dtrain, 10)
# y_pred = bst.predict(dtest)


In [199]:
# Fit an XGBClassifier with default settings on the training rows, then
# predict a label for every test row.
model = XGBClassifier()
model.fit(X_train.values, y_train.values)
y_pred = model.predict(X_test.values)

In [200]:
# Write the author line followed by one prediction per line.
# Passing the author line as `header` with `comments=''` makes np.savetxt emit
# it verbatim (no leading "# "), so the file is produced in a single write
# instead of being written, read back in full, and rewritten.
np.savetxt('./baseline_predictions.out', y_pred, fmt='%s',
           header="Viraj Bhalala(vbb2)", comments='')

- F1: 0.6659

In [14]:
# Load the spaCy English pipeline with both the 'ner' and 'parser' components
# switched off for speed.
disabled_components = ['ner', 'parser']
nlp = spacy.load('en', disable=disabled_components)

def cleaning(doc):
    """Lemmatize a document and drop its stop words.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose `lemma_` and `is_stop` (e.g. a spaCy Doc).

    Returns
    -------
    str or None
        The remaining lemmas joined by single spaces. None is returned when
        two or fewer lemmas remain, because such short texts give Word2Vec
        almost no context to learn from.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)
    
    
# Lazily normalise every text: each run of characters other than ASCII letters
# and apostrophes becomes a single space, then the result is lower-cased.
# NOTE: this is a one-shot generator - once a later cell has consumed it, this
# cell must be re-run before it can be iterated again.
non_letter_run = re.compile("[^A-Za-z']+")
brief_cleaning = (non_letter_run.sub(' ', str(row)).lower() for row in df['text'])


In [16]:
# Stream the normalised texts through the spaCy pipeline in batches and reduce
# each parsed document with `cleaning` (which yields None for very short ones).
parsed_docs = nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)
txt = [cleaning(doc) for doc in parsed_docs]


In [20]:
# DocPreprocess is a project-local helper; its internals are not visible here.
from UtilWordEmbedding import DocPreprocess
# Re-load the English pipeline with the 'ner' and 'parser' components disabled for speed.
nlp = spacy.load('en', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed
# spaCy's built-in English stop-word collection.
stop_words = spacy.lang.en.stop_words.STOP_WORDS
# Preprocess every text together with its label; later cells read the
# `tagdocs` and `doc_words` attributes of the resulting object.
all_docs = DocPreprocess(nlp, stop_words, df['text'], df['labels'])




In [23]:
import os
import pickle

dir_path = "./"
all_docs_pickle = os.path.join(dir_path, 'all_docs.pickle')

# Persist the preprocessed documents with the highest available pickle protocol.
with open(all_docs_pickle, 'wb') as sink:
    pickle.dump(all_docs, sink, pickle.HIGHEST_PROTOCOL)

# Read them straight back. Only unpickle files this notebook wrote itself:
# pickle.load can execute arbitrary code from an untrusted file.
with open(all_docs_pickle, 'rb') as source:
    all_docs = pickle.load(source)

In [27]:
# Sanity check: number of tagged documents next to the DataFrame's (rows, columns).
len(all_docs.tagdocs), df.shape


(13299, (13299, 104))

In [25]:
# Inspect one preprocessed document (index 2) to see what the preprocessing produced.
all_docs.tagdocs[2]


TaggedDocument(words=['worry', 'review', 'place', 'strongly', 'think', 'bad', 'night', 'place', 'lot', 'better', 'mexican', 'food', 'place', 'run', 'avocado', 'vegetarian', 'friend', 'order', 'meatless', 'dish', 'rely', 'heavily', 'avocado', 'minute', 'order', 'drool', 'expect', 'eat', 'waitress', 'approach', 'table', 'tell', 'bad', 'news', 'bad', 'dish', 'order', 'table', 'people', 'include', 'avocado', 'service', 'little', 'slow', 'waitress', 'wasn', 'friendly', 'helpful', 'food', 'arrive', 'people', 'wait', 'minute', 'plate', 'eat', 'get', 'bad', 'awkward', 'large', 'group', 'come', 'pay', 'bill', 'sit', 'group', 'decide', 'service', 'didn', 'tip', 'ask', 'split', 'tab', 'way', 'large', 'group', 'waitress', 'huff', 'puff', 'roller', 'eye', 'say', 'usually', 'not', 'calculate', 'tip', 'head', 'door', 'catch', 'guard', 'shout', 'direction', 'turn', 'waitress', 'approach', 'say', 'tip', 'wasn', 'large', 'insult', 'feel', 'money', 'grant', 'embarrassed', 'ask', 'size', 'group', 'gratuit

## Build word embeddings using Word2Vec

In [28]:
# Train a Word2Vec model on the preprocessed documents, one worker thread per CPU core.
workers = multiprocessing.cpu_count()
w2v_settings = dict(
    min_count=2,      # drop words that occur fewer than twice
    size=100,         # dimensionality of each word vector
    window=5,         # context window size
    workers=workers,
    iter=100,         # number of passes over the corpus
)
word_model = Word2Vec(all_docs.doc_words, **w2v_settings)

In [31]:
# (vocabulary size, embedding dimension) of the trained word-vector matrix.
# `wv.vectors` replaces the deprecated `wv.syn0` alias, which emits a
# DeprecationWarning and no longer exists in gensim 4.
word_model.wv.vectors.shape

  """Entry point for launching an IPython kernel.


(38977, 100)

In [34]:
# Look at a single learned word vector (row 1 of the embedding matrix).
# `wv.vectors` replaces the deprecated `wv.syn0` alias, which emits a
# DeprecationWarning and no longer exists in gensim 4.
word_model.wv.vectors[1]

  """Entry point for launching an IPython kernel.


array([ 1.7647111 , -0.73086184, -1.2468382 , -1.0027946 , -0.96313715,
        0.9111134 ,  0.65977895,  3.5456011 , -0.02235585,  1.6440451 ,
       -0.0661039 ,  0.41211852, -0.19577555,  0.1812265 ,  1.5138743 ,
        0.02916883,  0.4625483 , -1.3284774 , -0.45937747, -1.6949239 ,
        1.2153699 ,  4.196206  , -0.50019646,  0.48818356, -0.6409747 ,
        1.5496792 ,  1.1308266 , -2.791238  , -0.7878722 ,  1.9967105 ,
       -2.0945165 ,  2.8918045 , -2.4257357 , -0.78464067, -2.8459082 ,
        4.979463  , -2.870692  ,  1.8776709 , -0.87444013, -0.9911716 ,
       -4.8545923 , -0.29963732,  0.27686313, -2.1057916 ,  1.8179989 ,
        1.1317976 ,  1.8244607 , -2.3895843 ,  1.934337  , -0.9373677 ,
        2.4383726 ,  1.4679741 , -0.45419896, -0.39970812, -1.4040339 ,
        0.5939909 ,  0.5153689 ,  0.71926403,  1.5762645 , -0.29474178,
       -0.35648167,  0.00639748,  1.3986342 ,  0.8788819 , -0.4781381 ,
        3.0308967 ,  1.1946028 , -0.2747328 ,  2.299122  ,  0.26

## Averaging word embeddings within each review

In [32]:
# MeanEmbeddingVectorizer is a project-local helper; judging by its name and the
# section title it averages the word vectors of each document - confirm in
# UtilWordEmbedding.
from UtilWordEmbedding import MeanEmbeddingVectorizer

# Build the vectorizer from the trained Word2Vec model.
mean_vec_tr = MeanEmbeddingVectorizer(word_model)
# Turn every document's word list into one vector (the next cell shows the shape).
doc_vec = mean_vec_tr.transform(all_docs.doc_words)



In [42]:
# Shape of the document-vector matrix produced by the vectorizer.
doc_vec.shape

(13299, 100)

In [59]:
# Persist the document vectors as comma-separated text next to the notebook.
doc_vec_csv = os.path.join(dir_path, 'doc_vec.csv')
np.savetxt(doc_vec_csv, doc_vec, delimiter=',')

In [62]:
# Wrap the document vectors in a DataFrame (default integer column names and
# index) and attach them to the existing feature table through an index join.
doc_vec_df = pd.DataFrame(doc_vec)
mean_embedding_df = df.join(doc_vec_df)

XGBOOST

In [251]:
# Same split rule as the baseline, now on the table that includes the
# document-embedding columns: "[None]"-labelled rows are the prediction set.
is_unlabelled = mean_embedding_df["labels"] == "[None]"
train_df = mean_embedding_df[~is_unlabelled]
test_df = mean_embedding_df[is_unlabelled]

# Columns that are not fed to the model as features.
non_feature_cols = ['text', 'labels', 'zipcode']
X_train, y_train = train_df.drop(columns=non_feature_cols), train_df["labels"]
X_test, y_test = test_df.drop(columns=non_feature_cols), test_df["labels"]

In [153]:
# Inputs for the native xgboost API (`xgb.train`), which needs the
# `import xgboost as xgb` alias from the imports cell.
# The labels were read from a text file and are therefore strings; cast
# features and labels to float32 explicitly (as the Keras cell does) instead of
# relying on implicit conversion inside DMatrix.
dtrain = xgb.DMatrix(np.array(X_train, dtype=np.float32),
                     label=np.array(y_train, dtype=np.float32))
dtest = xgb.DMatrix(np.array(X_test, dtype=np.float32))

In [196]:
# XGBClassifier with 1000 boosting rounds, fitted on the current training
# features; predictions are made for every test row.
model = XGBClassifier(n_estimators=1000)
model.fit(X_train.values, y_train.values)
y_pred = model.predict(X_test.values)


In [190]:
# Booster settings for the native xgboost API.
param = {
    'max_depth': 6,
    'eta': 0.3,
    'objective': 'binary:logistic',
    'subsample': 0.8,
    'nthread': 4,
    'eval_metric': 'auc',
}

# `n_estimators` is a scikit-learn-wrapper option; inside `params` of xgb.train
# it has no effect and training silently uses the default of 10 rounds. The
# native API takes the round count through `num_boost_round`, so pass it there.
num_boost_round = 20000
bst = xgb.train(param, dtrain, num_boost_round=num_boost_round)

# The booster outputs probabilities for binary:logistic; keep only very
# confident positives. NOTE: this overwrites `y_pred` from the previous cell.
positive_threshold = 0.95
y_pred = bst.predict(dtest)
y_pred = np.where(y_pred > positive_threshold, 1, 0)

In [197]:
# Write the author line followed by one prediction per line.
# Passing the author line as `header` with `comments=''` makes np.savetxt emit
# it verbatim (no leading "# "), so the file is produced in a single write
# instead of being written, read back in full, and rewritten.
np.savetxt('./average_word2vec_predictions.out', y_pred, fmt='%s',
           header="Viraj Bhalala(vbb2)", comments='')

In [109]:
# (rows, feature columns) of the current training matrix.
X_train.shape

(546, 202)

- F1: 0.7027

Deep Learning

In [252]:
def recall_m(y_true, y_pred):
    """Recall computed with Keras backend ops.

    Both inputs are clipped to [0, 1] and rounded, so `y_pred` may hold
    probabilities. Returns hits / (actual positives + epsilon); the epsilon
    keeps the division defined when there are no positives.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed with Keras backend ops.

    Both inputs are clipped to [0, 1] and rounded, so `y_pred` may hold
    probabilities. Returns hits / (predicted positives + epsilon); the epsilon
    keeps the division defined when nothing is predicted positive.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged_positives + K.epsilon())
def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of `precision_m` and `recall_m`.

    An epsilon in the denominator keeps the result defined when both precision
    and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))

In [301]:
# Feed-forward binary classifier over the current feature matrix.
# The input width is taken from the data instead of being hard-coded, so the
# cell keeps working when the number of feature columns changes.
n_features = X_train.shape[1]

# NOTE(review): every hidden layer uses a linear activation, so apart from
# dropout the stack collapses to a single linear map before the sigmoid.
# Consider a non-linear activation such as 'relu'; left unchanged here so the
# model itself is not altered.
hidden_units = (150, 100, 50, 10)

model = Sequential()
for layer_idx, units in enumerate(hidden_units):
    # Only the first layer declares the input width.
    first_layer_kwargs = {'input_dim': n_features} if layer_idx == 0 else {}
    model.add(Dense(units, activation='linear', kernel_initializer="random_uniform",
                    **first_layer_kwargs))
    model.add(Dropout(0.4))

# Single sigmoid unit: probability of the positive class.
model.add(Dense(1, activation='sigmoid', kernel_initializer="random_uniform", bias_initializer='zeros'))

model.compile(optimizer=optimizers.Adam(lr=0.0001),
              loss='binary_crossentropy',
              metrics=["binary_accuracy"])

# The labels are strings read from file; cast features and labels to float32.
X_train_arr = np.array(X_train, dtype=np.float32)
y_train_arr = np.array(y_train, dtype=np.float32)
X_test_arr = np.array(X_test, dtype=np.float32)

model.fit(X_train_arr, y_train_arr, epochs=100, batch_size=64)
y_pred = model.predict(X_test_arr)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100


Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [302]:
# Turn the sigmoid outputs into hard 0/1 labels at a 0.5 cut-off.
y_pred = np.where(y_pred > 0.5, 1, 0)

# Write the author line followed by one prediction per line.
# Passing the author line as `header` with `comments=''` makes np.savetxt emit
# it verbatim (no leading "# "), so the file is produced in a single write
# instead of being written, read back in full, and rewritten.
np.savetxt('./average_word2vec_predictions_dl.out', y_pred, fmt='%s',
           header="Viraj Bhalala(vbb2)", comments='')