In [None]:

import ast
import multiprocessing
import re  # For preprocessing
import sys

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import spacy  # For preprocessing
import xgboost as xgb
from gensim.models import Word2Vec
from keras import backend as K
from keras import optimizers
from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from xgboost import XGBClassifier
from xgboost import plot_importance


In [None]:
# Input files of the hygiene dataset, relative to the notebook's working directory.
hygiene_text_path = "../data/Hygiene/hygiene.dat"                # read line by line as the `text` column
hygiene_labels_path = "../data/Hygiene/hygiene.dat.labels"       # read line by line as the `labels` column
hygiene_others_path = "../data/Hygiene/hygiene.dat.additional"   # read as CSV: cuisines, zipcode, reviews, avg_ratings

In [None]:
# Read the text file and the label file line by line, stripping trailing
# whitespace (including the newline) from every line.
with open(hygiene_text_path) as text_file:
    arrText = [line.rstrip() for line in text_file]
with open(hygiene_labels_path) as labels_file:
    arrLabels = [line.rstrip() for line in labels_file]

# Column names are supplied via `names=`, so the first CSV row is read as data.
hygiene_others = pd.read_csv(
    hygiene_others_path,
    names=["cuisines", "zipcode", "reviews", "avg_ratings"],
)

# Index-based join: row i of the text/label frame is paired with row i of the CSV.
df = pd.DataFrame({'text': arrText, 'labels': arrLabels}).join(hygiene_others)

In [None]:
# Each `cuisines` cell holds the source text of a Python literal; parse it
# into the actual Python object before binarizing.
df["cuisines"] = df["cuisines"].map(ast.literal_eval)

# One indicator column per distinct cuisine label found by the binarizer.
mlb = MultiLabelBinarizer()
cuisine_flags = mlb.fit_transform(df["cuisines"])
res = pd.DataFrame(cuisine_flags, columns=mlb.classes_, index=df.index)

# Swap the raw `cuisines` column for the indicator columns.
# NOTE: this rebinds `df` without `cuisines`, so the cell cannot be re-run
# unless `df` is reloaded first.
df = df.drop(columns="cuisines").join(res)

In [None]:
# List the names of the columns that contain at least one missing value.
has_missing = df.isna().any()
has_missing[has_missing].index.tolist()

### Baseline model without using NLP

In [None]:
# Rows whose label is the literal string "[None]" go to the test set;
# every other row is used for training.
has_label = df["labels"] != "[None]"
train_df = df[has_label]
test_df = df[~has_label]

# Features are every column except the raw text, the label and the zipcode.
non_feature_cols = ['text', 'labels', "zipcode"]
X_train = train_df.drop(columns=non_feature_cols)
y_train = train_df["labels"]
X_test = test_df.drop(columns=non_feature_cols)
y_test = test_df["labels"]

In [None]:
# import xgboost as xgb
# dtrain = xgb.DMatrix(np.array(X_train), label=np.array(y_train))
# dtest = xgb.DMatrix(np.array(X_test))
# param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
# param['nthread'] = 4
# param['eval_metric'] = 'auc'
# bst = xgb.train(param, dtrain, 10)
# y_pred = bst.predict(dtest)


In [None]:
# Baseline classifier: XGBClassifier with its default hyperparameters,
# trained on the non-text features only.
train_features = np.array(X_train)
train_targets = np.array(y_train)

model = XGBClassifier()
model.fit(train_features, train_targets)

# Predict a label for every test row.
y_pred = model.predict(np.array(X_test))

In [None]:
# Write the author line followed by one prediction per line.
# `header=` with `comments=''` lets np.savetxt emit the first line itself
# (without the default '# ' prefix), so the file is written once instead of
# being written, read back and rewritten.
np.savetxt(
    './baseline_predictions.out',
    y_pred,
    fmt='%s',
    header="Viraj Bhalala(vbb2)",
    comments='',
)

- F1 (baseline, no text features): 0.6659

In [None]:
# Load the spaCy 'en' pipeline with the NER and parser components disabled
# (the original note gives speed as the reason).
nlp = spacy.load(
    'en',
    disable=['ner', 'parser'],
)

def cleaning(doc):
    """Lemmatize `doc` and drop its stop words.

    `doc` is iterated token by token; every token must expose `lemma_` and
    `is_stop` (the original note says it needs to be a spaCy Doc object).

    Returns the remaining lemmas joined by single spaces, or None when two
    or fewer lemmas remain.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Word2Vec learns a word's vector from its context words, so a sentence
    # of only one or two words adds very little to training; skip those.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)
    
    
# Replace every run of characters other than letters and apostrophes with a
# single space, then lowercase.
# Built as a list rather than a generator: a generator is exhausted after one
# pass, so re-running the cell that feeds it to `nlp.pipe` would silently
# produce an empty result.
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['text']]


In [None]:
# Run the pre-cleaned strings through the spaCy pipeline in batches and
# lemmatize each parsed document; entries are None where `cleaning`
# rejected a document as too short.
txt = []
for parsed_doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1):
    txt.append(cleaning(parsed_doc))


In [None]:
from UtilWordEmbedding import DocPreprocess
# Reload the spaCy 'en' pipeline with the NER and parser components disabled.
disabled_components = ['ner', 'parser']
nlp = spacy.load('en', disable=disabled_components)

# spaCy's built-in English stop-word collection.
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Project helper; receives the pipeline, the stop words, the raw texts and
# the labels. NOTE(review): its exact preprocessing lives in
# UtilWordEmbedding and is not visible here.
all_docs = DocPreprocess(
    nlp,
    stop_words,
    df['text'],
    df['labels'],
)




In [None]:
import pickle
import os
dir_path = "./"
all_docs_pickle_path = os.path.join(dir_path, 'all_docs.pickle')

# Persist the preprocessed documents...
with open(all_docs_pickle_path, 'wb') as pickle_out:
    pickle.dump(all_docs, pickle_out, pickle.HIGHEST_PROTOCOL)

# ...and load them straight back from the file that was just written.
with open(all_docs_pickle_path, 'rb') as pickle_in:
    all_docs = pickle.load(pickle_in)

In [None]:
# Show the number of entries in `all_docs.tagdocs` next to the DataFrame shape.
# NOTE(review): presumably a check that preprocessing kept one entry per row
# of `df` — confirm; later cells join document vectors to `df` by position.
len(all_docs.tagdocs), df.shape


## Build word embeddings using Word2Vec

In [None]:
# Use one training worker per CPU reported by the OS.
workers = multiprocessing.cpu_count()

# NOTE(review): `size` and `iter` are the gensim 3.x keyword spellings —
# confirm against the pinned gensim version.
w2v_settings = dict(
    min_count=2,
    size=100,
    window=5,
    workers=workers,
    iter=100,
)
word_model = Word2Vec(all_docs.doc_words, **w2v_settings)

In [None]:
# Shape of the trained model's `wv.syn0` array.
# NOTE(review): presumably (vocabulary size, vector size) — confirm.
word_model.wv.syn0.shape

In [None]:
# Inspect the entry at index 1 of the trained model's `wv.syn0` array.
word_model.wv.syn0[1]

## Averaging the word embeddings in each review

In [None]:
from UtilWordEmbedding import MeanEmbeddingVectorizer

# Project helper built from the trained Word2Vec model.
# NOTE(review): presumably averages the word vectors of each document, as the
# section heading says — its implementation lives in UtilWordEmbedding.
mean_vec_tr = MeanEmbeddingVectorizer(word_model)
# One transform call over all tokenised documents.
doc_vec = mean_vec_tr.transform(all_docs.doc_words)

In [None]:
# Shape of the array returned by the vectorizer's `transform`.
doc_vec.shape

In [None]:
# Persist the document vectors as comma-separated text inside `dir_path`.
doc_vec_csv_path = os.path.join(dir_path, 'doc_vec.csv')
np.savetxt(doc_vec_csv_path, doc_vec, delimiter=',')

In [None]:
# Append the document-vector columns to the feature frame via an index join.
# Assumes `doc_vec` has one row per row of `df`, in the same order — TODO confirm.
doc_vec_df = pd.DataFrame(doc_vec)
mean_embedding_df = df.join(doc_vec_df)

## XGBoost on the averaged word embeddings

In [None]:
# NOTE(review): when the notebook is run top to bottom, `X_train` here is still
# the baseline feature frame; it is only rebuilt from `mean_embedding_df` in
# the following cell.
X_train

In [None]:
# Same split rule as the baseline: rows labelled with the literal string
# "[None]" form the test set, all other rows are training data.
has_label = mean_embedding_df["labels"] != "[None]"
train_df = mean_embedding_df[has_label]
test_df = mean_embedding_df[~has_label]

# Features are every column except the raw text, the label and the zipcode.
non_feature_cols = ['text', 'labels', 'zipcode']
X_train = train_df.drop(columns=non_feature_cols)
y_train = train_df["labels"]
X_test = test_df.drop(columns=non_feature_cols)
y_test = test_df["labels"]

In [None]:
# Wrap the feature matrices for xgboost's native training API
# (`xgb` is `import xgboost as xgb` from the import cell).
# The labels are strings read from the label file, so they are converted to
# float32 explicitly here, the same way the Keras cells convert them.
dtrain = xgb.DMatrix(
    np.array(X_train),
    label=np.array(y_train, dtype=np.float32),
)
dtest = xgb.DMatrix(np.array(X_test))

In [None]:
# scikit-learn style XGBoost classifier on the embedding-augmented features.
# NOTE(review): the next cell reassigns `y_pred` from the native booster, so
# these predictions are never written to disk.
model = XGBClassifier(
    n_estimators=100,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
)
model.fit(np.array(X_train), np.array(y_train))
y_pred = model.predict(np.array(X_test))


In [None]:
# Booster parameters for xgboost's native training API.
param = {
    'max_depth': 6,
    'eta': 0.3,
    'objective': 'binary:logistic',
    'subsample': 0.8,
    'nthread': 4,
    'eval_metric': 'auc',
}

# The number of trees is an argument of `xgb.train`, not a booster parameter:
# an "n_estimators" key in `param` is not used by `xgb.train`, which then
# trains its default number of rounds. Pass the intended 200 explicitly.
bst = xgb.train(param, dtrain, num_boost_round=200)

# Convert the booster's scores into 0/1 labels.
# NOTE(review): only scores above 0.95 are labelled 1 — confirm that this
# strict threshold is intentional.
y_pred = bst.predict(dtest)
y_pred = np.where(y_pred > 0.95, 1, 0)

In [None]:
# Write the author line followed by one prediction per line.
# `header=` with `comments=''` lets np.savetxt emit the first line itself
# (without the default '# ' prefix), so the file is written once instead of
# being written, read back and rewritten.
np.savetxt(
    './average_word2vec_predictions.out',
    y_pred,
    fmt='%s',
    header="Viraj Bhalala(vbb2)",
    comments='',
)

In [None]:
# Shape of the current training feature frame: (rows, feature columns).
X_train.shape

- F1 (XGBoost on averaged Word2Vec embeddings): 0.7027

## Deep Learning

In [None]:
def recall_m(y_true, y_pred):
    """Recall of `y_pred` against `y_true`, computed with Keras backend ops.

    The element-wise product `y_true * y_pred` is clipped to [0, 1] and
    rounded before being summed as the true-positive count; `y_true` is
    treated the same way to count the possible positives. `K.epsilon()` is
    added to the denominator so it is never exactly zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of `y_pred` against `y_true`, computed with Keras backend ops.

    The element-wise product `y_true * y_pred` is clipped to [0, 1] and
    rounded before being summed as the true-positive count; `y_pred` is
    treated the same way to count the predicted positives. `K.epsilon()` is
    added to the denominator so it is never exactly zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged_positives + K.epsilon())
def f1_m(y_true, y_pred):
    """F1 score: 2 * P * R / (P + R), built from `precision_m` and `recall_m`.

    `K.epsilon()` is added to the denominator so it is never exactly zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))

In [None]:
# Feed-forward binary classifier on the current training features.
# The input width is read from the training frame instead of being hard-coded
# to 201, so the cell keeps working if the number of feature columns changes.
n_features = X_train.shape[1]

model = Sequential()
for layer_idx, units in enumerate((150, 100, 50, 10)):
    # Only the first layer is given the input width.
    first_layer_kwargs = {"input_dim": n_features} if layer_idx == 0 else {}
    model.add(Dense(units, activation='linear', kernel_initializer="random_uniform",
                    **first_layer_kwargs))
    model.add(Dropout(0.5))

# NOTE(review): every hidden layer above uses a 'linear' activation — confirm
# this is intended rather than a non-linear activation.
model.add(Dense(1, activation='sigmoid', kernel_initializer="random_uniform", bias_initializer='zeros'))

model.compile(optimizer=optimizers.Adam(lr=0.0001),
              loss='binary_crossentropy',
              metrics=["binary_accuracy"])


# Cast features and labels to float32 arrays and train the network.
train_inputs = np.array(X_train, dtype=np.float32)
train_targets = np.array(y_train, dtype=np.float32)
model.fit(train_inputs, train_targets, epochs=100, batch_size=32)

# Sigmoid outputs above 0.5 become label 1, everything else label 0.
test_scores = model.predict(np.array(X_test, dtype=np.float32))
y_pred = np.where(test_scores > 0.5, 1, 0)
# Write the author line followed by one prediction per line.
# `header=` with `comments=''` lets np.savetxt emit the first line itself
# (without the default '# ' prefix), so the file is written once instead of
# being written, read back and rewritten.
np.savetxt(
    './average_word2vec_predictions_dl.out',
    y_pred,
    fmt='%s',
    header="Viraj Bhalala(vbb2)",
    comments='',
)

In [None]:
from UtilWordEmbedding import TfidfEmbeddingVectorizer
# Project helper built from the trained Word2Vec model.
# NOTE(review): presumably weights word vectors by TF-IDF, going by its name —
# its implementation lives in UtilWordEmbedding.
tfidf_vec_tr = TfidfEmbeddingVectorizer(word_model)

doc_tokens = all_docs.doc_words
tfidf_vec_tr.fit(doc_tokens)  # fit tfidf model first
tfidf_doc_vec = tfidf_vec_tr.transform(doc_tokens)

# Persist the resulting document vectors as comma-separated text.
tfidf_csv_path = os.path.join(dir_path, './tfidf_doc_vec.csv')
np.savetxt(tfidf_csv_path, tfidf_doc_vec, delimiter=',')


In [None]:
# Append the TF-IDF document-vector columns to the feature frame via an index join.
# Assumes `tfidf_doc_vec` has one row per row of `df`, in the same order — TODO confirm.
tfidf_doc_vec_df = pd.DataFrame(tfidf_doc_vec)
tfidf_mean_embedding_df = df.join(tfidf_doc_vec_df)

In [None]:
# Same split rule as before: rows labelled with the literal string "[None]"
# form the test set, all other rows are training data.
has_label = tfidf_mean_embedding_df["labels"] != "[None]"
train_df = tfidf_mean_embedding_df[has_label]
test_df = tfidf_mean_embedding_df[~has_label]

# Features are every column except the raw text, the label and the zipcode.
non_feature_cols = ['text', 'labels', 'zipcode']
X_train = train_df.drop(columns=non_feature_cols)
y_train = train_df["labels"]
X_test = test_df.drop(columns=non_feature_cols)
y_test = test_df["labels"]

In [None]:
# Feed-forward binary classifier on the current training features.
# The input width is read from the training frame instead of being hard-coded
# to 201, so the cell keeps working if the number of feature columns changes.
n_features = X_train.shape[1]

model = Sequential()
for layer_idx, units in enumerate((150, 100, 50, 10)):
    # Only the first layer is given the input width.
    first_layer_kwargs = {"input_dim": n_features} if layer_idx == 0 else {}
    model.add(Dense(units, activation='linear', kernel_initializer="random_uniform",
                    **first_layer_kwargs))
    model.add(Dropout(0.5))

# NOTE(review): every hidden layer above uses a 'linear' activation — confirm
# this is intended rather than a non-linear activation.
model.add(Dense(1, activation='sigmoid', kernel_initializer="random_uniform", bias_initializer='zeros'))

model.compile(optimizer=optimizers.Adam(lr=0.0001),
              loss='binary_crossentropy',
              metrics=["binary_accuracy"])


# Cast features and labels to float32 arrays and train the network.
train_inputs = np.array(X_train, dtype=np.float32)
train_targets = np.array(y_train, dtype=np.float32)
model.fit(train_inputs, train_targets, epochs=100, batch_size=64)

# Sigmoid outputs above 0.5 become label 1, everything else label 0.
test_scores = model.predict(np.array(X_test, dtype=np.float32))
y_pred = np.where(test_scores > 0.5, 1, 0)
# Write the author line followed by one prediction per line.
# The file name is specific to the TF-IDF model: the earlier deep-learning
# cell already writes './average_word2vec_predictions_dl.out', and reusing
# that name here would overwrite its predictions.
# `header=` with `comments=''` lets np.savetxt emit the first line itself
# (without the default '# ' prefix), so the file is written in a single pass.
np.savetxt(
    './tfidf_word2vec_predictions_dl.out',
    y_pred,
    fmt='%s',
    header="Viraj Bhalala(vbb2)",
    comments='',
)