In [27]:
!pip install gensim
!pip install pyrsm --user



In [1]:
import pickle
import pandas as pd
import itertools
from collections import Counter
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import word2vec
from sklearn.linear_model import LogisticRegression
import os
import string
from collections import defaultdict
from math import log
from gensim.models import TfidfModel
from sklearn.metrics import f1_score
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Functions

In [3]:
# A function used to build a vocabulary based on descending word frequencies 
def build_vocab(sentences):
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return word_counts, vocabulary, vocabulary_inv

In [4]:
# A function used to learn word embeddings through Word2vec module
def get_embeddings(inp_data, vocabulary_inv, size_features=100,
                   mode='skipgram',
                   min_word_count=2,
                   context=5):
    model_name = "embedding"
    model_name = os.path.join(model_name)
    num_workers = 15  # Number of threads to run in parallel
    downsampling = 1e-3  # Downsample setting for frequent words
    print('Training Word2Vec model...')
    # use inp_data and vocabulary_inv to reconstruct sentences
    sentences = [[vocabulary_inv[w] for w in s] for s in inp_data]
    if mode == 'skipgram':
        sg = 1
        print('Model: skip-gram')
    elif mode == 'cbow':
        sg = 0
        print('Model: CBOW')
    embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                        sg=sg,
                                        vector_size=size_features,
                                        min_count=min_word_count,
                                        window=context,
                                        sample=downsampling)
    print("Saving Word2Vec model {}".format(model_name))
    embedding_weights = np.zeros((len(vocabulary_inv), size_features))
    for i in range(len(vocabulary_inv)):
        word = vocabulary_inv[i]
        if word in embedding_model.wv:
            embedding_weights[i] = embedding_model.wv[word]
        else:
            embedding_weights[i] = np.random.uniform(-0.25, 0.25,
                                                     embedding_model.vector_size)
    return embedding_weights, embedding_model

In [5]:
from nltk.stem import PorterStemmer 
ps = PorterStemmer() 
def preprocess_df(df, stemming=False):
    # get English stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')
    stop_words.add('The')
    # prepare translation table to translate punctuation to space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    preprocessed_sentences = []
    for i, row in df.iterrows():
        sent = row["text"]
        sent_nopuncts = sent.translate(translator)
        words_list = sent_nopuncts.strip().split()
        if stemming == True:
            words_list = [ps.stem(word) for word in words_list]
        filtered_words = [word for word in words_list if word not in stop_words and len(word) != 1] # also skip space from above translation
        preprocessed_sentences.append(" ".join(filtered_words))
    df["text"] = preprocessed_sentences
    return df

In [6]:
def process_attributes(df, test=False):
    attribute_list = ['name','stars','review_count','attributes.OutdoorSeating','attributes.BusinessAcceptsCreditCards','attributes.RestaurantsTableService',
    'attributes.RestaurantsReservations','attributes.RestaurantsPriceRange2','attributes.HasTV','attributes.RestaurantsGoodForGroups','attributes.Caters',
    'attributes.RestaurantsTakeOut','attributes.RestaurantsDelivery','attributes.GoodForKids', 'attributes.BikeParking', 'latitude', 'longitude', 'postal_code', 'is_open', 'text']
    if test == False:
        attribute_list.append('label')
    else:
        pass
    df = df[attribute_list]
    col_names = df.columns
    col_name_clean = list(df[attribute_list].columns.str.replace('attributes.', ''))
    new_column_names = {column: column.replace('attributes.', '') for column in col_names}
    df = df.rename(columns=new_column_names)
    d = {'False': 0, 'True': 1}
    pattern = r'\b\d+\b(?:\s+\b\d+\b)*\s*'
    #clean b'
    for col in df:
        if col != 'text' and col != 'label' and col != 'name' and col != 'latitude' and col != 'longitude' and col != 'postal_code' and col != 'is_open':
            try:
                df[col] = df[col].str.extract(r"b'(.*?)'")
                df[col] = df[col].map(d)
            except:
                pass
        if col == 'name' or col == 'postal_code':
            df[col] = df[col].str.extract(r"b'(.*?)'")
        ## 0 if Canada postal code 1 if US
        if col == 'postal_code':
            df[col] = df[col].str.match(pattern)
    df = df.fillna(0)
    df['text'] = df['name'].astype(str) + ' ' + df['text']
    df[['OutdoorSeating','BusinessAcceptsCreditCards','RestaurantsTableService','RestaurantsReservations','RestaurantsPriceRange2','HasTV','RestaurantsGoodForGroups',
    'Caters','RestaurantsTakeOut','RestaurantsDelivery','GoodForKids', 'postal_code', 'is_open', 'BikeParking']] = df[['OutdoorSeating','BusinessAcceptsCreditCards','RestaurantsTableService',
    'RestaurantsReservations','RestaurantsPriceRange2','HasTV','RestaurantsGoodForGroups','Caters','RestaurantsTakeOut','RestaurantsDelivery','GoodForKids', 'postal_code', 'is_open', 'BikeParking']].astype('category')

    return df

In [7]:
def get_train_vali_test(X, y, ratio_train = 0.8, ratio_val = 0.1, ratio_test = 0.1):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ratio_test, random_state=42, shuffle=True)
    ratio_remaining = 1 - ratio_test
    ratio_val_adjusted = ratio_val / ratio_remaining
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=ratio_val_adjusted, random_state=42, shuffle=True)
    return X_train, X_test, X_val, y_train, y_test, y_val

# Main

In [8]:
data_path = "/."

df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

df_train["text"] = df_train["review"]
df_test["text"] = df_test["review"]
df_train = preprocess_df(df_train, stemming=False)
df_test = preprocess_df(df_test, stemming=False)
df_train = process_attributes(df_train)
df_test = process_attributes(df_test, test=True)

  col_name_clean = list(df[attribute_list].columns.str.replace('attributes.', ''))
  col_name_clean = list(df[attribute_list].columns.str.replace('attributes.', ''))


In [9]:
X_train_rf, X_test_rf, X_val_rf, y_train_rf, y_test_rf, y_val_rf = \
    get_train_vali_test(df_train.drop(['text', 'label', 'name', 'RestaurantsPriceRange2'], axis=1), df_train['label'])

In [10]:
clf_rf = RandomForestClassifier(max_depth=11, n_estimators=500, max_features=3, random_state=0)
clf_rf.fit(X_train_rf, y_train_rf)
f1_score(y_test_rf, clf_rf .predict(X_test_rf), average='micro')

0.3726235741444867

In [11]:
f1_score(y_val_rf, clf_rf .predict(X_val_rf), average='micro')

0.4038022813688213

In [12]:
roc_auc_score(y_test_rf, clf_rf.predict_proba(X_test_rf), multi_class='ovr')

0.7785755177018053

In [13]:
feature_names = list(X_train_rf.columns)
importance = pd.Series(clf_rf.feature_importances_, index=feature_names).sort_values(ascending=False)
feature_imp = pd.DataFrame(sorted(zip(importance,importance.index)), columns=['Value','Feature'])

In [14]:
importance

latitude                      0.164870
longitude                     0.164245
review_count                  0.129298
stars                         0.086543
RestaurantsDelivery           0.063767
OutdoorSeating                0.047138
RestaurantsReservations       0.045378
GoodForKids                   0.043876
HasTV                         0.038531
Caters                        0.033661
is_open                       0.030708
RestaurantsTableService       0.029000
BikeParking                   0.028762
postal_code                   0.027794
RestaurantsGoodForGroups      0.022933
BusinessAcceptsCreditCards    0.022625
RestaurantsTakeOut            0.020870
dtype: float64

In [16]:
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=True,
                        preprocessor=None,  # applied preprocessor in Data Cleaning
                        tokenizer=word_tokenize,
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True,
                        stop_words= 'english',
                        max_df=0.4,
                        sublinear_tf=True)

In [18]:
df_train["text"]

0        Rush Inn So stopped way Side Quest street nWe ...
1        GreenMix This go healthy spot food always fres...
2        BarBurrito - Gerrard Food court meal Gerrard S...
3        SalvaMex Located Rainbow Charleston small fami...
4        Hop Hing No frills Chinese takeout joint serve...
                               ...                        
13139    Mariscos Vuelve a La Vida Worst food ever asha...
13140    Alize Catering My experience restaurant little...
13141    Taco Bell Giving one star write review Place s...
13142    Sushi Kai It good deal voucher However food no...
13143    China House write many reviews place good food...
Name: text, Length: 13144, dtype: object

In [17]:
X =  tfidf.fit_transform(df_train["text"])



In [155]:
X_train, X_test, X_val, y_train, y_test, y_val = get_train_vali_test(X, df_train['label'])

In [55]:
#tfidf_vec =  tfidf.fit_transform(X_train["text"])
#additional_features = sp.csr_matrix(X_train.drop(['text'], axis=1))
#X_train = sp.hstack((tfidf_vec,additional_features))

# Model Training

## logit

In [156]:
clf = LogisticRegression(max_iter=100000000, C=5, class_weight='balanced').fit(X_train, y_train)
f1_score(y_test, clf.predict(X_test), average='micro')

0.8167300380228137

In [157]:
f1_score(y_val, clf.predict(X_val), average='micro')

0.8076045627376426

In [158]:
roc_auc_score(y_test, clf.predict_proba(X_test), multi_class='ovr')

0.9686476545898808

In [159]:
roc_auc_score(y_test, 0.9*clf.predict_proba(X_test) + 0.1*clf_rf.predict_proba(X_test_rf), multi_class='ovr')

0.969639674526927

In [160]:
clf.classes_

array(['american (new)', 'american (traditional)', 'asian fusion',
       'canadian (new)', 'chinese', 'italian', 'japanese',
       'mediterranean', 'mexican', 'thai'], dtype=object)

In [173]:
pred_index = [d.tolist().index(max(d)) for d in (0.6*clf.predict_proba(X_test) + 0.4*clf_rf.predict_proba(X_test_rf))]
pred_test = [clf.classes_[i] for i in pred_index]
f1_score(y_test, pred_test, average='micro')

0.8212927756653993

In [174]:
pred_index = [d.tolist().index(max(d)) for d in (0.6*clf.predict_proba(X_val) + 0.4*clf_rf.predict_proba(X_val_rf))]
pred_val = [clf.classes_[i] for i in pred_index]
f1_score(y_val, pred_val, average='micro')

0.817490494296578

In [175]:
f1_score(y_val, pred_val, average=None)

array([0.5       , 0.73368607, 0.51515152, 0.51764706, 0.93841642,
       0.93203883, 0.91089109, 0.88      , 0.94444444, 0.92631579])

In [176]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, pred_val, labels=clf.classes_)

array([[ 70,  61,   2,   3,   1,   4,   0,   5,   3,   0],
       [ 41, 208,   0,   9,   1,   4,   0,   0,   2,   0],
       [  2,   2,  17,   2,   9,   1,   4,   0,   1,   2],
       [  2,   9,   0,  22,   0,   2,   0,   0,   2,   0],
       [  1,   2,   0,   1, 160,   0,   2,   0,   0,   1],
       [  5,   7,   0,   5,   0, 192,   0,   0,   0,   0],
       [  3,   2,   3,   0,   3,   0,  92,   0,   1,   0],
       [  1,   4,   0,   6,   0,   0,   0,  66,   2,   0],
       [  6,   6,   1,   0,   0,   0,   0,   0, 204,   0],
       [  0,   1,   3,   0,   0,   0,   0,   0,   0,  44]], dtype=int64)

In [23]:
# in your implemetation, create the output file using the same format
dic = {"Id": [], "Predicted": []}
df_X_test_lr = tfidf.transform(df_test['text'])
df_X_test_rf = df_test.drop(['text', 'name', 'RestaurantsPriceRange2'], axis=1)

In [24]:
pred_index = [d.tolist().index(max(d)) for d in (0.9*clf.predict_proba(df_X_test_lr) + 0.1*clf_rf.predict_proba(df_X_test_rf))]
preds = [clf.classes_[i] for i in pred_index]
for i, pred in enumerate(preds):
    dic["Id"].append(i)
    dic["Predicted"].append(pred)

dic_df = pd.DataFrame.from_dict(dic)
#dic_df.to_csv("predicted.csv", index=False)