In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import StratifiedShuffleSplit

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import os
import pandas as pd
os.chdir('D:\APDS\Project\Yelp\py workspace')

In [3]:
df_recommended = pd.read_csv('..\\dataset\\RestaurantData\\recommended.csv')
df_not_recommended = pd.read_csv('..\\dataset\\RestaurantData\\not_recommended.csv')
frames = [df_recommended, df_not_recommended]
df = pd.concat(frames, keys=['1','0'])

In [4]:
def loadData(Text=None):
    for row in df.itertuples():
        (Id, Text, Label) = parseReview(row)
        rawData.append((Id, Text, Label))
        preprocessedData.append((Id, preProcess(Text), Label))

In [5]:
def parseReview(reviewRow):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label    
    return (reviewRow[2], reviewRow[4], reviewRow[0][0])

In [6]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION

# Input: a string of one review
def preProcess(text):
    # Should return a list of tokens
    return word_tokenize(text)

In [7]:
def splitData(testSize):
    rawDataArr = np.array(rawData)
    X = rawDataArr[:,[0,1]]
    y = rawDataArr[:,[2]]
    sss = StratifiedShuffleSplit(n_splits=1, test_size=testSize, random_state=0)
    sss.get_n_splits(X, y)
    
    for train_index, test_index in sss.split(X, y):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        train = np.concatenate((X_train,y_train),axis=1) #change this if number of splits>1
        test = np.concatenate((X_test,y_test),axis=1) #change this if number of splits>1
        
    for (_, Text, Label) in train[:]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
        rawTrainData.append((Text,Label))
    for (_, Text, Label) in test[:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))
        rawTestData.append((Text,Label))

In [8]:
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    localDict = {}
    for token in tokens:
        if token not in featureDict:
            featureDict[token] = 1
        else:
            featureDict[token] = +1
   
        if token not in localDict:
            localDict[token] = 1
        else:
            localDict[token] = +1
    
    return localDict

In [9]:
# loading reviews
rawData = []          
preprocessedData = [] 
trainData = []        
testData = []   
rawTrainData = []
rawTestData = []

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData()
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')

splitData(0.3)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 4419 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 4419 rawData, 3093 trainData, 1326 testData
Training Samples: 
3093
Features: 
22994


In [10]:
dataDF = pandas.DataFrame({'text':list( rawData[i][1] for i in range(len(rawData)-1)),
                          'label':list( rawData[i][2] for i in range(len(rawData)-1))})

train_x = np.asarray(list( rawTrainData[i][0] for i in range(len(rawTrainData)-1)))
valid_x = np.asarray(list( rawTestData[i][0] for i in range(len(rawTestData)-1))) 
train_y = list( rawTrainData[i][1] for i in range(len(rawTrainData)-1))
valid_y = list( rawTestData[i][1] for i in range(len(rawTestData)-1))

### Count vectors as features

In [11]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(dataDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

### TF IDF as features

In [12]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(dataDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(dataDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(dataDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

### Word Embeddings

In [13]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open('../dataset/wiki-news-300d-1M.vec', encoding="utf8")):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(dataDF['text'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)

# create token-embedding mapping
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [14]:
dataDF['char_count'] = dataDF['text'].apply(len)
dataDF['word_count'] = dataDF['text'].apply(lambda x: len(x.split()))
dataDF['word_density'] = dataDF['char_count'] / (dataDF['word_count']+1)
dataDF['punctuation_count'] = dataDF['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
dataDF['title_word_count'] = dataDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
dataDF['upper_case_word_count'] = dataDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [62]:
#nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Sahil Tyagi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [15]:
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

dataDF['noun_count'] = dataDF['text'].apply(lambda x: check_pos_tag(x, 'noun'))
dataDF['verb_count'] = dataDF['text'].apply(lambda x: check_pos_tag(x, 'verb'))
dataDF['adj_count'] = dataDF['text'].apply(lambda x: check_pos_tag(x, 'adj'))
dataDF['adv_count'] = dataDF['text'].apply(lambda x: check_pos_tag(x, 'adv'))
dataDF['pron_count'] = dataDF['text'].apply(lambda x: check_pos_tag(x, 'pron'))

### Topic Models as features

In [16]:
# train a LDA Model
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_ 
vocab = count_vect.get_feature_names()

# view the topic models
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

### Model Building

In [17]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

### Naive Bayes

In [19]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print ("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.7924528301886793
NB, WordLevel TF-IDF:  0.7275471698113207
NB, N-Gram Vectors:  0.7275471698113207
NB, CharLevel Vectors:  0.7230188679245283


### Linear Classifier

In [23]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print ("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("LR, CharLevel Vectors: ", accuracy)

LR, Count Vectors:  0.7856603773584906
LR, WordLevel TF-IDF:  0.7916981132075471
LR, N-Gram Vectors:  0.7373584905660377
LR, CharLevel Vectors:  0.7886792452830189


### SVM

In [25]:
# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("SVM, N-Gram Vectors: ", accuracy)

SVM, N-Gram Vectors:  0.7071698113207547


### Bagging

In [26]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print ("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("RF, WordLevel TF-IDF: ", accuracy)

RF, Count Vectors:  0.7283018867924528
RF, WordLevel TF-IDF:  0.7411320754716981


### Boosting

In [27]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print ("Xgb, Count Vectors: ", accuracy)

# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print ("Xgb, WordLevel TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print ("Xgb, CharLevel Vectors: ", accuracy)

  if diff:


Xgb, Count Vectors:  0.7773584905660378


  if diff:


Xgb, WordLevel TF-IDF:  0.780377358490566
Xgb, CharLevel Vectors:  0.7849056603773585


  if diff:
