# Yelp Reviews for Senti-Analysis Binary-N/P+

#### **Importing Libraries**

In [197]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud,STOPWORDS
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn import preprocessing,tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,recall_score,confusion_matrix,f1_score
from sklearn.model_selection import GridSearchCV
from scipy import sparse
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb
import string

#### **Importing the datasets**

In [None]:
! mkdir ~/.kaggle #creating folder
! cp kaggle.json ~/.kaggle/ #copying kaggle.json
! chmod 600 ~/.kaggle/kaggle.json #reading the file with full access

In [None]:
! kaggle datasets download -d yacharki/yelp-reviews-for-sentianalysis-binary-np-csv

In [None]:
!unzip /content/yelp-reviews-for-sentianalysis-binary-np-csv.zip

In [14]:
# Load the train and test splits into DataFrames.
# NOTE(review): the archive is unzipped by the previous cell, but the CSVs are read
# from /content/drive/MyDrive — presumably Google Drive was mounted and the files
# were copied there in a step not shown here; confirm before a fresh run.
yelp_train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
yelp_test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

In [15]:
print(yelp_train_data.shape)
print(yelp_test_data.shape)

(560000, 2)
(38000, 2)




- In total there are 560,000 training samples and 2 columns in them corresponding to **class index** (1 and 2) and **review text**. Negative polarity is class 1, and positive polarity is class 2.

- Also, we can see that there are 38000 testing samples.

#### **Text cleaning and Data Preprocessing** 

In [None]:
#cleaning the reviews
def cleaning_text(review):
    """Normalise one raw review string before vectorisation.

    Steps, in order: URLs, punctuation, non-ASCII characters, mentions,
    hashtags and digits are each replaced by whitespace; runs of whitespace
    are collapsed to a single space; the text is lower-cased.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text. A single leading/trailing space may remain because
        the result is not stripped.
    """
    #removing the url's
    review = re.sub(r'http\S+\s*', ' ', review)
    #removing the  punctuations ('?' is not part of this list, so question marks are kept)
    review = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>@[\]^_`{|}~"""), ' ', review)
    #removing non-ascii characters
    review = re.sub(r'[^\x00-\x7f]', r' ', review)
    #removing mentions (i.e, @)
    # NOTE(review): '@' and '#' were already replaced by the punctuation step above,
    # so this pattern and the hashtag pattern below can no longer match. The order is
    # kept as-is so the cleaned text stays identical to what the notebook produced.
    review = re.sub(r'@\S+', '  ', review)
    #removing hashtags
    review = re.sub(r'#\S+', ' ', review)
    #remove numbers
    review = re.sub(r"\d+", ' ', review)
    #removing extra whitespaces, wherever applicable
    review = re.sub(r'\s+', ' ', review)
    #converting the text into lowercase
    review = review.lower()

    # the original returned the undefined name `reviews` (NameError)
    return review

Here, we add a new column, **clean_review_text**, obtained by cleaning **review_text** (removing URLs, punctuation, numbers, etc.).

In [None]:
yelp_train_data['clean_review_text'] = yelp_train_data['review_text'].apply(cleaning_text)
yelp_test_data['clean_review_text'] = yelp_test_data['review_text'].apply(cleaning_text)

In [None]:
#train data after adding a new column with clean reviews
yelp_train_data.head()

Unnamed: 0,class_index,review_text,clean_review_text
0,1,"Unfortunately, the frustration of being Dr. Go...",unfortunately the frustration of being dr gold...
1,2,Been going to Dr. Goldberg for over 10 years. ...,been going to dr goldberg for over years i thi...
2,1,I don't know what Dr. Goldberg was like before...,i don t know what dr goldberg was like before ...
3,1,I'm writing this review to give you a heads up...,i m writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...,all the food is great here but the best thing ...


In [None]:
#test data after adding a new column with clean reviews
yelp_test_data.head()

Unnamed: 0,class_index,review_text,clean_review_text
0,2,"Contrary to other reviews, I have zero complai...",contrary to other reviews i have zero complain...
1,1,Last summer I had an appointment to get new ti...,last summer i had an appointment to get new ti...
2,2,"Friendly staff, same starbucks fair you get an...",friendly staff same starbucks fair you get any...
3,1,The food is good. Unfortunately the service is...,the food is good unfortunately the service is ...
4,2,Even when we didn't have a car Filene's Baseme...,even when we didn t have a car filene s baseme...


**Feature Engineering:** Adding custom features

In [None]:
def custom_features(data):
    """Add four length-based features derived from the ``clean_review_text`` column.

    The columns are written onto ``data`` itself (the frame is modified in
    place) and the same frame is returned.

    Added columns
    -------------
    no_of_words        : number of whitespace-separated words
    avg_length_word    : mean word length; NaN when the review has no words
    no_of_characters   : length of the cleaned text
    no_of_unique_words : number of distinct words
    """
    def _avg_word_length(text):
        # np.average([]) also evaluates to NaN but emits "Mean of empty slice"
        # RuntimeWarnings; returning NaN explicitly keeps empty reviews quiet
        # while still letting callers filter them out with dropna().
        words = str(text).split()
        if not words:
            return np.nan
        return np.average([len(each_word) for each_word in words])

    #number of words in each review
    data['no_of_words'] = data['clean_review_text'].apply(lambda x: len(str(x).split()))
    #average length of each word in each review
    data['avg_length_word'] = data['clean_review_text'].apply(_avg_word_length)
    #number of characters in each review
    data['no_of_characters'] = data['clean_review_text'].apply(lambda x: len(str(x)))
    #number of unique words in each review
    data['no_of_unique_words'] = data['clean_review_text'].apply(lambda x: len(set(str(x).split())))

    return data

In [None]:
yelp_train_data = custom_features(yelp_train_data)
yelp_test_data = custom_features(yelp_test_data)

  avg = a.mean(axis)


In [None]:
print(yelp_train_data.shape)
print(yelp_test_data.shape)

(560000, 7)
(38000, 7)


In [None]:
#train data after adding custom features
yelp_train_data.head()

Unnamed: 0,class_index,review_text,clean_review_text,no_of_words,avg_length_word,no_of_characters,no_of_unique_words
0,1,"Unfortunately, the frustration of being Dr. Go...",unfortunately the frustration of being dr gold...,118,4.237288,618,80
1,2,Been going to Dr. Goldberg for over 10 years. ...,been going to dr goldberg for over years i thi...,98,3.908163,481,71
2,1,I don't know what Dr. Goldberg was like before...,i don t know what dr goldberg was like before ...,213,4.234742,1115,132
3,1,I'm writing this review to give you a heads up...,i m writing this review to give you a heads up...,203,4.029557,1021,108
4,2,All the food is great here. But the best thing...,all the food is great here but the best thing ...,76,4.105263,388,53


In [None]:
#test data after adding custom features
yelp_test_data.head()

Unnamed: 0,class_index,review_text,clean_review_text,no_of_words,avg_length_word,no_of_characters,no_of_unique_words
0,2,"Contrary to other reviews, I have zero complai...",contrary to other reviews i have zero complain...,125,4.296,662,91
1,1,Last summer I had an appointment to get new ti...,last summer i had an appointment to get new ti...,72,3.861111,350,55
2,2,"Friendly staff, same starbucks fair you get an...",friendly staff same starbucks fair you get any...,15,5.0,90,14
3,1,The food is good. Unfortunately the service is...,the food is good unfortunately the service is ...,54,4.5,297,42
4,2,Even when we didn't have a car Filene's Baseme...,even when we didn t have a car filene s baseme...,143,4.230769,748,86


In [None]:
#let's see which column/columns have null values
print(yelp_train_data.isnull().sum())
print(yelp_test_data.isnull().sum())

class_index            0
review_text            0
clean_review_text      0
no_of_words            0
avg_length_word       29
no_of_characters       0
no_of_unique_words     0
dtype: int64
class_index           0
review_text           0
clean_review_text     0
no_of_words           0
avg_length_word       0
no_of_characters      0
no_of_unique_words    0
dtype: int64


In [None]:
#From the above output, we can see that in the training data the column 'avg_length_word' has null values.
#These come from reviews whose cleaned text contains no words: the average over an empty list of word lengths is NaN.
#The test data reported no nulls, so only the training frame is filtered.
#So,removing the rows that contain null values
yelp_train_data = yelp_train_data.dropna()
yelp_train_data.shape

(559971, 7)

Partitioning the training and testing data and filtering the columns.


In [None]:
#creating a dataframe only for custom features
# keep the four hand-crafted feature columns in one place so train and test stay in sync
custom_feature_columns = ['no_of_words','avg_length_word','no_of_characters','no_of_unique_words']
yelp_cf_train = yelp_train_data[custom_feature_columns]
yelp_cf_test = yelp_test_data[custom_feature_columns]
for cf_frame in (yelp_cf_train, yelp_cf_test):
    print(cf_frame.shape)

(559971, 4)
(38000, 4)


In [None]:
#target variables
y_train = yelp_train_data['class_index']
y_test = yelp_test_data['class_index']
print(y_train.shape)
print(y_test.shape)

(559971,)
(38000,)


#### **Count Vectorizer**

- Convert a collection of text documents to a matrix of token counts

In [None]:
def count_vect(train_data,test_data):
    """Encode train and test documents as token-count matrices.

    The vocabulary is learned from ``train_data`` only. With ``min_df=5``,
    terms whose document frequency is strictly lower than 5 are ignored.
    ``test_data`` is encoded with that same vocabulary.

    Returns
    -------
    tuple
        (train document-term matrix, test document-term matrix)
    """
    bag_of_words = CountVectorizer(min_df=5)
    # The tuple is evaluated left to right, so the vocabulary is fitted on the
    # training documents before the test documents are transformed.
    return bag_of_words.fit_transform(train_data), bag_of_words.transform(test_data)

In [None]:
yelp_train_cv,yelp_test_cv=count_vect(yelp_train_data['clean_review_text'],yelp_test_data['clean_review_text'])

In [None]:
print(yelp_train_cv.shape)
print(yelp_test_cv.shape)

(559971, 61467)
(38000, 61467)


#### **TF-IDF Vectorizer**

- Convert a collection of raw documents to a matrix of TF-IDF features.

In [None]:
def tf_idf_vect(train_data,test_data):
    """Encode train and test documents as TF-IDF feature matrices.

    The vocabulary is learned from ``train_data`` only. With ``min_df=5``,
    terms whose document frequency is strictly lower than 5 are ignored.
    ``test_data`` is encoded with the fitted vectorizer.

    Returns
    -------
    tuple
        (train TF-IDF matrix, test TF-IDF matrix)
    """
    tfidf = TfidfVectorizer(min_df=5)
    # The tuple is evaluated left to right, so fitting on the training
    # documents happens before the test documents are transformed.
    return tfidf.fit_transform(train_data), tfidf.transform(test_data)

In [None]:
yelp_train_tfidf,yelp_test_tfidf = tf_idf_vect(yelp_train_data['clean_review_text'],yelp_test_data['clean_review_text'])

In [None]:
print(yelp_train_tfidf.shape)
print(yelp_test_tfidf.shape)

(559971, 61467)
(38000, 61467)


#### **Feature Scaling(using l2 Norm)**

- Normalize samples individually to unit norm

In [None]:
def normalize_data(train_data,test_data):
    """Rescale the samples of both matrices with an L2 ``Normalizer``.

    This is a normalisation of the feature vectors (unit L2 norm), not a
    regularisation of a model.

    Returns
    -------
    tuple
        (normalised train matrix, normalised test matrix)
    """
    l2_normalizer = preprocessing.Normalizer(norm='l2')
    # fit_transform on the training data first, then transform the test data
    return l2_normalizer.fit_transform(train_data), l2_normalizer.transform(test_data)

### **Machine Learning models**

#### **Logistic Regression**

In [None]:
def logistic_reg(x_train,y_train,x_test,y_test,c,p,m):
    """Fit a logistic-regression classifier and score it on train and test data.

    Parameters
    ----------
    x_train, x_test : feature matrices used for fitting / evaluation
    y_train, y_test : true class labels
    c : value passed as ``C`` (inverse of the regularisation strength)
    p : value passed as ``penalty`` (norm used in the penalisation, e.g. 'l1' or 'l2')
    m : value passed as ``max_iter``

    Returns
    -------
    tuple
        (train_acc, test_acc, train_f1, test_f1); F1 is computed with
        ``f1_score``'s default settings.
    """
    # The default solver rejects penalty='l1'; switch to 'liblinear' only in that
    # case so every other call keeps using the library default.
    solver_kwargs = {'solver': 'liblinear'} if p == 'l1' else {}
    #logistic regression model
    log_reg = LogisticRegression(C = c,penalty=p,max_iter=m,**solver_kwargs)
    #fitting the model
    log_reg.fit(x_train,y_train)
    #predict on train data
    y_pred_train = log_reg.predict(x_train)
    #on test data
    y_pred_test = log_reg.predict(x_test)
    #calculating the metrics to evaluate the model
    # sklearn metrics take (y_true, y_pred); the original passed them reversed,
    # which only happens to give the same numbers for accuracy and binary F1.
    #accuracy
    train_acc = accuracy_score(y_train,y_pred_train)
    test_acc = accuracy_score(y_test,y_pred_test)
    #f1score
    train_f1 = f1_score(y_train,y_pred_train)
    test_f1 = f1_score(y_test,y_pred_test)

    return train_acc,test_acc,train_f1,test_f1

**Logistic Regression with count Vectorizer**

In [None]:
#train data and test data consists of both custom features and count vectorizer matrix
train_data = sparse.hstack([yelp_train_cv,yelp_cf_train])
test_data = sparse.hstack([yelp_test_cv,yelp_cf_test])

In [None]:
#now normalizing the data before passing into the model
X_scaled_train_data,X_scaled_test_data = normalize_data(train_data,test_data)

Using the **GridSearchCV** to find the best parameters and train the model using those parameters

In [None]:
# LogisticRegression's default solver cannot fit penalty='l1', so in a single
# combined grid every 'l1' candidate fails to fit. Split the grid: 'l2' keeps the
# default solver, 'l1' is paired with 'liblinear', which supports it.
c_values = [0.01,0.1,1,5,10]
parameters = [
    {'C': c_values, "penalty": ["l2"]},
    {'C': c_values, "penalty": ["l1"], "solver": ["liblinear"]},
]
lr = LogisticRegression()
lr_gscv = GridSearchCV(lr, parameters)
lr_gscv.fit(X_scaled_train_data,y_train)

Below, we can see the best parameters given by the GridSearchCV. {**C: 10, penalty:'l2'**}

In [None]:
lr_gscv.best_params_

{'C': 10, 'penalty': 'l2'}

In [None]:
#logistic regression with count vectorizer using the best parameters as per GridSearchCV
yelp_train_acc,yelp_test_acc,yelp_train_f1,yelp_test_f1 = logistic_reg(X_scaled_train_data,y_train,X_scaled_test_data,y_test,10,'l2',300)

In [None]:
print(f'Accuracy on train data:{round(yelp_train_acc*100,2)}%')
print(f'Accuracy on test data:{round(yelp_test_acc*100,2)}%')
print(f'F1-score on train data:{yelp_train_f1}')
print(f'F1-score on test data:{yelp_test_f1}')

Accuracy on train data:85.86%
Accuracy on test data:86.02%
F1-score on train data:0.861275665525782
F1-score on test data:0.8626822458897735


**Logistic Regression with TF-IDF Vectorizer**

In [None]:
#train data and test data consists of both custom features and tf-idf vectorizer matrix
train_data_tfidf = sparse.hstack([yelp_train_tfidf,yelp_cf_train])
test_data_tfidf = sparse.hstack([yelp_test_tfidf,yelp_cf_test])

In [None]:
#scaling the data
# NOTE(review): none of the later cells visible in this notebook read these scaled
# TF-IDF matrices; the TF-IDF models below are fit on the unscaled
# train_data_tfidf / test_data_tfidf. Confirm whether that is intended.
X_scaled_train_data_tfidf,X_scaled_test_data_tfidf = normalize_data(train_data_tfidf,test_data_tfidf)

In [None]:
#logistic regression with tf-idf vectorizer(without feature scaling) and best parameters
yelp_train_acc_tfidf,yelp_test_acc_tfidf,yelp_train_f1_tfidf,yelp_test_f1_tfidf = logistic_reg(train_data_tfidf,y_train,test_data_tfidf,y_test,10,'l2',300)

In [None]:
print(f'Accuracy on train data:{round(yelp_train_acc_tfidf*100,2)}%')
print(f'Accuracy on test data:{round(yelp_test_acc_tfidf*100,2)}%')
print(f'F1-score on train data:{yelp_train_f1_tfidf}')
print(f'F1-score on test data:{yelp_test_f1_tfidf}')

Accuracy on train data:91.56%
Accuracy on test data:91.7%
F1-score on train data:0.9159846410921889
F1-score on test data:0.9175467949388267


**Analysis:**

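We can observe that logistic regression (using the best parameters) with the TF-IDF vectorizer gives better accuracy and F1-score than with the count vectorizer. Note that the TF-IDF run used the features without L2 scaling, whereas the count-vectorizer run used the scaled features, so the two setups differ in more than the vectorizer.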

#### **Support Vector Machines (SVMs)**

**Linear Support Vector Classification(LinearSVC)**


In [None]:
def linear_svc_model(x_train,y_train,x_test,y_test,c,t):
    """Fit an L2-penalised LinearSVC and score it on train and test data.

    Parameters
    ----------
    x_train, x_test : feature matrices used for fitting / evaluation
    y_train, y_test : true class labels
    c : value passed as ``C`` (regularisation parameter)
    t : value passed as ``tol`` (tolerance for the stopping criterion)

    Returns
    -------
    tuple
        (train_acc, test_acc, train_f1, test_f1); F1 is computed with
        ``f1_score``'s default settings.
    """
    #linear svc model.
    linear_svc = LinearSVC(penalty='l2',C=c,tol=t)
    #fitting the model
    linear_svc.fit(x_train,y_train)
    #predict on train data
    y_pred_train = linear_svc.predict(x_train)
    #on test data
    y_pred_test = linear_svc.predict(x_test)
    #calculating the metrics to evaluate the model
    # sklearn metrics take (y_true, y_pred); the original passed them reversed,
    # which only happens to give the same numbers for accuracy and binary F1.
    #accuracy
    train_acc = accuracy_score(y_train,y_pred_train)
    test_acc = accuracy_score(y_test,y_pred_test)
    #f1score
    train_f1 = f1_score(y_train,y_pred_train)
    test_f1 = f1_score(y_test,y_pred_test)

    return train_acc,test_acc,train_f1,test_f1

**LinearSVC with count Vectorizer**

**GridSearchCV** to find the best parameters

In [None]:
# Grid over the regularisation parameter C and the stopping tolerance tol.
# NOTE(review): the tol candidates (1, 5, 10) are several orders of magnitude
# looser than the 0.0001 used for the TF-IDF LinearSVC run further down —
# confirm these values are intended.
parameters = {'C':[0.001,0.01, 0.1, 1, 10],'tol':[1,5,10]}
lsvc = LinearSVC()
clf = GridSearchCV(lsvc, parameters)
clf.fit(X_scaled_train_data,y_train)

GridSearchCV(estimator=LinearSVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10], 'tol': [1, 5, 10]})

The best parameter values for **C** and **tol** are shown below. Here, C is the regularization parameter and tol is the tolerance for the stopping criterion.

In [None]:
clf.best_params_

{'C': 10, 'tol': 1}

In [None]:
#LinearSVC  
#using the same training and testing data, as used for the logistic regression model with count vectorizer
#c and t values are as per the GridSearchCV results.  
train_acc_cv_svc,test_acc_cv_svc,train_f1_cv_svc,test_f1_cv_svc = linear_svc_model(X_scaled_train_data,y_train,X_scaled_test_data,y_test,10,1)

In [None]:
print(f'Accuracy on train data:{round(train_acc_cv_svc*100,2)}%')
print(f'Accuracy on test data:{round(test_acc_cv_svc*100,2)}%')
print(f'F1-score on train data:{train_f1_cv_svc}')
print(f'F1-score on test data:{test_f1_cv_svc}')

Accuracy on train data:90.94%
Accuracy on test data:91.06%
F1-score on train data:0.9095136960774348
F1-score on test data:0.910880176350181


**LinearSVC with TF-IDF Vectorizer**

In [None]:
#LinearSVC  
#using the same training and testing data(without scaling), as used for the logistic regression model with tf-idf vectorizer  
train_acc_tfidf_svc,test_acc_tfidf_svc,train_f1_tfidf_svc,test_f1_tfidf_svc = linear_svc_model(train_data_tfidf,y_train,test_data_tfidf,y_test,1,0.0001)

In [None]:
print(f'Accuracy on train data:{round(train_acc_tfidf_svc*100,2)}%')
print(f'Accuracy on test data:{round(test_acc_tfidf_svc*100,2)}%')
print(f'F1-score on train data:{train_f1_tfidf_svc}')
print(f'F1-score on test data:{test_f1_tfidf_svc}')

Accuracy on train data:89.94%
Accuracy on test data:90.02%
F1-score on train data:0.9048616726909292
F1-score on test data:0.9055014083804871


**Analysis:**

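We can observe that the LinearSVC model with the count vectorizer and the LinearSVC model with the TF-IDF vectorizer give similar accuracy and F1-score, with the count-vectorizer model slightly ahead. Note that the two runs also differ in hyperparameters (C=10, tol=1 versus C=1, tol=0.0001) and in feature scaling, so the difference is not due to the vectorizer alone.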

#### **Naive-Bayes classifier**

**Multinomial Naive Bayes classifier**

In [None]:
def mn_bayes_classifier(x_train,y_train,x_test,y_test,a):
    """Fit a Multinomial Naive Bayes classifier and score it on train and test data.

    Parameters
    ----------
    x_train, x_test : feature matrices used for fitting / evaluation
    y_train, y_test : true class labels
    a : value passed as ``alpha`` to ``MultinomialNB``

    Returns
    -------
    tuple
        (train_acc, test_acc, train_f1, test_f1); F1 is computed with
        ``f1_score``'s default settings.
    """
    #Multinomial Naive Bayes classifier.
    mnb_cls = MultinomialNB(alpha = a)
    #fitting the model
    mnb_cls.fit(x_train,y_train)
    #predict on train data
    y_pred_train = mnb_cls.predict(x_train)
    #on test data
    y_pred_test = mnb_cls.predict(x_test)
    #calculating the metrics to evaluate the model
    # sklearn metrics take (y_true, y_pred); the original passed them reversed,
    # which only happens to give the same numbers for accuracy and binary F1.
    #accuracy
    train_acc = accuracy_score(y_train,y_pred_train)
    test_acc = accuracy_score(y_test,y_pred_test)
    #f1score
    train_f1 = f1_score(y_train,y_pred_train)
    test_f1 = f1_score(y_test,y_pred_test)

    return train_acc,test_acc,train_f1,test_f1

**Multinomial Naive Bayes classifier with Count Vectorizer**

**GridSearchCV** for finding the best alpha value

In [None]:
parameters = {'alpha':[0.0001,0.001,0.01,0.1,0.2,0.4,0.6,0.8,1]}
m_nb =  MultinomialNB()
mnb_cv = GridSearchCV(m_nb, parameters)
mnb_cv.fit(X_scaled_train_data,y_train)

GridSearchCV(estimator=MultinomialNB(),
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.4, 0.6, 0.8,
                                   1]})

In [None]:
mnb_cv.best_params_

{'alpha': 0.001}

In [None]:
#Mutinomial Naive Bayes classifier with count vectorizer
#Alpha value is used as per GridSearchCV  
train_acc_cv_mnb,test_acc_cv_mnb,train_f1_cv_mnb,test_f1_cv_mnb = mn_bayes_classifier(X_scaled_train_data,y_train,X_scaled_test_data,y_test,0.001)

In [None]:
print(f'Accuracy on train data:{round(train_acc_cv_mnb*100,2)}%')
print(f'Accuracy on test data:{round(test_acc_cv_mnb*100,2)}%')
print(f'F1-score on train data:{train_f1_cv_mnb}')
print(f'F1-score on test data:{test_f1_cv_mnb}')

Accuracy on train data:88.72%
Accuracy on test data:88.18%
F1-score on train data:0.8904015487416473
F1-score on test data:0.8854774958561775


**Multinomial Naive Bayes classifier with tf-idf Vectorizer**

**GridSearchCV** for finding the best alpha value

In [None]:
parameters = {'alpha':[0.0001,0.001,0.01,0.1,0.2,0.4,0.6,0.8,1]}
m_nb =  MultinomialNB()
mnb_tfidf = GridSearchCV(m_nb, parameters)
mnb_tfidf.fit(train_data_tfidf,y_train)

GridSearchCV(estimator=MultinomialNB(),
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.4, 0.6, 0.8,
                                   1]})

In [None]:
mnb_tfidf.best_params_

{'alpha': 0.01}

In [None]:
#Mutinomial Naive Bayes classifier with tf-idf vectorizer  
#Alpha value is used as per GridSearchCV  
train_acc_tfidf_mnb,test_acc_tfidf_mnb,train_f1_tfidf_mnb,test_f1_tfidf_mnb = mn_bayes_classifier(train_data_tfidf,y_train,test_data_tfidf,y_test,0.01)

In [None]:
print(f'Accuracy on train data:{round(train_acc_tfidf_mnb*100,2)}%')
print(f'Accuracy on test data:{round(test_acc_tfidf_mnb*100,2)}%')
print(f'F1-score on train data:{train_f1_tfidf_mnb}')
print(f'F1-score on test data:{test_f1_tfidf_mnb}')

Accuracy on train data:76.82%
Accuracy on test data:76.41%
F1-score on train data:0.7463870474895427
F1-score on test data:0.7413584165272088


**Analysis:** We can observe that Multinomial Naive Bayes + Count Vectorizer is giving us the better accuracy and F1-score

#### **XGBoost Classifier**

In [None]:
def xgb_classifier(x_train,y_train,x_test,y_test):
    """Fit an XGBoost binary classifier and score it on train and test data.

    Parameters
    ----------
    x_train, x_test : feature matrices used for fitting / evaluation
    y_train, y_test : true class labels (any two distinct label values)

    Returns
    -------
    tuple
        (train_acc, test_acc, train_f1, test_f1); metrics are computed on the
        original label values, F1 with ``f1_score``'s default settings.
    """
    # Recent XGBoost releases only accept class labels 0..n_classes-1, whereas
    # this notebook's labels are 1 and 2. Encode the labels for fitting and map
    # the predictions back so the metrics see the original labels.
    label_encoder = preprocessing.LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    #XGBoost classifier
    xgb_cls = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
    #fitting the model
    xgb_cls.fit(x_train,y_train_encoded)
    #predict on train data
    y_pred_train = label_encoder.inverse_transform(xgb_cls.predict(x_train))
    #on test data
    y_pred_test = label_encoder.inverse_transform(xgb_cls.predict(x_test))
    #calculating the metrics to evaluate the model
    # sklearn metrics take (y_true, y_pred); the original passed them reversed,
    # which only happens to give the same numbers for accuracy and binary F1.
    #accuracy
    train_acc = accuracy_score(y_train,y_pred_train)
    test_acc = accuracy_score(y_test,y_pred_test)
    #f1score
    train_f1 = f1_score(y_train,y_pred_train)
    test_f1 = f1_score(y_test,y_pred_test)

    return train_acc,test_acc,train_f1,test_f1

**XGBoost Classifier with Count Vectorizer**

In [None]:
train_acc_cv_xgb,test_acc_cv_xgb,train_f1_cv_xgb,test_f1_cv_xgb = xgb_classifier(X_scaled_train_data,y_train,X_scaled_test_data,y_test)

In [None]:
print(f'Accuracy on train data:{round(train_acc_cv_xgb*100,2)}%')
print(f'Accuracy on test data:{round(test_acc_cv_xgb*100,2)}%')
print(f'F1-score on train data:{train_f1_cv_xgb}')
print(f'F1-score on test data:{test_f1_cv_xgb}')

Accuracy on train data:85.08%
Accuracy on test data:85.11%
F1-score on train data:0.8505760282130427
F1-score on test data:0.8507348478851685


**XGBoost Classifier with TF-IDF Vectorizer**

In [None]:
train_acc_tfidf_xgb,test_acc_tfidf_xgb,train_f1_tfidf_xgb,test_f1_tfidf_xgb = xgb_classifier(train_data_tfidf,y_train,test_data_tfidf,y_test)

In [None]:
print(f'Accuracy on train data:{round(train_acc_tfidf_xgb*100,2)}%')
print(f'Accuracy on test data:{round(test_acc_tfidf_xgb*100,2)}%')
print(f'F1-score on train data:{train_f1_tfidf_xgb}')
print(f'F1-score on test data:{test_f1_tfidf_xgb}')

Accuracy on train data:85.14%
Accuracy on test data:85.14%
F1-score on train data:0.8510208612330543
F1-score on test data:0.8507533703409992


**Analysis:**

We can see that the XGBoost Classifier with count vectorizer and TF-IDF vectorizer are almost giving us the same results.