# **WARNING** 

### We are dealing with raw web data. Some of the information that is retrieved might contain certain explicit content (words, phrases, or references)

# Data Engineering - NLP

## Exercise 1: NLP Tweets

For this exercise, use TfidfVectorizer and any TWO classification models you would like to correctly identify the sentiments of each review, in the Restaurant_Reviews.tsv file, as (Positive, or Negative). 

### Remember:
    1. Split your data into Train and Test sets
    2. Evaluate your model using the metrics of your choice (include a brief interpretation)
    3. Explain which model performed better and why (comparison of results)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix,precision_score, mean_squared_error,r2_score
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import preprocessing
import statsmodels.api as sm

In [2]:
#Exercise 1
#first way to open tsv:
import csv

with open('../data/Restaurant_Reviews.tsv') as file:
    tsv_file =  csv.reader(file, delimiter="\t")
#     for line in tsv_file:
#         print(line)

In [3]:
#second way to open tsv:
review_data = pd.read_csv('../data/Restaurant_Reviews.tsv', sep='\t')
review_data.head(3)

#to make lower all word
# review_data.columns = [word.lower() for word in review_data.columns]

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0


In [4]:
review_data

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [5]:
review = review_data['Review']
liked = review_data['Liked']  

In [6]:
#Before fitting models on data, primary data will be divided to training and test data then:
#there will be 3 fitting models cited by ascending ordere of thier accuracy scores:

#===============
# 1) first model: 
#               a: "CountVectorizer" make vectore from primary data
#               b: fitted by "TfidfTransformer" through having frequency of words(vectors)
#               c: fitted by "logisticRegression" 
#===============
#2)second model:
#               a: "CountVectorizer" make vectore from primary data
#               b: fitted by "logisticRegression" 
#===============
#3)third model:
#               a: "CountVectorizer" make vectore from primary data
#               b: fitted by "TruncatedSVD" , 
#               c: fitted by "logisticRegression "

# First fitting

In [7]:
review = review_data['Review']
liked = review_data['Liked']  

vectorizer = CountVectorizer()

#devide to train and test data
xtrain, xtest, ytrain, ytest = train_test_split(
    review, liked, test_size=0.2, random_state=101)

xtrain = vectorizer.fit_transform(xtrain) 

xtest = vectorizer.transform(xtest)#transforme test data to fitted models


In [8]:
#trained data by LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(xtrain, ytrain)#transformed daat in previous part now trained by logistic regression

pred = log_reg.predict(xtest)#prediction by log_reg

In [9]:
print(classification_report(ytest, pred))

              precision    recall  f1-score   support

           0       0.82      0.75      0.79       102
           1       0.76      0.83      0.79        98

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.79       200
weighted avg       0.79      0.79      0.79       200



# Second fitting

In [10]:
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

In [11]:
review = review_data['Review']
liked = review_data['Liked'] 

xtrain, xtest, ytrain, ytest = train_test_split(
    review, liked, test_size=0.2, random_state=101)#devide to train and test data


In [12]:
steps = [('vectorizer', CountVectorizer()),
         ('svd', TruncatedSVD(500)), 
         ('pca', PCA(500)),
         ('log_reg2', LogisticRegression())]#dimension reduction is 500
model = Pipeline(steps=steps)

#evaluation of fitting
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=101)
n_score = cross_val_score(
           model,xtrain, ytrain, 
           cv=cv, scoring='accuracy', n_jobs=-1, error_score='raise')
print(n_score.mean(), np.std(n_score))

0.8025 0.04036293679437445


In [13]:
model.fit(xtrain, ytrain)

In [14]:
ypred = model.predict(xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.82      0.75      0.78       102
           1       0.76      0.83      0.79        98

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.78       200
weighted avg       0.79      0.79      0.78       200



# Third fitting¶

In [15]:
xtrain, xtest, ytrain, ytest = train_test_split(review, liked, test_size=0.2, random_state=101)

In [16]:
steps = [('vectorizer', CountVectorizer()),
         ('tf', TfidfTransformer()), 
         ('log_reg2', LogisticRegression())]#dimension reduction is 500
model3 = Pipeline(steps=steps)

In [17]:
model3.fit(xtrain, ytrain)

In [18]:
ypred3 = model3.predict(xtest)
print(classification_report(ytest, ypred3))

              precision    recall  f1-score   support

           0       0.84      0.79      0.81       102
           1       0.80      0.84      0.82        98

    accuracy                           0.81       200
   macro avg       0.82      0.82      0.81       200
weighted avg       0.82      0.81      0.81       200



     Regrading three fitting, the best one is the last which is a combination of vectorizing,
     TfidfTransforming and logisticregression. this combination could work efficiently 
     because of TfidfTransform, which could find the cosine similarity to predict more precisely.
     in the first model because ignoring the frequency of repetition of the word could not
     meet the accuracy to more than 80.
     the second one, it could improve the first model by dimensionality reduction which is still less 
     than the last fitting. this model also because removing some important information of 
     some text still could not be considered the best fitting. 
     RandomForest used but could not improved our score(70%), so I did not  mention it here.


## Exercise 2: App Review NLP work (Similar to Web Data workshop)

The Apple app store has a `GET` API to get reviews on apps. The URL is:

```
https://itunes.apple.com/{COUNTRY_CODE}/rss/customerreviews/id={APP_ID_HERE}/page={PAGE_NUMBER}/sortby=mostrecent/json
```

Note that you need to provide:

- The country codes (`'us'`, `'gb'`, `'ca'`, `'au'`) - use all four
- The app ID. This can be found in the web page for the app right after `id`.
    - You will need to find the IDs for these apps - Candy Crush, Facebook, Twitter & Tinder
- The "Page Number". The request responds with multiple pages of data, but sends them one at a time. So you can cycle through the data pages for any app on any country. (Be careful, there are limits to the number of pages you can access)

For example, Candy Crush's US webpage is `https://apps.apple.com/us/app/candy-crush-saga/id553834731`, which means that the ID is `553834731`.


Do the following:

1. Using the bag-of-words or TF-IDF vector model (from SKLearn), cluster the reviews into 5 clusters. Measure the accuracy of the cluster overlap against the real review scores. **(UNSUPERVISED LEARNING)**
1. Using any method you want (pre-trained models, dimensionality reduction, feature engineering, etc.) make the best **regression** model you can to predict the 5 star rating. Rate the accuracy in regression terms (mean squared error) and in classification terms (accuracy score, etc.)
1. Do the same as in 1.3, but use a classification model. Are classification models better or worse to predict a 5-point rating scale? Explain in a few paragraphs and justify with metrics.

ps. Feel free to do as much data engineering to boost your model. (ie binary vs multinomial)


In [19]:
    # exercise 2
import json
from urllib.request import urlopen
import requests
import regex
from bs4 import BeautifulSoup as bs
import re
from sklearn import metrics

In [20]:
#nltk.download('stopwords')

In [21]:
from sklearn.linear_model import LinearRegression
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import string
import nltk
from nltk.corpus import brown
from gensim.models import Word2Vec
from nltk import corpus

In this part:
1) collecting data for 4 countries(ca, us, au, gb), 

2) unsupervised modeling on data as reviews including KMeans clustering on whole data without spliting to train and test data

3) modeling without clustering which improves the accuracy score but still the R-squared is low shown and approved in crosstab

4) supervised modeling done by linearregression, 

In [22]:
#get all pages by while loop 
data_ca = pd.DataFrame()
page_number = 1
while True:
            reviews =requests.get(
                 f'https://itunes.apple.com/ca/rss/customerreviews/id=553834731/page={page_number}/sortby=mostrecent/json')
 
            if reviews.status_code==200:
                api = json.loads(reviews.content)
                api = pd.json_normalize(api['feed']['entry'])
                data_ca = pd.concat([api, data_ca], axis=0, ignore_index=True)
                page_number += 1
            else:
                 break
#data_ca.head(2)
        

In [23]:
data_us = pd.DataFrame()
page_number = 1
while True:
            reviews =requests.get(
                 f'https://itunes.apple.com/us/rss/customerreviews/id=553834731/page={page_number}/sortby=mostrecent/json')
 
            if reviews.status_code==200:
                api = json.loads(reviews.content)
                api = pd.json_normalize(api['feed']['entry'])
                data_us = pd.concat([api, data_us], axis=0, ignore_index=True)
                page_number += 1
            else:
                 break
#data_us.head(2)
 

In [24]:
data_au = pd.DataFrame()
page_number = 1
while True:
            reviews =requests.get(
                 f'https://itunes.apple.com/au/rss/customerreviews/id=553834731/page={page_number}/sortby=mostrecent/json')
 
            if reviews.status_code==200:
                api = json.loads(reviews.content)
                api = pd.json_normalize(api['feed']['entry'])
                data_au = pd.concat([api, data_au], axis=0, ignore_index=True)
                page_number += 1
            else:
                 break
#data_au.head(2)

In [25]:
data_gb = pd.DataFrame()
page_number = 1
while True:
            reviews =requests.get(
                 f'https://itunes.apple.com/gb/rss/customerreviews/id=553834731/page={page_number}/sortby=mostrecent/json')
 
            if reviews.status_code==200:
                api = json.loads(reviews.content)
                api = pd.json_normalize(api['feed']['entry'])
                data_gb = pd.concat([api, data_gb], axis=0, ignore_index=True)
                page_number += 1
            else:
                 break
#data_gb.head(2)

In [26]:
data = pd.concat([data_us, data_ca, data_au, data_gb], axis=0, ignore_index=True)#concatinating 4 countries querries

In [27]:
data.shape

(2000, 16)

In [28]:
reviews = data['content.label']#data 
rating = data['im:rating.label']#labels

In [29]:
#cleaning data:
def standardize_text(data):
    data = data.replace(r"http\S+", "")
    data = data.replace(r"http", "")
    data = data.replace(r"@\S+", "")
    data = data.replace(r"[^A-Za-z-(),!?@\'\`\"\_\n]", " ")
    data = data.replace(r"@", "at")
    #data = data.str.lower()
    return data

reviews = standardize_text(reviews)
reviews

0       I’ve been having fun playing this again but la...
1       I lost my progress and the games are boring to...
2                                  MAKE THE LEVELS EASIER
3       People spend too much money only to lose most ...
4       It would definitely be a 5 star review if the ...
                              ...                        
1995    Joined recently and am on around game 220 when...
1996                               Candy crush has no end
1997    Been playing them since 2014 went to play one ...
1998    This app keeps crashing half way through a gam...
1999    The further you get they make it impossible an...
Name: content.label, Length: 2000, dtype: object

In [30]:
stop_words = set(stopwords.words('english'))#to be excluded all unneccessary letters and signs
tokenizer = RegexpTokenizer(r'\w+')
reviews = reviews.apply(tokenizer.tokenize)
reviews

0       [I, ve, been, having, fun, playing, this, agai...
1       [I, lost, my, progress, and, the, games, are, ...
2                             [MAKE, THE, LEVELS, EASIER]
3       [People, spend, too, much, money, only, to, lo...
4       [It, would, definitely, be, a, 5, star, review...
                              ...                        
1995    [Joined, recently, and, am, on, around, game, ...
1996                         [Candy, crush, has, no, end]
1997    [Been, playing, them, since, 2014, went, to, p...
1998    [This, app, keeps, crashing, half, way, throug...
1999    [The, further, you, get, they, make, it, impos...
Name: content.label, Length: 2000, dtype: object

In [31]:
stop_words = list(stop_words)

In [32]:
#removing the extra signs and letters
filtered_reviews = [] 
for row in reviews:
    x = []
    for word in row:
        word = str(word)
        word = word.lower()
        if word not in stop_words:
            x.append(word)
    x = " ".join(x)
    filtered_reviews.append(x)
filtered_reviews =np.array(filtered_reviews)  
filtered_reviews

array(['fun playing lately app crashing loose life win streak prizes getting really annoying please fix',
       'lost progress games boring start beginning', 'make levels easier',
       ...,
       'playing since 2014 went play one day asked log facebook continue playing sent back begging',
       'app keeps crashing half way game keeps playing music even close app full bugs play anymore',
       'get make impossible guess yep basically forces purchase add ons'],
      dtype='<U1001')

In [33]:
tf = TfidfVectorizer(ngram_range=(1,2))
filtered_reviews_t = tf.fit_transform(filtered_reviews)
filtered_reviews_t.shape

(2000, 29155)

In [34]:
# svd = TruncatedSVD(2000)
# filtered_reviews_t = svd.fit_transform(filtered_reviews_t)

In [35]:
from scipy.sparse import csr_matrix
from sklearn.metrics import fowlkes_mallows_score
from sklearn.preprocessing import FunctionTransformer


In [65]:
kmean = KMeans(5)
kmean.fit(filtered_reviews_t)
predicted = kmean.predict(filtered_reviews_t)
labels = kmean.cluster_centers_
centers = np.unique(labels)
 #the result is sparse matrix so by todense we have to solve the problem in order to use test data in evaluation
# predicted = np.array(predicted)
# csr_matrix.todense(predicted)

clustering evaluation:
It has been tried to calculate this all scores but because of sparse matrix it could not be executed. I tried alot for todense but threw error!!????

In [37]:
# print("Homogeneity: %0.3f" % metrics.homogeneity_score(filtered_reviews_t, predicted))#how much could algorithm cluster the similar data in each samples 
# print("Completeness: %0.3f" % metrics.completeness_score(filtered_reviews_t, predicted))#how much similarity is there among samples in each cluster after clustering
# print("V-measure: %0.3f" % metrics.v_measure_score(filtered_reviews_t, predicted))#goodness of clustering
# print("Adjusted Rand Index: %0.3f"
#       % metrics.adjusted_rand_score(filtered_reviews_t, predicted))# computes a similarity measure between two clusterings by considering all pairs of samples 
# print("Adjusted Mutual Information: %0.3f"
#       % metrics.adjusted_mutual_info_score(filtered_reviews_t, predicted))#compute the similarity by comparing two clusters how many similar data have in common
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(filtered_reviews_t, predicted))#cohesion of clusters, some how they are separate[-1, 1]
# print("Accuracy: %0.3f"
#       % metrics.accuracy_score(filtered_reviews_t, predicted))#accuracy is too low, so our model does not work well
# print("fowlkes_mallows_score: %0.3f"
#       % fowlkes_mallows_score(filtered_reviews_t, predicted))#Measure the similarity of clusters 

Silhouette Coefficient: 0.006


In [38]:
xtrain, xtest, ytrain, ytest = train_test_split(filtered_reviews, rating, test_size=0.2, random_state=101)


In [39]:
xtrain.shape

(1600,)

In [40]:
# fitting model on whole data through Pipline without spliting 
steps = [('vectorizer', CountVectorizer()),
         ('tf', TfidfTransformer()),
         ('kmeans', KMeans(5)),
         ('log_reg', LogisticRegression())]
model1 = Pipeline(steps=steps)

cv = RepeatedStratifiedKFold(n_repeats=10, 
                             n_splits=10, 
                             random_state=101)
n_score = cross_val_score(
            model1, xtrain, ytrain,
            cv=cv, 
            scoring='accuracy', 
            n_jobs=-1, 
            error_score='raise')

print(n_score.mean(), np.std(n_score))


0.46506249999999993 0.019706736633699654


In [41]:
model1.fit(xtrain, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
ypred1 = model1.predict(xtest)
ypred1 = np.array(ypred1)
print('Mean Square Error:',mean_squared_error(ytest, ypred1),'\n', 
      'accuracy:', model1.score(ytest, ypred1),'\n', 
       'R_squared', r2_score(ytest, ypred1))
      
print('classification_report:',classification_report(ytest, ypred1, zero_division=0))#because of some zero value in division in f_score, the term of zero_vevision ia added

Mean Square Error: 4.71 
 accuracy: 0.11 
 R_squared -0.6672713917822282
classification_report:               precision    recall  f1-score   support

           1       0.44      0.97      0.60       160
           2       0.00      0.00      0.00        40
           3       0.00      0.00      0.00        51
           4       0.00      0.00      0.00        40
           5       0.73      0.29      0.42       109

    accuracy                           0.47       400
   macro avg       0.23      0.25      0.20       400
weighted avg       0.37      0.47      0.36       400



In [43]:
# as it is shown on report, the fitting is not good. even some labeles like 2, 3, 4 have been ignored by model completely
# even the r_squered is negative addressed to some howthe fitting is bad.
#for fix the problem, RandomForestClassifier used however for getting the better result , optimization and some kernel used. 
#without GridSearchCV, RandomizedSearchCV, the model would not be a good fitting model.

In [48]:
xtrain, xtest, ytrain, ytest = train_test_split(filtered_reviews, rating, test_size=0.2, random_state=101)


In [49]:
#vectorizing data
tf = TfidfVectorizer(ngram_range=(1,2))
xtrain = tf.fit_transform(xtrain).toarray()
xtest = tf.transform(xtest).toarray()

In [50]:
xtrain.shape

(1600, 23805)

In [51]:
rf = RandomForestClassifier()
rf.fit(xtrain, ytrain)


In [52]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Create a search grid of parameters that will be shuffled through

param_grid = {

'bootstrap': [True],#without bootstrap, the number of decision trees being built in the forest
#'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],#he number of decision trees being built in the forest

'max_features': ['auto', 'sqrt'],

#'min_samples_leaf': [1, 2, 4],#the minimum number of data point requirements in a node of the decision tree

#'min_samples_split': [2, 5, 10],#the minimum number of samples required to split an internal node.

'n_estimators': [10, 20]#the number of decision trees being built in the forest

}

# Using the random grid and searching for best hyperparameters

rf = RandomForestRegressor() #creating base model

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = -1)

rf_random.fit(xtrain, ytrain) 



Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [55]:
#this arrange for optimization of upper model has been used from "https://www.upgrad.com/blog":
param_grid = {

    'bootstrap': [True],

#     'max_depth': [80, 90, 100, 110],

#     'max_features': [2, 3],

#     'min_samples_leaf': [3, 4, 5],

    'min_samples_split': [8, 10, 12],

    'n_estimators': [10, 20]#, 300, 1000]}

# Create a based model

rf = RandomForestRegressor()

grid_earch = GridSearchCV(estimator = rf, param_grid = param_grid,cv = 3, n_jobs = -1, verbose = 2)

# Fit the grid search to the data

grid_search.fit(xtrain, ytrain)

grid_search.best_params_

{'bootstrap': True,

 'max_depth': 80,

 'max_features': 3,

 'min_samples_leaf': 5,

 'min_samples_split': 12, 'n_estimators': 100}

best_grid = grid_search.best_estimator_
best_grid

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [60]:
ypred5 = grid_search.predict(xtest).round()

In [61]:
print('Mean Square Error:', mean_squared_error(ytest, ypred5),'\n', 
      'R_Squared:', r2_score(ytest, ypred5))
print('classification_report:',classification_report(ytest, ypred5, zero_division=0))#because of some zero value in division in f_score, the term of zero_vevision ia added

Mean Square Error: 2.335 
 R_Squared: 0.1734440127788741
classification_report:               precision    recall  f1-score   support

           1       0.63      0.15      0.24       160
           2       0.14      0.47      0.21        40
           3       0.17      0.31      0.22        51
           4       0.13      0.23      0.17        40
           5       0.69      0.40      0.51       109

    accuracy                           0.28       400
   macro avg       0.35      0.31      0.27       400
weighted avg       0.49      0.28      0.30       400



In [62]:
pd.crosstab(ytest, ypred5)

col_0,1.0,2.0,3.0,4.0,5.0
im:rating.label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,24,73,40,17,6
2,5,19,8,6,2
3,5,22,16,5,3
4,2,14,6,9,9
5,2,11,22,30,44


In [66]:
#as the report denoting all labels have been predicted and there is no zero. based on crosstab the main diameter
#almost has the maximum count(considering 28% accuracy) which shows the some how the model could reconize the labels.