In [1]:
import re, nltk #importing regular expressions & natural language toolkit
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE  # imblearn library can be installed using pip install imblearn
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
import joblib

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
# Importing dataset and examining it
df = pd.read_csv("/content/drive/MyDrive/McD_Labelled.csv",encoding='ISO-8859-1')
pd.set_option('display.max_columns', None) # to make sure you can see all the columns in output window
print(dataset.head())
print(dataset.shape)
print(dataset.info())
print(dataset.describe())

                                              Review     Label
0                         The bun was dry and stale.  Negative
1    The restaurant had a great outdoor seating area  Positive
2       The order was incorrect and had to be remade  Negative
3  The burger was a bit too greasy and left a fil...  Negative
4  The special sauce was a bit too tangy, it need...  Negative
(1000, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Label   1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB
None
                            Review     Label
count                         1000      1000
unique                        1000         2
top     The bun was dry and stale.  Negative
freq                             1       500


In [6]:
# Converting structured categorical features to numerical features
df['Label'] = df['Label'].map({'Positive':1, 'Negative':0})

In [7]:
#Cleaning the 1000 reviews obtained from chatGPT
def cleaner(review): # Cleaning reviews
    soup = BeautifulSoup(review, 'lxml') # removing HTML entities such as ‘&amp’,’&quot’,'&gt'; lxml is the html parser and shoulp be installed using 'pip install lxml'
    souped = soup.get_text()
    re1 = re.sub(r"(@|http://|https://|www|\\x)\S*", " ", souped) # substituting @mentions, urls, etc with whitespace
    re2 = re.sub("[^A-Za-z]+"," ", re1) # substituting any non-alphabetic character that repeats one or more times with whitespace

    tokens = nltk.word_tokenize(re2)
    lower_case = [t.lower() for t in tokens]

    stop_words = set(stopwords.words('english'))
    filtered_result = list(filter(lambda l: l not in stop_words, lower_case))

    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = [wordnet_lemmatizer.lemmatize(t) for t in filtered_result]
    return lemmas

In [16]:
# Converting unstructured 'Review' column to a TF-IDF matrix
df['cleaned_review'] = df.Review.apply(cleaner)#Cleaning function
df = df[df['cleaned_review'].map(len) > 0] # removing rows with cleaned reviews of length 0
print("Printing top 5 rows of dataframe showing original and cleaned reviews....")
print(df[['Review','cleaned_review']].head())

df['cleaned_review'] = [" ".join(row) for row in df['cleaned_review'].values] # joining tokens to create strings. TfidfVectorizer does not accept tokens as input
data = df['cleaned_review']
Y = df['Label'] # target column

tfidf = TfidfVectorizer(min_df=.008, ngram_range=(1,3)) # min_df=.010 means that each ngram (unigram, bigram and trigram) must be present in at least 10 documents for it to be considered as a token (1000*.010=10).
tfidf.fit(data) # learn vocabulary of entire data
data_tfidf = tfidf.transform(data) # creating tfidf values
pd.DataFrame(pd.Series(tfidf.get_feature_names_out())).to_csv('vocabulary_McDreviews.csv', header=False, index=False)
print("Shape of tfidf matrix: ", data_tfidf.shape)

Printing top 5 rows of dataframe showing original and cleaned reviews....
                                              Review  \
0                         The bun was dry and stale.   
1    The restaurant had a great outdoor seating area   
2       The order was incorrect and had to be remade   
3  The burger was a bit too greasy and left a fil...   
4  The special sauce was a bit too tangy, it need...   

                                cleaned_review  
0                            [bun, dry, stale]  
1  [restaurant, great, outdoor, seating, area]  
2                   [order, incorrect, remade]  
3     [burger, bit, greasy, left, film, mouth]  
4  [special, sauce, bit, tangy, needed, sweet]  
Shape of tfidf matrix:  (1000, 205)


In [17]:
# Implementing Support Vector Classifier
model1 = LinearSVC() # kernel = 'linear' and C = 1

# Running cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1) # 5-fold cross-validation
scores=[]
iteration = 0
smote = SMOTE(random_state = 101)
for train_index, test_index in kf.split(data_tfidf, Y):
    iteration += 1
    print("Iteration ", iteration)
    X_train, Y_train = data_tfidf[train_index], Y[train_index]
    X_test, Y_test = data_tfidf[test_index], Y[test_index]
    X_train,Y_train = smote.fit_resample(X_train,Y_train) # Balancing training data
    model1.fit(X_train, Y_train) # Fitting SVC
    Y_pred = model1.predict(X_test)
    score = metrics.precision_score(Y_test, Y_pred) # Calculating precision
    print("Cross-validation precison: ", score)
    scores.append(score) # appending cross-validation precision for each iteration
mean_precision = np.mean(scores)
print("SVC Mean cross-validation precision: ", mean_precision)

Iteration  1
Cross-validation precison:  0.9489795918367347
Iteration  2
Cross-validation precison:  0.9263157894736842
Iteration  3
Cross-validation precison:  0.95
Iteration  4
Cross-validation precison:  0.9489795918367347
Iteration  5
Cross-validation precison:  0.912621359223301
SVC Mean cross-validation precision:  0.937379266474091


In [18]:
# Implementing Naive Bayes Classifier
model2 = MultinomialNB()

# Running cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1) # 5-fold cross-validation
scores=[]
iteration = 0
smote = SMOTE(random_state = 101)
for train_index, test_index in kf.split(data_tfidf, Y):
    iteration += 1
    print("Iteration ", iteration)
    X_train, Y_train = data_tfidf[train_index], Y[train_index]
    X_test, Y_test = data_tfidf[test_index], Y[test_index]
    X_train,Y_train = smote.fit_resample(X_train,Y_train) # Balancing training data
    model2.fit(X_train, Y_train) # Fitting NBC
    Y_pred = model2.predict(X_test)
    score = metrics.precision_score(Y_test, Y_pred) # Calculating precision
    print("Cross-validation precison: ", score)
    scores.append(score) # appending cross-validation precision for each iteration
mean_precision = np.mean(scores)
print("NBC Mean cross-validation precision: ", mean_precision)

Iteration  1
Cross-validation precison:  0.8962264150943396
Iteration  2
Cross-validation precison:  0.8952380952380953
Iteration  3
Cross-validation precison:  0.93
Iteration  4
Cross-validation precison:  0.9230769230769231
Iteration  5
Cross-validation precison:  0.9223300970873787
NBC Mean cross-validation precision:  0.9133743060993474


In [19]:
data_tfidf,Y = smote.fit_resample(data_tfidf,Y)
clf = LinearSVC().fit(data_tfidf, Y)
joblib.dump(clf, 'McDreviews_model.sav')

['McDreviews_model.sav']

In [20]:
pip install umap-learn

Collecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82816 sha256=dba0f5a0c3695f1c7b7707814304671a5705b527b3a20dea80f7717e30dda8bd
  Stored in directory: /root/.cache/pip/wheels/a0/e8/c6/a37ea663620bd5200ea1ba0907ab3c217042c1d035ef606acc
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Created wheel for py

In [21]:
import umap
import plotly.graph_objs as go
import plotly.figure_factory as ff

# Implementing UMAP to visualize dataset
u = umap.UMAP(n_components=2, n_neighbors=10, min_dist=0.5)
x_umap = u.fit_transform(data_tfidf)

Reviews = list(df['Review'])
Sentiment = list(Y)

data_ = [go.Scatter(x=x_umap[:,0], y=x_umap[:,1], mode='markers',
                    marker = dict(color=Y, colorscale='Rainbow', opacity=0.5),
                                text=[f'Review: {a}<br>Sentiment: {b}' for a,b in list(zip(Reviews, Sentiment))],
                                hoverinfo='text')]

layout = go.Layout(title = 'UMAP Dimensionality Reduction', width = 1400, height = 1400,
                    xaxis = dict(title='First Dimension'),
                    yaxis = dict(title='Second Dimension'))
fig = go.Figure(data=data_, layout=layout)
fig.show()

In [22]:
############### Deployment of the review prediction model ##################################

import re, nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
model = joblib.load('McDreviews_model.sav')
vocabulary = pd.read_csv('vocabulary_McDreviews.csv', header=None)

#creating a dictionary of the tested vocabulary variables generated previously as we can only use that to make predictions
vocabulary_dict = {}
for i, word in enumerate(vocabulary[0]):
      vocabulary_dict[word] = i
print(vocabulary_dict)
tfidf = TfidfVectorizer(vocabulary = vocabulary_dict,lowercase=False)

# Reading new data as dataframe
df = pd.read_csv("/content/drive/MyDrive/McD_Unlabelled.csv")
pd.set_option('display.max_colwidth', None) # Setting this so we can see the full content of cells
pd.set_option('display.max_columns', None) # to make sure we can see all the columns in output window

{'accommodating': 0, 'added': 1, 'added nice': 2, 'addition': 3, 'aftertaste': 4, 'amount': 5, 'area': 6, 'atmosphere': 7, 'attentive': 8, 'balance': 9, 'beef': 10, 'beef patty': 11, 'beef patty bit': 12, 'beef patty perfect': 13, 'big': 14, 'big mac': 15, 'big mac bit': 16, 'bit': 17, 'bit bland': 18, 'bit greasy': 19, 'bit messy': 20, 'bit mild': 21, 'bit mild needed': 22, 'bit plain': 23, 'bit plain needed': 24, 'bit salty': 25, 'bit small': 26, 'bit spicy': 27, 'bit sweet': 28, 'bit tangy': 29, 'bland': 30, 'breakfast': 31, 'bun': 32, 'burger': 33, 'burger bit': 34, 'burger bit greasy': 35, 'burger bit messy': 36, 'burger cooked': 37, 'burger perfect': 38, 'burger perfect option': 39, 'burger perfect size': 40, 'cheese': 41, 'chicken': 42, 'chicken nugget': 43, 'chicken sandwich': 44, 'cold': 45, 'complement': 46, 'cooked': 47, 'cooked perfection': 48, 'cream': 49, 'crispy': 50, 'crunch': 51, 'customer': 52, 'day': 53, 'delicious': 54, 'drink': 55, 'drive': 56, 'drive thru': 57, 'd

In [27]:
# Cleaning unlabelled reviews
def cleaner(review):
    soup = BeautifulSoup(review, 'lxml') # removing HTML entities such as ‘&amp’,’&quot’,'&gt'; lxml is the html parser and shoulp be installed using 'pip install lxml'
    souped = soup.get_text()
    re1 = re.sub(r"(@|http://|https://|www|\\x)\S*", " ", souped) # substituting @mentions, urls, etc with whitespace
    re2 = re.sub("[^A-Za-z]+"," ", re1) # substituting any non-alphabetic character that repeats one or more times with whitespace

    """
    For more info on regular expressions visit -
    https://docs.python.org/3/howto/regex.html
    """

    tokens = nltk.word_tokenize(re2)
    lower_case = [t.lower() for t in tokens]

    stop_words = set(stopwords.words('english'))
    filtered_result = list(filter(lambda l: l not in stop_words, lower_case))

    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = [wordnet_lemmatizer.lemmatize(t) for t in filtered_result]
    return lemmas

In [29]:
df['cleaned_review'] = df.Review.apply(cleaner)
df = df[df['cleaned_review'].map(len) > 0] # removing rows with cleaned tweets of length 0
print("Printing top 5 rows of dataframe showing original and cleaned reviews....")
print(df[['Review','cleaned_review']].head())
df['cleaned_review'] = [" ".join(row) for row in df['cleaned_review'].values] # joining tokens to create strings. TfidfVectorizer does not accept tokens as input
data = df['cleaned_review']
tfidf.fit(data)
data_tfidf = tfidf.transform(data)
y_pred = model.predict(data_tfidf)

#### Saving predicted McD reviews to csv
df['predicted_rating'] = y_pred.reshape(-1,1)
df.to_csv('predicted_rating.csv', index=False)

Printing top 5 rows of dataframe showing original and cleaned reviews....
                                                   Review  \
0              The burger patty was cooked to perfection.   
1      The restaurant had a cozy and inviting atmosphere.   
2                      The salad was fresh and delicious.   
3                               The fries were too salty.   
4  The staff was quick to resolve an issue with my order.   

                             cleaned_review  
0       [burger, patty, cooked, perfection]  
1  [restaurant, cozy, inviting, atmosphere]  
2                 [salad, fresh, delicious]  
3                              [fry, salty]  
4     [staff, quick, resolve, issue, order]  
