In [1]:
import pandas as pd

In [2]:
# Load the tab-separated restaurant reviews into a DataFrame.
df = pd.read_csv(
    '../DSML25/Restaurant_Reviews.tsv',
    delimiter='\t',  # alias of `sep`; the file is tab-delimited
)

In [3]:
# Preview the first five rows to confirm the columns parsed as expected.
df.head(n=5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
# Pull out the free-text review column as a Series.
reviews = df.loc[:, 'Review']

In [5]:
# Take the first review (by position) as a worked example and display it.
r1 = reviews.iat[0]
r1

'Wow... Loved this place.'

In [6]:
import nltk
# pip install nltk

In [7]:
# English stopword list used to filter tokens during preprocessing.
# The corpus is not bundled with the nltk package, so on a fresh environment
# the lookup raises LookupError; fetch it once and retry so the notebook
# survives Restart & Run All.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

In [8]:
# Porter stemmer instance shared by the preprocessing code below.
stemmer = nltk.stem.PorterStemmer()

# Quick demonstration of stemming on a single word.
stemmer.stem(word='loving')

'love'

In [9]:
# Preprocessing walkthrough on a single review
import re  # regular expressions - pattern matching

# Step 1: take the first review and lower-case it
r1 = reviews.iloc[0].lower()

# Step 2: replace every character that is not a lowercase letter with a space
pattern = '[^a-z]'
r1 = re.sub(pattern, ' ', r1)

# Step 3: tokenize on whitespace and drop stopwords
words = list(filter(lambda token: token not in stopwords, r1.split()))

# Step 4: reduce each remaining token to its stem
words = list(map(stemmer.stem, words))

# Show the cleaned review as a single space-joined string
' '.join(words)

'wow love place'

In [10]:
def preprocess(r1):
    """Normalize one review into a space-joined string of stemmed tokens.

    The text is lower-cased, every character outside ``a-z`` is replaced by
    a space, the result is split on whitespace, tokens present in the
    module-level ``stopwords`` collection are dropped, and the remaining
    tokens are stemmed with the module-level ``stemmer``.

    Parameters
    ----------
    r1 : str
        Raw review text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; an empty
        string when no token survives.
    """
    # Imported locally so the function does not rely on the exploratory
    # cell that happens to import `re` having been executed first.
    import re

    # Set membership is constant-time; the original scanned the stopword
    # list once per token.
    stop_set = set(stopwords)

    text = re.sub('[^a-z]', ' ', r1.lower())
    kept = (token for token in text.split() if token not in stop_set)
    return ' '.join(stemmer.stem(token) for token in kept)

In [11]:
# Sanity check: the function applied to the first review's text should give
# the same cleaned string as the step-by-step walkthrough cell.
preprocess('Wow... Loved this place.')

'wow love place'

In [12]:
# Clean every review element-wise with the preprocessing function.
processed_reviews = reviews.map(preprocess)

In [13]:
# TF-IDF features (a weighted bag-of-words representation)
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
# Learn the vocabulary (unique words) and IDF weights from the cleaned reviews
vectorizer.fit(raw_documents=processed_reviews)

In [17]:
# Number of distinct terms in the fitted vocabulary.
vectorizer.get_feature_names_out().shape[0]

1565

In [18]:
# Encode the cleaned reviews with the fitted vectorizer and densify the
# sparse result into a regular array.
tfidf_table = (
    vectorizer
    .transform(processed_reviews)
    .toarray()
)

In [19]:
# Dimensions of the dense TF-IDF matrix: one row per transformed review,
# one column per vectorizer feature.
tfidf_table.shape

(1000, 1565)

In [20]:
# Build the ML model: logistic regression on the TF-IDF features,
# with the 'Liked' column as the target.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X=tfidf_table, y=df['Liked'])

In [21]:
# Mean accuracy on the same rows the model was fitted on (training accuracy);
# it is not an estimate of performance on unseen reviews.
model.score(tfidf_table,df['Liked'])

0.942

In [22]:
# Test Data: hand-written reviews used to spot-check the fitted model.
# They live under their own names so the `reviews` Series loaded from the
# dataset is not overwritten and earlier cells remain re-runnable.
test_reviews = [
    'food was good.',
    'excellent service',
    'good food nice test',
    'The food is really, really good, and the service is super nice.',
    'So pathetic service and food quality',
    'Family friendly hotel with good parking',
    'Beautiful food , lovely service and surrounding.',
    'I had a delightful brunch at The Cozy Cafe. The eggs benedict were perfection, and the atmosphere is charming',
    'The ambiance is superb',
]
test_reviews_processed = [preprocess(r) for r in test_reviews]

# Only transform with the already-fitted vectorizer (no re-fit) so the
# feature columns match those the model was trained on.
X_test = vectorizer.transform(test_reviews_processed).toarray()
model.predict(X_test)

array([1, 1, 1, 1, 0, 1, 1, 1, 1], dtype=int64)