<a href="https://colab.research.google.com/github/yudumpacin/NLPStudyNotes/blob/main/TextClassification_Nltk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Restaurant Review Classification

# 1) Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Read the tab-separated restaurant reviews into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="Restaurant_Reviews.tsv",
    delimiter="\t",
)

In [4]:
# Peek at the first five rows to check the columns and label encoding.
data.head(n=5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
# Class balance: number of reviews per value of the "Liked" label.
data.loc[:, "Liked"].value_counts()

1    500
0    500
Name: Liked, dtype: int64

# 2) Data Cleaning

In [77]:
import nltk
from nltk import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re

In [None]:
# Fetch NLTK's stop-word corpus; stopwords.words("english") below depends on it.
nltk.download("stopwords")

In [83]:
# Fetch the WordNet corpus, which WordNetLemmatizer needs for its lookups.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [61]:
# English stop-word list from the NLTK "stopwords" corpus.
stop_words = list(stopwords.words("english"))

In [71]:
# Negation words that are excluded from the stop-word list so they stay in the text.
neg = [
    "no",
    "not",
    "doesn't",
    "don't",
]

In [73]:
# Stop words actually removed during preprocessing: the NLTK list minus the negations.
stop_words_pre = list(filter(lambda word: word not in neg, stop_words))

In [35]:
# Snowball stemmer for English, shared by preprocess_with_stemming.
stemmer = SnowballStemmer(language="english")

In [91]:
def preprocess_with_stemming(review):
    """Normalise a review string and reduce each kept word to its stem.

    Steps, in order:
      1. lower-case the text;
      2. delete every character that is not a-z, 0-9 or a space;
      3. split on whitespace and drop words found in ``stop_words_pre``;
      4. stem the remaining words with the module-level ``stemmer``.

    Note: punctuation (including apostrophes) is removed *before* the
    stop-word lookup, so a contraction such as "hadn't" is compared as
    "hadnt" at that point.

    Args:
        review: The raw review text (a ``str``).

    Returns:
        The processed words joined by single spaces.
    """
    lowered = review.lower()
    stripped = re.sub("[^a-z0-9 ]", "", lowered)
    stems = []
    for token in stripped.split():
        if token in stop_words_pre:
            continue
        stems.append(stemmer.stem(token))
    return " ".join(stems)

In [78]:
# Lemmatizer instance reused for every review by preprocess_with_lemmatize.
lemmatizer = WordNetLemmatizer()

In [94]:
def preprocess_with_lemmatize(review):
    """Normalise a review string and lemmatize each kept word.

    Steps, in order:
      1. lower-case the text;
      2. delete every character that is not a-z, 0-9 or a space;
      3. split on whitespace and drop words found in ``stop_words_pre``;
      4. pass the remaining words through the module-level ``lemmatizer``
         (``lemmatize`` is called with its default arguments).

    Note: punctuation (including apostrophes) is removed *before* the
    stop-word lookup, so a contraction such as "hadn't" is compared as
    "hadnt" at that point.

    Args:
        review: The raw review text (a ``str``).

    Returns:
        The processed words joined by single spaces.
    """
    lowered = review.lower()
    stripped = re.sub("[^a-z0-9 ]", "", lowered)
    lemmas = []
    for token in stripped.split():
        if token in stop_words_pre:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)


In [92]:
# Preview the stemmed version of every review (result is displayed, not stored).
data["Review"].map(preprocess_with_stemming)

0                                         wow love place
1                                         crust not good
2                                 not tasti textur nasti
3      stop late may bank holiday rick steve recommen...
4                                select menu great price
                             ...                        
995                        think food flavor textur lack
996                                 appetit instant gone
997                 overal not impress would not go back
998    whole experi underwhelm think well go ninja su...
999    hadnt wast enough life pour salt wound draw ti...
Name: Review, Length: 1000, dtype: object

In [95]:
# Preview the lemmatized version of every review (result is displayed, not stored).
data["Review"].map(preprocess_with_lemmatize)

0                                        wow loved place
1                                         crust not good
2                                not tasty texture nasty
3      stopped late may bank holiday rick steve recom...
4                             selection menu great price
                             ...                        
995                    think food flavor texture lacking
996                              appetite instantly gone
997              overall not impressed would not go back
998    whole experience underwhelming think well go n...
999    hadnt wasted enough life poured salt wound dra...
Name: Review, Length: 1000, dtype: object

In [96]:
# Store the lemmatized text in a new column; the raw "Review" column is left as is.
data["Review_Cleaned"] = data["Review"].map(preprocess_with_lemmatize)

# 3) Bag of Words

In [97]:
from sklearn.feature_extraction.text import CountVectorizer

In [115]:
# Toy corpus used to illustrate the bag-of-words representation
documents = [
    "The quick brown fox jumped over the lazy dog",
    "The dog slept under the tree",
    "Foxes are wild animals",
]

# Learn the vocabulary and encode each document as token counts in one step
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(documents)

# Dense view of the sparse count matrix (one row per document)
bow_array = bow_matrix.toarray()

# Vocabulary terms corresponding to the columns of the count matrix
feature_names = vectorizer.get_feature_names_out()

# Show the sparse matrix, its dense form, and the vocabulary, in that order
for shown in (bow_matrix, bow_array, feature_names):
    print(shown)

  (0, 11)	2
  (0, 9)	1
  (0, 2)	1
  (0, 4)	1
  (0, 6)	1
  (0, 8)	1
  (0, 7)	1
  (0, 3)	1
  (1, 11)	2
  (1, 3)	1
  (1, 10)	1
  (1, 13)	1
  (1, 12)	1
  (2, 5)	1
  (2, 1)	1
  (2, 14)	1
  (2, 0)	1
[[0 0 1 1 1 0 1 1 1 1 0 2 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 1 2 1 1 0]
 [1 1 0 0 0 1 0 0 0 0 0 0 0 0 1]]
['animals' 'are' 'brown' 'dog' 'fox' 'foxes' 'jumped' 'lazy' 'over'
 'quick' 'slept' 'the' 'tree' 'under' 'wild']


In [98]:
# Features are the cleaned review text; the target is the "Liked" label.
X, y = data["Review_Cleaned"], data["Liked"]

In [99]:
from sklearn.model_selection import train_test_split

In [100]:
# Hold out 30% of the reviews for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [110]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import FunctionTransformer

In [125]:
def create_and_fit_pipeline(classifier, X_train, y_train):
    """Wrap ``classifier`` in a bag-of-words pipeline and fit it.

    The pipeline has three steps:
      * ``"bag"``: ``CountVectorizer`` turns raw text into token counts;
      * ``"to_dense"``: converts the sparse count matrix to a dense array
        (presumably for estimators that do not accept sparse input, such as
        ``GaussianNB`` -- confirm if the step is ever removed);
      * ``"classifier"``: the estimator passed in.

    Args:
        classifier: An unfitted scikit-learn style estimator.
        X_train: Training texts.
        y_train: Training labels.

    Returns:
        The fitted ``Pipeline``.
    """
    steps = [
        ("bag", CountVectorizer()),
        (
            "to_dense",
            FunctionTransformer(lambda counts: counts.toarray(), accept_sparse=True),
        ),
        ("classifier", classifier),
    ]
    fitted = Pipeline(steps)
    fitted.fit(X_train, y_train)
    return fitted

# 4) Model Application

In [119]:
from sklearn.metrics import f1_score,confusion_matrix,classification_report

In [132]:
def evaluate_pipeline(pipeline, X_test, y_test):
    """Score a fitted pipeline on held-out data.

    Args:
        pipeline: A fitted object exposing ``predict``.
        X_test: Test inputs passed to ``pipeline.predict``.
        y_test: True labels for ``X_test``.

    Returns:
        A ``(f1, report)`` tuple: the value of ``f1_score`` and the text
        produced by ``classification_report`` for the predictions.
    """
    y_pred = pipeline.predict(X_test)
    return f1_score(y_test, y_pred), classification_report(y_test, y_pred)

In [122]:
from sklearn.ensemble import RandomForestClassifier

In [136]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Candidate models; each one is trained on the same bag-of-words features.
# RandomForestClassifier is randomized (bootstrap samples / feature subsets),
# so it gets a fixed seed -- the same value used for the train/test split --
# to make the reported scores repeatable from run to run.
classifiers = [
    RandomForestClassifier(random_state=42),
    MultinomialNB(),
    GaussianNB(),
    SVC(),
    LogisticRegression()
]

# Fit every model inside its own pipeline and print its test-set metrics.
for clf in classifiers:
    pipeline = create_and_fit_pipeline(clf, X_train, y_train)
    score, report = evaluate_pipeline(pipeline, X_test, y_test)
    print(f"Model: {clf.__class__.__name__}")
    print(f"F1: {score}")
    print(f"Classification Report:\n{report}\n")


Model: RandomForestClassifier
F1: 0.7857142857142858
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.86      0.81       152
           1       0.83      0.74      0.79       148

    accuracy                           0.80       300
   macro avg       0.80      0.80      0.80       300
weighted avg       0.80      0.80      0.80       300


Model: MultinomialNB
F1: 0.8013468013468014
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.80      0.81       152
           1       0.80      0.80      0.80       148

    accuracy                           0.80       300
   macro avg       0.80      0.80      0.80       300
weighted avg       0.80      0.80      0.80       300


Model: GaussianNB
F1: 0.7331378299120234
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.55      0.65       152
           1       0.65     