In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
# Read the tab-separated restaurant reviews into a DataFrame
# (pd.read_table uses a tab separator by default)
file_path = 'restaurant_reviews.tsv'
df = pd.read_table(file_path)

# Preview the top five records
df.head(n=5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [11]:
# Show which columns the loaded DataFrame exposes
column_names = df.columns
print(column_names)

Index(['Review', 'Liked'], dtype='object')


In [13]:
# Count how many reviews carry each value of the 'Liked' label
liked_counts = df['Liked'].value_counts()
print(liked_counts)

Liked
1    500
0    500
Name: count, dtype: int64


In [69]:
# Report the number of missing values per column.
# A bare expression is only displayed when it is the LAST statement of a
# cell, so the counts are printed explicitly here; otherwise they would be
# computed and silently thrown away.
print(df.isnull().sum())

# Drop any rows with missing values.
# Reassignment is used instead of inplace=True; `df` ends up holding the same
# cleaned rows, and the cell stays safe to re-run.
df = df.dropna()

In [21]:
# Model input is the 'Review' column; the target is the 'Liked' column
X, y = df['Review'], df['Liked']

In [23]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# TF-IDF vectorizer: English stop words removed, vocabulary capped at 5000 terms
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# Learn the vocabulary/IDF weights from the training reviews and vectorize them
X_train_tfidf = tfidf.fit_transform(X_train)

# Vectorize the test reviews with the already-fitted vectorizer (no refit)
X_test_tfidf = tfidf.transform(X_test)

In [27]:
# Baseline classifier: Logistic Regression with a fixed seed,
# trained on the TF-IDF features of the training split
model = LogisticRegression(random_state=42)
model.fit(X=X_train_tfidf, y=y_train)

In [29]:
# Predict labels for the vectorized test reviews with the fitted model
y_pred = model.predict(X=X_test_tfidf)

In [31]:
# Overall accuracy on the held-out test set, shown with four decimals
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {test_accuracy:.4f}')

# Confusion matrix (rows: true labels, columns: predicted labels)
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')

# Per-class precision / recall / F1 summary
print('Classification Report:', classification_report(y_test, y_pred), sep='\n')

Accuracy: 0.7700
Confusion Matrix:
[[80 16]
 [30 74]]
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.83      0.78        96
           1       0.82      0.71      0.76       104

    accuracy                           0.77       200
   macro avg       0.77      0.77      0.77       200
weighted avg       0.78      0.77      0.77       200



In [35]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search over the inverse regularisation strength C of
# Logistic Regression, using 5-fold cross-validation on the training split.
#
# NOTE(review): X_train_tfidf comes from a TfidfVectorizer that was fitted on
# the whole training split, so every validation fold has already contributed
# to the vocabulary and IDF weights. The cross-validated scores below are
# therefore likely somewhat optimistic; wrapping the vectorizer and the
# classifier in a scikit-learn Pipeline would remove that leakage.
param_grid = {'C': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(
    LogisticRegression(random_state=42),  # same seed as the baseline model
    param_grid,
    cv=5,
    scoring='accuracy',  # explicit; same metric as reported on the test set
)
grid_search.fit(X_train_tfidf, y_train)

# Report the selected setting together with the mean cross-validated accuracy
# that drove the selection
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validated accuracy: {grid_search.best_score_:.4f}')

# Evaluate the refitted best estimator on the held-out test set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_tfidf)
print(f'Optimized Logistic Regression Accuracy: {accuracy_score(y_test, y_pred_best):.4f}')


Best parameters: {'C': 10}
Optimized Logistic Regression Accuracy: 0.7900


In [53]:
from sklearn.naive_bayes import MultinomialNB

# Fit a Multinomial Naive Bayes classifier on the TF-IDF training features
# (fit() returns the estimator itself, so construction and fitting are chained)
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Score the classifier on the held-out test features
y_pred_nb = nb_model.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print(f'Naive Bayes Accuracy: {nb_accuracy:.4f}')

Naive Bayes Accuracy: 0.7950


In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

# Standalone rerun of the Naive Bayes experiment, starting from the raw file.
# It rebinds df, X, y, the train/test splits, tfidf, the TF-IDF matrices,
# nb_model and y_pred_nb.
#
# NOTE(review): the saved accuracy of this cell differs from the saved output
# of the earlier Naive Bayes cell even though the steps look the same, and the
# execution counts are out of order. One of the two outputs was presumably
# produced from different kernel state; confirm with Restart & Run All.

# Read the tab-separated dataset and discard rows containing missing values
file_path = 'restaurant_reviews.tsv'  # Replace with your actual file path
df = pd.read_csv(file_path, sep='\t').dropna()

# Input text and target label
X, y = df['Review'], df['Liked']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: fitted on the training text only, then applied to the test text
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Fit Multinomial Naive Bayes on the TF-IDF training features
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Accuracy on the held-out test set
y_pred_nb = nb_model.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print(f'Naive Bayes Accuracy: {nb_accuracy:.4f}')

Naive Bayes Accuracy: 0.7850
