In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
import pickle

In [2]:
# Load the dataset
path = r"C:\Users\mukhi\tripadvisor_hotel_reviews.csv"
df = pd.read_csv(path)

# Display the first few rows to check the column names and contents
print(df.head())

                                              Review  Rating
0  nice hotel expensive parking got good deal sta...       4
1  ok nothing special charge diamond member hilto...       2
2  nice rooms not 4* experience hotel monaco seat...       3
3  unique, great stay, wonderful time hotel monac...       5
4  great stay great stay, went seahawk game aweso...       5


In [3]:
# Assuming the correct column names are 'Rating' and 'Review'
# Map the ratings to binary labels
def map_rating_to_sentiment(rating):
    if rating in [4, 5]:
        return 'positive'
    elif rating in [1, 2]:
        return 'negative'
    else:
        return 'neutral'

df['Sentiment'] = df['Rating'].apply(map_rating_to_sentiment)

# Check the distribution of sentiments
print(df['Sentiment'].value_counts())

Sentiment
positive    15093
negative     3214
neutral      2184
Name: count, dtype: int64


In [4]:
# Use the new 'Sentiment' column as the target variable
y = df['Sentiment'].values
X = df['Review'].values.astype('U')

# Initialize TfidfVectorizer
tfidf = TfidfVectorizer(
    use_idf=True,
    norm='l2',
    smooth_idf=True,
    stop_words='english'
)


In [5]:
# Transform the reviews
X_tfidf = tfidf.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, random_state=1, test_size=0.5, shuffle=False)

In [6]:
# Train a multi-class classifier
clf = LogisticRegressionCV(cv=5,
                           scoring='accuracy',
                           random_state=0,
                           n_jobs=-1,
                           verbose=3,
                           max_iter=300,
                           multi_class='ovr').fit(X_train, y_train)

# Save the model
saved_model = open('saved_model.sav', 'wb')
pickle.dump(clf, saved_model)
saved_model.close()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:   27.4s remaining:   41.2s
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:   51.2s remaining:   12.7s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   53.9s finished


In [7]:
# Load the model for prediction
filename = 'saved_model.sav'
saved_clf = pickle.load(open(filename, 'rb'))

# Evaluate the model
score = saved_clf.score(X_test, y_test)
print(f"Model accuracy: {score}")

Model accuracy: 0.8617997267226235


In [8]:
from sklearn.metrics import classification_report,confusion_matrix                                      

# Generate the classification report
y_pred = saved_clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.80      0.76      0.78      1513
     neutral       0.55      0.14      0.22      1055
    positive       0.88      0.98      0.93      7678

    accuracy                           0.86     10246
   macro avg       0.74      0.63      0.64     10246
weighted avg       0.84      0.86      0.83     10246

[[1150   68  295]
 [ 204  147  704]
 [  92   53 7533]]


In [10]:
# Example input text for prediction
input_text = ["This hotel was fantastic! I loved it.", 
              "The room was dirty and the service was terrible.",
              "It was an average stay, not too bad but not great either.",
              "The hotel's amenities were satisfactory and the location was convenient."]

# Transform the input text using the fitted vectorizer
X_input = tfidf.transform(input_text)

# Make predictions
predictions = saved_clf.predict(X_input)

# Print predictions
print(predictions)

['positive' 'negative' 'neutral' 'positive']
