### Supervised Sentiment Analysis
- Reviews column as the input (review text)
- Recommended column as the target (positive/negative sentiment)

In [3]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
# Show which scikit-learn release produced the results in this notebook
print(f"{sklearn.__version__}")

1.4.2


In [5]:
# Read the airline reviews CSV (path is relative to the notebook's working
# directory) into a DataFrame, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='airlines_reviews.csv')
df.head(n=5)

Unnamed: 0,Title,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Overall Rating,Recommended
0,Flight was amazing,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,4,4,4,4,4,9,yes
1,seats on this aircraft are dreadful,Robert Watson,2024-02-21,Singapore Airlines,True,Booking an emergency exit seat still meant h...,Solo Leisure,February 2024,Phuket to Singapore,Economy Class,5,3,4,4,1,3,no
2,Food was plentiful and tasty,S Han,2024-02-20,Singapore Airlines,True,Excellent performance on all fronts. I would...,Family Leisure,February 2024,Siem Reap to Singapore,Economy Class,1,5,2,1,5,10,yes
3,“how much food was available,D Laynes,2024-02-19,Singapore Airlines,True,Pretty comfortable flight considering I was f...,Solo Leisure,February 2024,Singapore to London Heathrow,Economy Class,5,5,5,5,5,10,yes
4,“service was consistently good”,A Othman,2024-02-19,Singapore Airlines,True,The service was consistently good from start ...,Family Leisure,February 2024,Singapore to Phnom Penh,Economy Class,5,5,5,5,5,10,yes


In [6]:
# Preprocessing, training and evaluation of the sentiment classifier.
#
# This cell is safe to re-run on a live kernel. The label encoding used to
# call `.lower()` on every value, so a second run (after the column had already
# been replaced by integers) raised AttributeError.


def _encode_recommended(value):
    """Map a 'Recommended' label to 1/0.

    Strings are compared case-insensitively: 'yes' -> 1, anything else -> 0.
    Non-string values are taken to be labels already encoded by an earlier run
    of this cell and are returned unchanged as ``int``.
    """
    if isinstance(value, str):
        return 1 if value.lower() == 'yes' else 0
    return int(value)


# Encode target: 'yes' -> 1, 'no' -> 0
df['Recommended'] = df['Recommended'].apply(_encode_recommended)

# Clean the Reviews column: lowercase, then replace every character that is
# not an ASCII letter (digits, punctuation, accented letters) with a space.
# Applying this twice gives the same result, so re-running is harmless.
df['Reviews'] = df['Reviews'].str.lower().str.replace('[^a-zA-Z]', ' ', regex=True)

# Split data into training (80%) and testing (20%) sets; the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['Reviews'],
    df['Recommended'],
    test_size=0.2,
    random_state=42,
)

# Convert text to TF-IDF features. The vectorizer is fitted on the training
# split only and then applied to the test split, so no test-set vocabulary or
# document frequencies leak into training.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Train a Logistic Regression model with scikit-learn's default settings
model = LogisticRegression()
model.fit(X_train_vect, y_train)

# Make predictions on the held-out test set
y_pred = model.predict(X_test_vect)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       761
           1       0.92      0.93      0.92       859

    accuracy                           0.92      1620
   macro avg       0.92      0.92      0.92      1620
weighted avg       0.92      0.92      0.92      1620

Confusion Matrix:
[[687  74]
 [ 60 799]]


In [7]:
import joblib

# Persist the fitted Logistic Regression classifier to disk
joblib.dump(value=model, filename='sentiment_model.pkl')

# Persist the fitted TF-IDF vectorizer alongside it. Both files are needed at
# inference time. The vectorizer was fitted on reviews that had been
# lowercased and had non-letter characters replaced by spaces, so new text
# needs the same cleaning before `vectorizer.transform` is called.
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']