In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [7]:
df = pd.read_csv("Downloads/Indian_Domestic_Airline.csv")

In [33]:
df

Unnamed: 0,Review,Recommond,sentiment,clean_review
0,✅ Trip Verified | I had booked this fare at a ...,yes,1,trip verified i had booked this fare at a ve...
1,✅ Trip Verified | I travel at least four times...,no,0,trip verified i travel at least four times a...
2,✅ Trip Verified | Taking this opportunity to a...,yes,1,trip verified taking this opportunity to app...
3,✅ Trip Verified | The worst airlines in the hi...,no,0,trip verified the worst airlines in the hist...
4,✅ Trip Verified | Would like to give big thum...,yes,1,trip verified would like to give big thumbs...
...,...,...,...,...
2205,"Goa to Mumbai, and Vistara was a good experien...",yes,1,goa to mumbai and vistara was a good experienc...
2206,Mumbai to Bengaluru. This was my third time fl...,yes,1,mumbai to bengaluru this was my third time fly...
2207,Flew Vistara for the first time in February fr...,yes,1,flew vistara for the first time in february fr...
2208,Bhubaneswar to Delhi with Vistara. I had booke...,yes,1,bhubaneswar to delhi with vistara i had booked...


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2210 entries, 0 to 2209
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Review        2210 non-null   object
 1   Recommond     2210 non-null   object
 2   sentiment     2210 non-null   int64 
 3   clean_review  2210 non-null   object
dtypes: int64(1), object(3)
memory usage: 69.2+ KB


In [35]:
df.describe()

Unnamed: 0,sentiment
count,2210.0
mean,0.344796
std,0.475409
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [37]:
df.isnull().sum()

Review          0
Recommond       0
sentiment       0
clean_review    0
dtype: int64

In [9]:
df = df[['Review', 'Recommond']]

In [39]:
df

Unnamed: 0,Review,Recommond,sentiment,clean_review
0,✅ Trip Verified | I had booked this fare at a ...,yes,1,trip verified i had booked this fare at a ve...
1,✅ Trip Verified | I travel at least four times...,no,0,trip verified i travel at least four times a...
2,✅ Trip Verified | Taking this opportunity to a...,yes,1,trip verified taking this opportunity to app...
3,✅ Trip Verified | The worst airlines in the hi...,no,0,trip verified the worst airlines in the hist...
4,✅ Trip Verified | Would like to give big thum...,yes,1,trip verified would like to give big thumbs...
...,...,...,...,...
2205,"Goa to Mumbai, and Vistara was a good experien...",yes,1,goa to mumbai and vistara was a good experienc...
2206,Mumbai to Bengaluru. This was my third time fl...,yes,1,mumbai to bengaluru this was my third time fly...
2207,Flew Vistara for the first time in February fr...,yes,1,flew vistara for the first time in february fr...
2208,Bhubaneswar to Delhi with Vistara. I had booke...,yes,1,bhubaneswar to delhi with vistara i had booked...


In [11]:
df['sentiment'] = df['Recommond'].map({'yes': 1, 'no': 0})

In [13]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # remove special characters
    return text

df['clean_review'] = df['Review'].apply(clean_text)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_review'],
    df['sentiment'],
    test_size=0.3,
    random_state=42
)


In [43]:
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2)
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [45]:
model = LogisticRegression()

In [47]:
model

In [49]:
model.fit(X_train_tfidf, y_train)

In [51]:
y_pred = model.predict(X_test_tfidf)

In [53]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,

In [55]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Accuracy: 0.9095022624434389

Classification Report:

              precision    recall  f1-score   support

           0       0.90      0.97      0.93       439
           1       0.93      0.79      0.86       224

    accuracy                           0.91       663
   macro avg       0.92      0.88      0.89       663
weighted avg       0.91      0.91      0.91       663



In [60]:
from sklearn.naive_bayes import MultinomialNB
model_NB = MultinomialNB()

In [62]:
model_NB

In [68]:
model_NB.fit(X_train_tfidf, y_train)

In [70]:
y_pred_NB = model_NB.predict(X_test_tfidf)

In [72]:
y_pred_NB

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,

In [74]:
print("Accuracy:", accuracy_score(y_test, y_pred_NB))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_NB))

Accuracy: 0.9034690799396682

Classification Report:

              precision    recall  f1-score   support

           0       0.89      0.97      0.93       439
           1       0.93      0.78      0.84       224

    accuracy                           0.90       663
   macro avg       0.91      0.87      0.89       663
weighted avg       0.91      0.90      0.90       663



In [78]:
from sklearn.svm import SVC

In [80]:
model_SVC = SVC()

In [82]:
model_SVC

In [84]:
model_SVC.fit(X_train_tfidf, y_train)

In [86]:
y_pred_SVC = model_NB.predict(X_test_tfidf)

In [88]:
y_pred_SVC

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,

In [90]:
print("Accuracy:", accuracy_score(y_test, y_pred_SVC))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_SVC))

Accuracy: 0.9034690799396682

Classification Report:

              precision    recall  f1-score   support

           0       0.89      0.97      0.93       439
           1       0.93      0.78      0.84       224

    accuracy                           0.90       663
   macro avg       0.91      0.87      0.89       663
weighted avg       0.91      0.90      0.90       663

