In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Uyama\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Data Processing

In [5]:
df=pd.read_csv("airlines_reviews.csv")
df.head()

Unnamed: 0,Title,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Overall Rating,Recommended
0,Flight was amazing,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,4,4,4,4,4,9,yes
1,seats on this aircraft are dreadful,Robert Watson,2024-02-21,Singapore Airlines,True,Booking an emergency exit seat still meant h...,Solo Leisure,February 2024,Phuket to Singapore,Economy Class,5,3,4,4,1,3,no
2,Food was plentiful and tasty,S Han,2024-02-20,Singapore Airlines,True,Excellent performance on all fronts. I would...,Family Leisure,February 2024,Siem Reap to Singapore,Economy Class,1,5,2,1,5,10,yes
3,“how much food was available,D Laynes,2024-02-19,Singapore Airlines,True,Pretty comfortable flight considering I was f...,Solo Leisure,February 2024,Singapore to London Heathrow,Economy Class,5,5,5,5,5,10,yes
4,“service was consistently good”,A Othman,2024-02-19,Singapore Airlines,True,The service was consistently good from start ...,Family Leisure,February 2024,Singapore to Phnom Penh,Economy Class,5,5,5,5,5,10,yes


In [6]:
# remove any columns with missing values
df = df.dropna(subset=["Reviews", "Recommended"])

In [7]:
# convert categorical feature to numerical feature
df["Recommended"] = df["Recommended"].apply(lambda x: 1 if x.lower()== "yes" else 0)

In [8]:
# check for missing values in Reviews column
df.isnull().sum()

Title                     0
Name                      0
Review Date               0
Airline                   0
Verified                  0
Reviews                   0
Type of Traveller         0
Month Flown               0
Route                     0
Class                     0
Seat Comfort              0
Staff Service             0
Food & Beverages          0
Inflight Entertainment    0
Value For Money           0
Overall Rating            0
Recommended               0
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8100 entries, 0 to 8099
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Title                   8100 non-null   object
 1   Name                    8100 non-null   object
 2   Review Date             8100 non-null   object
 3   Airline                 8100 non-null   object
 4   Verified                8100 non-null   object
 5   Reviews                 8100 non-null   object
 6   Type of Traveller       8100 non-null   object
 7   Month Flown             8100 non-null   object
 8   Route                   8100 non-null   object
 9   Class                   8100 non-null   object
 10  Seat Comfort            8100 non-null   int64 
 11  Staff Service           8100 non-null   int64 
 12  Food & Beverages        8100 non-null   int64 
 13  Inflight Entertainment  8100 non-null   int64 
 14  Value For Money         8100 non-null   int64 
 15  Over

In [10]:
# drop unecessary columns
df= df.drop(columns=['Title'])


In [11]:
df.head()

Unnamed: 0,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Overall Rating,Recommended
0,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,4,4,4,4,4,9,1
1,Robert Watson,2024-02-21,Singapore Airlines,True,Booking an emergency exit seat still meant h...,Solo Leisure,February 2024,Phuket to Singapore,Economy Class,5,3,4,4,1,3,0
2,S Han,2024-02-20,Singapore Airlines,True,Excellent performance on all fronts. I would...,Family Leisure,February 2024,Siem Reap to Singapore,Economy Class,1,5,2,1,5,10,1
3,D Laynes,2024-02-19,Singapore Airlines,True,Pretty comfortable flight considering I was f...,Solo Leisure,February 2024,Singapore to London Heathrow,Economy Class,5,5,5,5,5,10,1
4,A Othman,2024-02-19,Singapore Airlines,True,The service was consistently good from start ...,Family Leisure,February 2024,Singapore to Phnom Penh,Economy Class,5,5,5,5,5,10,1


In [12]:
# checking the distribution of recommended column
df["Recommended"].value_counts()


Recommended
1    4287
0    3813
Name: count, dtype: int64

In [13]:
# applying stemming
import swifter
port_stem = PorterStemmer()

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
def stemming(content):
    # Remove non-alphabetic characters, keeping spaces
    stemmed_content = re.sub(r'[^a-zA-Z\s]', '', content)
    # Convert to lowercase
    stemmed_content = stemmed_content.lower()
    # Split the content into words
    stemmed_content = stemmed_content.split()
    # Stem each word and remove stopwords
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if word not in stopwords.words("english")]
    # Join the stemmed words back into a string
    stemmed_content = " ".join(stemmed_content)

    return stemmed_content

In [15]:
#create a new column and apply the stemming function
df["stemmed_content"] = df["Reviews"].swifter.apply(stemming)

Pandas Apply: 100%|██████████| 8100/8100 [05:59<00:00, 22.51it/s]


In [16]:
df.head()

Unnamed: 0,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Overall Rating,Recommended,stemmed_content
0,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,4,4,4,4,4,9,1,flight amaz crew onboard flight welcom gave go...
1,Robert Watson,2024-02-21,Singapore Airlines,True,Booking an emergency exit seat still meant h...,Solo Leisure,February 2024,Phuket to Singapore,Economy Class,5,3,4,4,1,3,0,book emerg exit seat still meant huge discomfo...
2,S Han,2024-02-20,Singapore Airlines,True,Excellent performance on all fronts. I would...,Family Leisure,February 2024,Siem Reap to Singapore,Economy Class,1,5,2,1,5,10,1,excel perform front would definit choos use ai...
3,D Laynes,2024-02-19,Singapore Airlines,True,Pretty comfortable flight considering I was f...,Solo Leisure,February 2024,Singapore to London Heathrow,Economy Class,5,5,5,5,5,10,1,pretti comfort flight consid fli economi class...
4,A Othman,2024-02-19,Singapore Airlines,True,The service was consistently good from start ...,Family Leisure,February 2024,Singapore to Phnom Penh,Economy Class,5,5,5,5,5,10,1,servic consist good start finish cabin crew sh...


In [39]:
# save the cleaned dataset for further analysis
df.to_csv('airline_reviews_clean.csv', index=False)


In [17]:
print(df["stemmed_content"])

0       flight amaz crew onboard flight welcom gave go...
1       book emerg exit seat still meant huge discomfo...
2       excel perform front would definit choos use ai...
3       pretti comfort flight consid fli economi class...
4       servic consist good start finish cabin crew sh...
                              ...                        
8095    ke brisban incheon ke incheon ulaanbaatar kore...
8096    recent flight fourth trip uk year sydney londo...
8097    flew korean air bali seoul prestig class busi ...
8098    seoul pari korean air travel triathlon bike so...
8099    hour flight busi class seoul toronto onboard k...
Name: stemmed_content, Length: 8100, dtype: object


In [18]:
print(df["Recommended"])

0       1
1       0
2       1
3       1
4       1
       ..
8095    1
8096    1
8097    0
8098    1
8099    1
Name: Recommended, Length: 8100, dtype: int64


In [19]:
# separating the data and label
X = df["stemmed_content"].values
y = df["Recommended"].values

In [20]:
print(X)

['flight amaz crew onboard flight welcom gave good atmospher crew serv aisl goe initi g kind help gave mom bday cake late celebr even though hr min flight seat well sanit legroom spaciou ife onboard mani varieti show music etc bathroom alway kept clean crew time food delici overal flight'
 'book emerg exit seat still meant huge discomfort seat far narrow poor pad mean back ach minut flight seat aircraft dread headphon sound entertain system dread'
 'excel perform front would definit choos use airlin aircraft wellmaintain staff welltrain hospit food plenti tasti'
 ...
 'flew korean air bali seoul prestig class busi class though one longest intraasia rout hour food servic onboard realli poor first proper meal serv take korean air probabl expect passeng would choos sleep instead first meal one satay two drop hot sauc second meal breakfast land one bowl wateri conge three differ top one worst cater experi secondli servic sloppi fa interact indiffer didnt want bother also notic serv quarter

### Split the data into training and testing sets


In [21]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify= y, random_state=42)

In [22]:
print(X.shape, X_train.shape, X_test.shape)

(8100,) (6480,) (1620,)


In [23]:
print(X_train)

['travel back forth pari tokyo japan airlin far best airlin travel great travel sinc work tourism found staff checkin board welcom help attent avail meal tray plenti good well snack servic final space seat wide allow leg unfold well size screen also perfect travel pleasantli would take compani without hesit custom fr jai voyag en allerretour pari tokyo avec japan airlin et cest de loin la meilleur compagni avec laquel jai voyag je sui une grand voyageus puisqu je travail dan le tourism jai trouv le personnel lenregistr et bord accueil serviabl lcout et dispon le plateaux repa taient copieux et tr bon ainsi que le enca entr chaqu prestat enfin lespac entr le sige est larg ce qui permet de bien drouler se jamb la taill de lcran tait parfait galement jai agrabl voyag je reprendrai cett compagni san hsitat et pour me client galement'
 'first experi qatar airway sure last one recommend airlin everyon like travel smoothli comfort stopov doha long ground flight servic perfect without problem 

In [24]:
print(X_test)

['travel intern sever time year first time take qatar airway impress cleanli two aircraft use trip profession crew crew attend need pleasant manner condit made long trip enjoy definit take qatar airway whenev possibl alreadi recommend famili friend keep qatar airway'
 'absolut disappoint busi class servic flown hr serv us even water bottl singl person even ask servic might need start great breakfast noth zero servic flight smooth issu flown mani mani time never anyway near qatar emir even british airway expect class wors intern biz class flown last year'
 'one fantast airlin seat product spaciou provid reason amount privaci busi class travel need portion food accept snack also includ passeng would get hungri journey visit loung c london heathrow airport amaz loung ambienc environ provid lot comfort choic food tend toward hong kong asian style pleas see high reput airlin cabin crew will serv howev inflight announc clear overal enjoy time cathay pacif'
 ...
 'bangkok tokyonarita never se

In [25]:
# convert the textual data to numerical values
 # Initialize TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

# Transform the text data into TF-IDF features
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [26]:
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 386847 stored elements and shape (6480, 12775)>
  Coords	Values
  (0, 11565)	0.10114653326477212
  (0, 847)	0.025550085207482892
  (0, 4389)	0.06158940026379799
  (0, 8193)	0.06461220691352706
  (0, 11426)	0.0769465733097679
  (0, 5903)	0.09309849710767254
  (0, 267)	0.04897239764357348
  (0, 4061)	0.036591533395320584
  (0, 1084)	0.028773545042256488
  (0, 4811)	0.026160539963953098
  (0, 10168)	0.03383734608073756
  (0, 12600)	0.029608395126597808
  (0, 11492)	0.15340997774272858
  (0, 4398)	0.03310367614220309
  (0, 10601)	0.02044819129848043
  (0, 1894)	0.031164876612599153
  (0, 1241)	0.021528560293760424
  (0, 12418)	0.03913729761663021
  (0, 5110)	0.025821141036206557
  (0, 734)	0.03240637319840351
  (0, 800)	0.03146400518513533
  (0, 6916)	0.02318618177031166
  (0, 11573)	0.04274118381176978
  (0, 8515)	0.04038649018933007
  (0, 4742)	0.02010646412282351
  :	:
  (6479, 10440)	0.08462098028604018
  (6479, 2177)	0.1121

In [27]:
print(X_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 93314 stored elements and shape (1620, 12775)>
  Coords	Values
  (0, 250)	0.11011498787796865
  (0, 286)	0.3543370349788432
  (0, 365)	0.148678710617093
  (0, 731)	0.11820979084044135
  (0, 2054)	0.2286210989072657
  (0, 2295)	0.18498331437192078
  (0, 2571)	0.1631254208076463
  (0, 2849)	0.15282533420516478
  (0, 3739)	0.14889754180934361
  (0, 4045)	0.15393705917217573
  (0, 4194)	0.09415281828084936
  (0, 4469)	0.19217130728187276
  (0, 5502)	0.15104206777305423
  (0, 5737)	0.16524821659642247
  (0, 6070)	0.15860030740553693
  (0, 6594)	0.11164236302021022
  (0, 6731)	0.11539029793972781
  (0, 6819)	0.2066705095653867
  (0, 7471)	0.11653967859519315
  (0, 8509)	0.14484305370293518
  (0, 8611)	0.1569359650702993
  (0, 8826)	0.1383371998038665
  (0, 8952)	0.33046320734323054
  (0, 9158)	0.13172662193525372
  (0, 9962)	0.14956172886026872
  :	:
  (1619, 6809)	0.1090303128753126
  (1619, 7073)	0.18029468772220456
  (1619, 709

### Model training

In [28]:
model = LogisticRegression(max_iter=1000)

model.fit(X_train,y_train)

### Model Evaluation

In [29]:
from sklearn.metrics import accuracy_score

# Predict on the training data
y_train_pred = model.predict(X_train)

# Calculate the accuracy on the training data
model_accuracy_train = accuracy_score(y_train, y_train_pred)

# Print the accuracy score
print("Training Accuracy:", model_accuracy_train)


Training Accuracy: 0.9333333333333333


In [30]:
print("X_train shape:", X_train.shape)
print("y_train shape:", len(y_train))
print("X_test shape:", X_test.shape)
print("y_test shape:", len(y_test))


X_train shape: (6480, 12775)
y_train shape: 6480
X_test shape: (1620, 12775)
y_test shape: 1620


In [31]:
# Fit the model on the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy)


Test Accuracy: 0.9055555555555556


In [32]:
#save the trained model
import pickle
pickle.dump(model,open("airlines_model.pkl","wb"))

### Making predictions using the saved model

In [34]:
# Load the saved model
with open("airlines_model.pkl", 'rb') as file:
    loaded_model = pickle.load(file)

In [35]:
# Extract the 200th sample from X_test
X_new = X_test[200]  # Use indexing to get the 200th sample

# Check the true label for the 200th sample
print(y_test[200])

1


In [37]:
prediction = loaded_model.predict(X_new)

# Print the prediction
print("Prediction:", prediction)

# Check if the prediction is positive or negative
if prediction[0] == 0:
    print("Negative Review")
else:
    print("Positive Review")

Prediction: [1]
Positive Review


In [38]:
X_new = X_test[68] 
print(y_test[68])

prediction = loaded_model.predict(X_new)

# Print the prediction
print("Prediction:", prediction)

# Check if the prediction is positive or negative
if prediction[0] == 0:
    print("Negative Review")
else:
    print("Positive Review")

0
Prediction: [0]
Negative Review
