##### Predicting reviewer‘s cabin class based on their written review on Skytrax

In [20]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textacy.preprocess import preprocess_text
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
# we open the file for reading
fileObject1 = open("df_air.pickle",'rb')
fileObject2 = open("df_main.pickle",'rb')
fileObject3 = open("star.pickle",'rb')
fileObject4 = open("budget.pickle",'rb')

# load the object from the file into variables
df_air = pickle.load(fileObject1)  
df = pickle.load(fileObject2) 
star = pickle.load(fileObject3) 
budget = pickle.load(fileObject4) 

In [3]:
df.reset_index(inplace=True)
df.drop(columns=['index'],axis=1,inplace=True)
df.head(50)

Unnamed: 0,airline_name,author_country,content,cabin_flown,overall_rating,seat_comfort_rating,cabin_staff_rating,food_beverages_rating,inflight_entertainment_rating,ground_service_rating,wifi_connectivity_rating,value_money_rating,recommend,airline,cabin_class,overall
0,0,54,Outbound flight FRA/PRN A319. 2 hours 10 min f...,1,7.0,4.0,4.0,4.0,0.0,3.0,3.0,4.0,1,adria-airways,Economy,3.5
1,0,149,Two short hops ZRH-LJU and LJU-VIE. Very fast ...,0,10.0,4.0,5.0,4.0,1.0,3.0,3.0,5.0,1,adria-airways,Business Class,5.0
2,0,138,Flew Zurich-Ljubljana on JP365 newish CRJ900. ...,1,9.0,5.0,5.0,4.0,0.0,3.0,3.0,5.0,1,adria-airways,Economy,4.5
3,0,129,Adria serves this 100 min flight from Ljubljan...,0,8.0,4.0,4.0,3.0,1.0,3.0,3.0,4.0,1,adria-airways,Business Class,4.0
4,0,118,WAW-SKJ Economy. No free snacks or drinks on t...,1,4.0,4.0,2.0,1.0,2.0,3.0,3.0,2.0,0,adria-airways,Economy,2.0
5,0,54,Sarajevo-Frankfurt via Ljubljana. I loved flyi...,1,9.0,4.0,4.0,3.0,3.0,3.0,3.0,4.0,1,adria-airways,Economy,4.5
6,0,51,I had flights from Paris to Sarajevo via Ljubl...,1,5.0,4.0,4.0,1.0,0.0,3.0,3.0,3.0,1,adria-airways,Economy,2.5
7,0,131,LJU to FRA and back both flights were on time....,1,9.0,5.0,5.0,4.0,3.0,3.0,3.0,4.0,1,adria-airways,Economy,4.5
8,0,129,On my Ljubljana - Munich flight in business cl...,0,8.0,4.0,3.0,4.0,1.0,3.0,3.0,4.0,1,adria-airways,Business Class,4.0
9,0,131,Flights from LJU to ZRH and back all on time. ...,1,10.0,5.0,5.0,4.0,4.0,3.0,3.0,4.0,1,adria-airways,Economy,5.0


In [4]:
df.columns

Index(['airline_name', 'author_country', 'content', 'cabin_flown',
       'overall_rating', 'seat_comfort_rating', 'cabin_staff_rating',
       'food_beverages_rating', 'inflight_entertainment_rating',
       'ground_service_rating', 'wifi_connectivity_rating',
       'value_money_rating', 'recommend', 'airline', 'cabin_class', 'overall'],
      dtype='object')

In [5]:
y = df['cabin_flown']
#.astype('uint8')

In [6]:
clean_text = [preprocess_text(x, fix_unicode=True, lowercase=True, transliterate=False,
                              no_urls=True, no_emails=True, no_phone_numbers=True, no_currency_symbols=True,
                              no_punct=True, no_accents=True)
              for x in df['content'].values]

#### MULTI-CLASS CLASSIFICATION MODEL

In [7]:
tfv = TfidfVectorizer(ngram_range=(1,2), max_features=50000)
X = tfv.fit_transform((clean_text))

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
rf = RandomForestClassifier(n_estimators=250, verbose=1)
knn = KNeighborsClassifier(n_neighbors=5)

In [10]:
rf.fit(X_train, y_train)
knn.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:  1.8min finished


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [11]:
# Random forest and KNN scores:
print('RF:', rf.score(X_test, y_test))
print('KNN:', knn.score(X_test, y_test))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    2.8s finished


RF: 0.8846620450606586
KNN: 0.8304159445407279


In [12]:
rf_yhat = rf.predict(X_test)
#rf_yhat = knn.predict(X_test)

print(classification_report(y_test, rf_yhat))
print(confusion_matrix(y_test, rf_yhat))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    2.9s finished


              precision    recall  f1-score   support

           0       0.83      0.72      0.77      1907
           1       0.89      0.98      0.94      8917
           2       0.92      0.13      0.23       266
           3       0.86      0.08      0.15       450

   micro avg       0.88      0.88      0.88     11540
   macro avg       0.88      0.48      0.52     11540
weighted avg       0.88      0.88      0.86     11540

[[1381  523    0    3]
 [ 154 8757    3    3]
 [  77  154   35    0]
 [  54  360    0   36]]


#### Preparing data for - Predicted cabin class result

In [13]:
y_predict = pd.DataFrame(rf_yhat)
y_predict.columns = ['predicted_class']
y_predict['predicted_class_desc'] = y_predict['predicted_class'].map({1:'Econ',0:'Business',2:'First',3:'Prem'}) 

#### Preparing data for - Tested subset dataset

In [14]:
y_test = df.loc[y_test.index]
y_test.reset_index(inplace=True)

In [15]:
result = pd.concat([y_test, y_predict], axis=1)

In [28]:
result[['content','cabin_flown','cabin_class','predicted_class','predicted_class_desc']].head(55)

Unnamed: 0,content,cabin_flown,cabin_class,predicted_class,predicted_class_desc
0,Travelled round trip SIN-ZRH-GVA. All flights ...,1,Economy,1,Econ
1,18th January EY0455 SYD-AUH. Flight back to Sy...,1,Economy,1,Econ
2,Flight to Cape Town from CDG. Booked premium e...,3,Premium Economy,1,Econ
3,Nadi-Auckland Economy on old 747. All seats an...,1,Economy,1,Econ
4,I have used Eastern Airways for a number of ye...,1,Economy,1,Econ
5,TBS-DOH-MXP. Both aircraft's seemed relatively...,1,Economy,1,Econ
6,EK005 7th Jan Business class with my wife and ...,0,Business Class,0,Business
7,Intersky is another airline charging high pric...,1,Economy,1,Econ
8,Jakarta - Kuala Lumpur - Amsterdam - Helsinki ...,1,Economy,1,Econ
9,HKG-CDG Flight 185. FA's were particularly fri...,0,Business Class,0,Business
