# Import lib


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
from tqdm import tqdm

from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Loading data

In [2]:
df = pd.read_csv('/kaggle/input/airline2/text_cleaning.csv')
df.head()

Unnamed: 0,Review,Recommended,Overall rating,Review_cleaned,Review_bigrams,Review_trigrams
0,i bought roundtrip and return tickets from _gp...,no,1.0,buy roundtrip return ticket _gpe_ _gpe_ via _g...,buy roundtrip return ticket _gpe_ _gpe_ via _g...,buy roundtrip return ticket _gpe_ _gpe_ via _g...
1,i am shocked at how far aeroflot standards hav...,no,1.0,shock far aeroflot standard fall since invade ...,shock far aeroflot standard fall since invade ...,shock far aeroflot standard fall since invade ...
2,aeroflot has set high standards of achieving a...,no,5.0,aeroflot set high standard achieve star airlin...,aeroflot set high standard achieve star airlin...,aeroflot set high standard achieve star airlin...
3,best airline in _gpe_ very nice staff comforta...,yes,10.0,best airline _gpe_ nice staff comfortable seat...,best airline _gpe_ nice staff comfortable seat...,best airline _gpe_ nice staff comfortable seat...
4,i was very impressed with the staff many thank...,yes,10.0,impress staff many thank aeroflot representati...,impress staff many thank aeroflot representati...,impress staff many thank aeroflot representati...


In [3]:
review = df['Review_cleaned']
y = df['Overall rating']

# Tokenize

In [5]:
review_tokenized =[nltk.word_tokenize(item) for item in tqdm(review, total=len(review))]

100%|██████████| 68467/68467 [00:59<00:00, 1144.84it/s]


# GloVe

In [6]:
glove_path = "/kaggle/input/glove6b300dtxt/glove.6B.300d.txt"

In [7]:
def load_glove_model(glove_file):
    print("Loading Glove Model")
    f = open(glove_file, 'r')
    total_lines = sum(1 for line in open(glove_file, 'r'))
    model = {}
    vector_size = 300
    for line in tqdm(f, total=total_lines, desc="Processing GloVe Embeddings"):
        split_line = line.split()
        word = " ".join(split_line[0:len(split_line) - vector_size])
        embedding = np.array([float(val) for val in split_line[-vector_size:]])
        model[word] = embedding
    print("Done.\n" + str(len(model)) + " words loaded!")
    return model

In [8]:
glove = load_glove_model(glove_path)

Loading Glove Model


Processing GloVe Embeddings: 100%|██████████| 400000/400000 [00:48<00:00, 8291.80it/s]

Done.
400000 words loaded!





In [9]:
def sentence_features_v2(s, embeddings=glove,emb_size=300):
    words=[w for w in s if w.isalpha() and w in embeddings]
    if len(words)==0:
        return np.hstack([np.zeros(emb_size)])
    M=np.array([embeddings[w] for w in words])
    return M.mean(axis=0)

In [10]:
review_embbeded = np.array([sentence_features_v2(x) for x in review_tokenized])
review_embbeded.shape 

(68467, 300)

# ML models

In [32]:
stat = pd.read_csv('/kaggle/input/airline2/stat_data_processed.csv')
stat.head()

Unnamed: 0,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Value For Money,Recommended,Overall rating
0,1.0,1.0,1.0,1.0,1.0,1.0,0,1.0
1,2.0,2.0,1.0,1.0,1.0,1.0,0,1.0
2,5.0,2.0,4.0,5.0,2.0,2.0,0,5.0
3,5.0,5.0,5.0,3.0,5.0,5.0,1,10.0
4,4.0,5.0,4.0,4.0,5.0,4.0,1,10.0


In [34]:
stat.drop(columns=['Overall rating'], inplace=True)
X = np.concatenate((review_embbeded, stat.values), axis=1)
X.shape

(68467, 307)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((54773, 307), (13694, 307), (54773,), (13694,))

## SVM

In [36]:
clf = LinearSVC(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.71      0.98      0.82      5602
         2.0       0.17      0.07      0.10      1476
         3.0       0.18      0.12      0.14       897
         4.0       0.09      0.00      0.00       519
         5.0       0.00      0.00      0.00       524
         6.0       0.00      0.00      0.00       418
         7.0       0.29      0.26      0.27       614
         8.0       0.37      0.42      0.39      1029
         9.0       0.40      0.31      0.35      1175
        10.0       0.58      0.85      0.69      1440

    accuracy                           0.57     13694
   macro avg       0.28      0.30      0.28     13694
weighted avg       0.46      0.57      0.50     13694





## Logistic Regression

In [37]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.75      0.95      0.84      5602
         2.0       0.28      0.13      0.18      1476
         3.0       0.23      0.17      0.20       897
         4.0       0.22      0.09      0.13       519
         5.0       0.34      0.24      0.28       524
         6.0       0.28      0.13      0.18       418
         7.0       0.36      0.38      0.37       614
         8.0       0.43      0.49      0.46      1029
         9.0       0.45      0.39      0.42      1175
        10.0       0.67      0.78      0.72      1440

    accuracy                           0.60     13694
   macro avg       0.40      0.38      0.38     13694
weighted avg       0.54      0.60      0.56     13694



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Random Forest

In [38]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.68      0.98      0.81      5602
         2.0       0.24      0.08      0.12      1476
         3.0       0.20      0.07      0.10       897
         4.0       0.16      0.02      0.03       519
         5.0       0.36      0.07      0.12       524
         6.0       0.31      0.04      0.07       418
         7.0       0.34      0.26      0.29       614
         8.0       0.39      0.43      0.41      1029
         9.0       0.38      0.33      0.35      1175
        10.0       0.59      0.82      0.69      1440

    accuracy                           0.58     13694
   macro avg       0.37      0.31      0.30     13694
weighted avg       0.49      0.58      0.50     13694



## Decision Tree

In [39]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.78      0.76      0.77      5602
         2.0       0.22      0.24      0.23      1476
         3.0       0.17      0.17      0.17       897
         4.0       0.13      0.13      0.13       519
         5.0       0.20      0.19      0.20       524
         6.0       0.15      0.15      0.15       418
         7.0       0.26      0.29      0.27       614
         8.0       0.31      0.31      0.31      1029
         9.0       0.33      0.33      0.33      1175
        10.0       0.60      0.58      0.59      1440

    accuracy                           0.49     13694
   macro avg       0.32      0.32      0.32     13694
weighted avg       0.50      0.49      0.49     13694



# Gaussian NB

In [40]:
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.79      0.78      0.79      5602
         2.0       0.21      0.18      0.19      1476
         3.0       0.18      0.12      0.14       897
         4.0       0.10      0.12      0.11       519
         5.0       0.12      0.17      0.14       524
         6.0       0.14      0.41      0.20       418
         7.0       0.24      0.27      0.26       614
         8.0       0.32      0.25      0.28      1029
         9.0       0.34      0.29      0.31      1175
        10.0       0.57      0.49      0.53      1440

    accuracy                           0.48     13694
   macro avg       0.30      0.31      0.30     13694
weighted avg       0.50      0.48      0.48     13694



## KNN

In [41]:
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.75      0.92      0.82      5602
         2.0       0.28      0.20      0.23      1476
         3.0       0.24      0.16      0.19       897
         4.0       0.20      0.12      0.15       519
         5.0       0.25      0.14      0.18       524
         6.0       0.25      0.20      0.22       418
         7.0       0.32      0.40      0.35       614
         8.0       0.37      0.43      0.40      1029
         9.0       0.42      0.37      0.39      1175
        10.0       0.70      0.63      0.66      1440

    accuracy                           0.57     13694
   macro avg       0.38      0.35      0.36     13694
weighted avg       0.53      0.57      0.54     13694



## AdaBoost

In [42]:
adb = AdaBoostClassifier()
adb.fit(X_train, y_train)
y_pred = adb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.78      0.78      0.78      5602
         2.0       0.24      0.31      0.27      1476
         3.0       0.21      0.21      0.21       897
         4.0       0.21      0.12      0.15       519
         5.0       0.31      0.15      0.20       524
         6.0       0.27      0.09      0.14       418
         7.0       0.25      0.62      0.36       614
         8.0       0.30      0.18      0.22      1029
         9.0       0.31      0.41      0.35      1175
        10.0       0.55      0.35      0.42      1440

    accuracy                           0.49     13694
   macro avg       0.34      0.32      0.31     13694
weighted avg       0.51      0.49      0.49     13694



## Gradient Boosting

In [43]:
gdb = GradientBoostingClassifier()
gdb.fit(X_train, y_train)
y_pred = gdb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.77      0.94      0.85      5602
         2.0       0.34      0.21      0.26      1476
         3.0       0.22      0.18      0.19       897
         4.0       0.19      0.09      0.13       519
         5.0       0.30      0.12      0.17       524
         6.0       0.28      0.11      0.16       418
         7.0       0.36      0.39      0.37       614
         8.0       0.42      0.49      0.45      1029
         9.0       0.44      0.41      0.43      1175
        10.0       0.68      0.76      0.72      1440

    accuracy                           0.60     13694
   macro avg       0.40      0.37      0.37     13694
weighted avg       0.55      0.60      0.57     13694

