# Install packages

In [None]:
!pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.12


# Import libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from lazypredict.Supervised import LazyClassifier

# Loading data

In [None]:
# Read the cleaned review dataset from CSV and preview its first five rows.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/BA documents/data/text_cleaning_new.csv',
)
df.head(n=5)

Unnamed: 0,Review id,Review header,Review,Recommended,Review_cleaned,Review_bigrams,Review_trigrams
0,anchor885571,"""very terrible experience""",i bought roundtrip and return tickets from _gp...,no,buy roundtrip return ticket _gpe_ _gpe_ via _g...,buy roundtrip return ticket _gpe_ _gpe_ via _g...,buy roundtrip return ticket _gpe_ _gpe_ via _g...
1,anchor881710,"""very concerned about the safety of Aeroflot""",i am shocked at how far aeroflot standards hav...,no,shock far aeroflot standard fall since invade ...,shock far aeroflot standard fall since invade ...,shock far aeroflot standard fall since invade ...
2,anchor767446,"""felt very rushed and unpolished""",aeroflot has set high standards of achieving a...,no,aeroflot set high standard achieve star airlin...,aeroflot set high standard achieve star airlin...,aeroflot set high standard achieve star airlin...
3,anchor758822,"""Best airline in Russia""",best airline in _gpe_ very nice staff comforta...,yes,best airline _gpe_ nice staff comfortable seat...,best airline _gpe_ nice staff comfortable seat...,best airline _gpe_ nice staff comfortable seat...
4,anchor754420,"""cabin crew were outstanding""",i was very impressed with the staff many thank...,yes,impress staff many thank aeroflot representati...,impress staff many thank aeroflot representati...,impress staff many thank aeroflot representati...


In [None]:
# Print column names, non-null counts and dtypes to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57932 entries, 0 to 57931
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Review id        57932 non-null  object
 1   Review header    57932 non-null  object
 2   Review           57932 non-null  object
 3   Recommended      57932 non-null  object
 4   Review_cleaned   57932 non-null  object
 5   Review_bigrams   57932 non-null  object
 6   Review_trigrams  57932 non-null  object
dtypes: object(7)
memory usage: 3.1+ MB


In [None]:
# Count how many reviews fall into each 'Recommended' class (class balance check).
df['Recommended'].value_counts()

no     34942
yes    22990
Name: Recommended, dtype: int64

In [None]:
# Encode the target as integers: 'yes' -> 1, 'no' -> 0.
#
# Series.map turns every value that is missing from the mapping into NaN, so
# re-running this cell on the already-encoded column would wipe all labels.
# The dtype guard makes the cell safe to execute more than once.
label_map = {'yes': 1, 'no': 0}
if not pd.api.types.is_numeric_dtype(df['Recommended']):
    encoded = df['Recommended'].map(label_map)
    # Fail loudly here instead of letting NaN labels reach the classifiers.
    if encoded.isna().any():
        unexpected = df.loc[encoded.isna(), 'Recommended'].unique().tolist()
        raise ValueError(f"Unexpected values in 'Recommended': {unexpected}")
    df['Recommended'] = encoded.astype('int64')

In [None]:
# Model input is the pre-processed review text; the target is the 'Recommended' label.
X, y = df['Review_trigrams'], df['Recommended']

In [None]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF Vectorizer

In [None]:
# Fit the TF-IDF vectorizer on the training reviews only, then apply the same
# fitted vectorizer to the test reviews, so nothing is learned from the test set
# and both matrices share the same columns.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Show (rows, columns) of both matrices; the column counts must match.
X_train_tfidf.shape, X_test_tfidf.shape

((46345, 29383), (11587, 29383))

# Machine learning models



In [None]:
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier


## SVM

In [None]:
# Linear SVM on the TF-IDF features; fit() returns the estimator, so it can be chained.
clf = LinearSVC(random_state=0).fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      7050
           1       0.91      0.89      0.90      4537

    accuracy                           0.92     11587
   macro avg       0.92      0.91      0.92     11587
weighted avg       0.92      0.92      0.92     11587



## Logistic regression

In [None]:
# Logistic regression with library defaults; note this rebinds `clf` from the SVM cell.
clf = LogisticRegression().fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      7050
           1       0.92      0.89      0.90      4537

    accuracy                           0.93     11587
   macro avg       0.92      0.92      0.92     11587
weighted avg       0.93      0.93      0.92     11587



## Random Forest



In [None]:
# Random forest with library defaults. The forest draws bootstrap samples and
# random feature subsets, so without a seed the report below changes on every
# run; the seed matches the one used for the train/test split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.93      0.92      7050
           1       0.89      0.84      0.86      4537

    accuracy                           0.90     11587
   macro avg       0.90      0.89      0.89     11587
weighted avg       0.90      0.90      0.90     11587



## Decision Tree

In [None]:
# Single decision tree with library defaults. scikit-learn permutes the
# candidate features at each split, so a seed is needed for a repeatable tree;
# the seed matches the one used for the train/test split.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_tfidf, y_train)
y_pred = dt.predict(X_test_tfidf)
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      7050
           1       0.77      0.77      0.77      4537

    accuracy                           0.82     11587
   macro avg       0.81      0.81      0.81     11587
weighted avg       0.82      0.82      0.82     11587



## GaussianNB

In [None]:
# Gaussian Naive Bayes baseline. The sparse TF-IDF matrices are converted to
# dense arrays with .toarray() before being passed to fit / predict.
# NOTE(review): with the training shape shown earlier in this notebook
# (46345 x 29383) the dense array is on the order of 10 GB if the values are
# float64 — confirm the runtime has that much memory, or consider a Naive Bayes
# variant that accepts sparse input.
nb = GaussianNB()
nb.fit(X_train_tfidf.toarray(), y_train)
y_pred = nb.predict(X_test_tfidf.toarray())
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.28      0.41      7050
           1       0.44      0.88      0.59      4537

    accuracy                           0.52     11587
   macro avg       0.61      0.58      0.50     11587
weighted avg       0.65      0.52      0.48     11587



## KNN

In [None]:
# k-nearest-neighbours with library defaults on the TF-IDF features.
knn = KNeighborsClassifier().fit(X_train_tfidf, y_train)
y_pred = knn.predict(X_test_tfidf)
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87      7050
           1       0.83      0.73      0.78      4537

    accuracy                           0.84     11587
   macro avg       0.84      0.82      0.83     11587
weighted avg       0.84      0.84      0.84     11587



## AdaBoost

In [None]:
# AdaBoost with library defaults. The seed is forwarded to the base estimators
# at each boosting round so the report below is repeatable; it matches the seed
# used for the train/test split.
adb = AdaBoostClassifier(random_state=42)
adb.fit(X_train_tfidf, y_train)
y_pred = adb.predict(X_test_tfidf)
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.92      0.90      7050
           1       0.86      0.82      0.84      4537

    accuracy                           0.88     11587
   macro avg       0.88      0.87      0.87     11587
weighted avg       0.88      0.88      0.88     11587



## Gradient Boosting

In [None]:
# Gradient boosting with library defaults. The seed fixes the randomness of the
# individual trees so the report below is repeatable; it matches the seed used
# for the train/test split.
gdb = GradientBoostingClassifier(random_state=42)
gdb.fit(X_train_tfidf, y_train)
y_pred = gdb.predict(X_test_tfidf)
# Per-class precision / recall / F1 on the held-out reviews.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91      7050
           1       0.88      0.82      0.85      4537

    accuracy                           0.89     11587
   macro avg       0.88      0.87      0.88     11587
weighted avg       0.89      0.89      0.89     11587

