In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

%matplotlib inline

In [14]:
df = pd.read_csv(r"C:\Users\wasim\tourism_package\data\Travel.csv")

In [15]:
df.shape

(4888, 20)

In [16]:
percentage = df.ProdTaken.value_counts(normalize=True)*100
pielabels = ["Bought","Not-Bought"]

# Plot PieChart with Ploty library
f1 = px.pie(values= percentage, names= pielabels, title="Percentage of package bought and not bought")
f1.update_traces(textposition="inside", textinfo="percent+label")
f1.update_layout(margin={"r":50,"t":50,"l":50,"b":50})
f1.show()

In [17]:
# Age
df['Age'].fillna(df['Age'].mean(), inplace=True)

# TypeofContact
df['TypeofContact'].fillna(df['TypeofContact'].mode()[0], inplace=True)

# DurationOfPitch
df['DurationOfPitch'].fillna(df['DurationOfPitch'].median(), inplace=True)

# NumberOfFollowups
df['NumberOfFollowups'].fillna(df['NumberOfFollowups'].mode()[0], inplace=True)

# PreferredPropertyStar
df['PreferredPropertyStar'].fillna(df['PreferredPropertyStar'].mode()[0], inplace=True)

# NumberOfTrips
df['NumberOfTrips'].fillna(0, inplace=True)

# NumberOfChildrenVisiting
df['NumberOfChildrenVisiting'].fillna(df['NumberOfChildrenVisiting'].mode()[0], inplace=True)

# MonthlyIncome
df['MonthlyIncome'].fillna(df['MonthlyIncome'].median(), inplace=True)

In [18]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder


categorical_columns = df.select_dtypes(include=['object']).columns

enc = OrdinalEncoder()
df[categorical_columns] = enc.fit_transform(df[categorical_columns])

In [19]:
# from collections import Counter
# from imblearn.over_sampling import SMOTE

# X = df.drop(columns=['ProdTaken', 'CustomerID'], axis=1)
# y = df['ProdTaken']

# # summarize class distribution
# counter = Counter(y)
# print(counter)
# # define pipeline
# over = SMOTE(sampling_strategy=0.4)

# # transform the dataset
# X, y = over.fit_resample(X, y)
# # summarize the new class distribution
# counter = Counter(y)
# print(counter)

# X.shape, y.shape

In [20]:
# import math
# from sklearn.model_selection import StratifiedShuffleSplit

# X[X.columns] = X[X.columns].apply(np.ceil) 
# sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=42)
# for train_index, test_index in sss.split(X, y):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# print(X_train.shape, X_test.shape)

# df = pd.concat([X_train, y_train], axis=1)

# test_df = pd.concat([X_test, y_test], axis=1)
# test_df.to_csv(r'C:\Users\wasim\tourism_package\data\test_df.csv', index=False)

In [25]:
X = df.drop(columns=['ProdTaken', 'CustomerID'], axis=1)
y = df['ProdTaken']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=36)
X_train.shape, X_test.shape

((4399, 18), (489, 18))

In [26]:
models = {
    "Random Forest": RandomForestClassifier(n_estimators=1000, min_samples_split=2, max_features= 'auto', max_depth= 20),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "XGBClassifier": XGBClassifier(), 
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "Support Vector Classifier": SVC(),
    "AdaBoost Classifier": AdaBoostClassifier()
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_train, y_train_pred) # Calculate Precision
    model_train_recall = recall_score(y_train, y_train_pred) # Calculate Recall


    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_test, y_test_pred) # Calculate Precision
    model_test_recall = recall_score(y_test, y_test_pred) # Calculate Recall


    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    
    print('='*35)
    print('\n')

Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.9591
- F1 score: 0.9570
- Precision: 0.9844
- Recall: 0.7683


Naive Bayes
Model performance for Training set
- Accuracy: 0.8361
- F1 score: 0.8181
- Precision: 0.6206
- Recall: 0.3592
----------------------------------
Model performance for Test set
- Accuracy: 0.8896
- F1 score: 0.8792
- Precision: 0.7692
- Recall: 0.4878


Gradient Boosting
Model performance for Training set
- Accuracy: 0.8843
- F1 score: 0.8708
- Precision: 0.8507
- Recall: 0.4761
----------------------------------
Model performance for Test set
- Accuracy: 0.8916
- F1 score: 0.8772
- Precision: 0.8372
- Recall: 0.4390


Logistic Regression
Model performance for Training set
- Accuracy: 0.8177
- F1 score: 0.7451
- Precision: 0.8913
- Recall: 0.0489
----------------------------------
Model performance for Test set
-