# Data Preprocessing
# 1. Data cleaning

In [None]:
import numpy as np 
import pandas as pd 
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load the hotel bookings dataset from the working directory and preview the first rows.
df = pd.read_csv("hotel_bookings.csv") 
df.head()

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Plot missingno's matrix view of df to see where values are missing.
msno.matrix(df)

In [None]:
# Quick look at the market_segment column.
df['market_segment']

In [None]:
def missing_percentage(df):
    """Summarise the missing values of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns: 'Total' (number of
        null cells) and 'Percent' (100 * Total / len(df)). Rows are ordered
        by 'Total', largest first; ties keep the column order of ``df``.
    """
    # Count nulls once; both output columns are derived from the same counts.
    total = df.isnull().sum()
    summary = pd.DataFrame({'Total': total, 'Percent': 100 * total / len(df)})

    # Sort the assembled table explicitly instead of relying on index
    # alignment; mergesort is stable, so the result is deterministic.
    return summary.sort_values('Total', ascending=False, kind='mergesort')

In [None]:
# Visualise the share of missing values per column.

missing = missing_percentage(df)

# Bar chart of the 'Percent' column, one bar per dataset column.
fig, ax = plt.subplots(figsize=(20, 5))
sns.barplot(data=missing, x=missing.index, y='Percent', palette='Reds_r', ax=ax)
plt.xticks(rotation=90)

# The same summary as a colour-graded table (transposed so columns run left to right).
display(missing.T.style.background_gradient(cmap='Reds', axis=1))

In [None]:
# `company` is missing in about 94% of the rows, so the column is dropped.
# `agent` is missing in about 13% of the rows; per the dataset metadata it is
# the ID of the travel agency that made the booking, which is not relevant
# here, so it is dropped as well.
df = df.drop(columns=['company', 'agent'])

# The remaining gaps (`children`, `country`) affect comparatively few rows,
# so those rows are removed instead of dropping the columns.
df = df.dropna(subset=["children", "country"])

In [None]:
# Confirm that no missing values remain in the dataset.
df.isnull().sum()

In [None]:
# Count fully duplicated rows.
df.duplicated().sum()

In [None]:
# Remove the duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [None]:
# Confirm that no duplicated rows remain; this completes the data cleaning step.
df.duplicated().sum()

# 2. Data Integration

In [None]:
# Data integration (feature encoding)
# `hotel` and `arrival_date_month` are encoded by hand; the remaining
# categorical columns are left for get_dummies / LabelEncoder.
hotel_codes = {'Resort Hotel': 0, 'City Hotel': 1}

# Calendar order, so each month maps to its month number (January -> 1 ... December -> 12).
month_names = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
               'August', 'September', 'October', 'November', 'December']
month_numbers = {name: number for number, name in enumerate(month_names, start=1)}

df['hotel'] = df['hotel'].map(hotel_codes)
df['arrival_date_month'] = df['arrival_date_month'].map(month_numbers)

In [None]:
def family(data):
    """Flag a booking as a family booking.

    ``data`` is a row-like mapping with 'adults', 'children' and 'babies'
    entries. Returns 1 when there is at least one adult together with at
    least one child or at least one baby, otherwise 0.
    """
    has_adult = data['adults'] > 0
    has_child = data['children'] > 0
    if has_adult and has_child:
        return 1

    # Only reached when the adult-plus-child test failed.
    has_baby = data['babies'] > 0
    return 1 if has_adult and has_baby else 0

In [None]:
def feature(df):
    """Add derived guest and stay columns to ``df`` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'adults', 'children', 'babies',
        'stays_in_weekend_nights' and 'stays_in_week_nights'.

    Returns
    -------
    pandas.DataFrame
        The same object, with three added columns:
        'is_family' (1 when adults > 0 and children > 0 or babies > 0, else 0,
        the same rule as ``family``), 'total_customer' (adults + children +
        babies) and 'total_nights' (weekend nights + week nights).
    """
    # Vectorized form of the per-row `family` rule: avoids a Python-level call
    # per row and also works when the frame has no rows.
    has_adult = df["adults"] > 0
    has_minor = (df["children"] > 0) | (df["babies"] > 0)
    df["is_family"] = (has_adult & has_minor).astype("int64")

    df["total_customer"] = df["adults"] + df["children"] + df["babies"]
    df["total_nights"] = df["stays_in_weekend_nights"] + df["stays_in_week_nights"]
    return df

df = feature(df)

# adults / babies / children are summarised by the new is_family and
# total_customer features, so they are dropped.
# NOTE(review): deposit_type and reservation_status_date are dropped here too,
# although the new features do not contain their information; confirm this is intended.
# The stays_in_*_nights columns are kept: it is not clear yet whether they or
# total_nights are the more important feature.
redundant_columns = ['adults', 'babies', 'children', 'deposit_type', 'reservation_status_date']
df = df.drop(columns=redundant_columns)

In [None]:
# Correlation analysis (heat map) of the numeric columns.
plt.figure(figsize = (10,10))
# df still holds string columns at this point; DataFrame.corr() raises on them
# in pandas >= 2.0, so restrict the frame to numeric dtypes explicitly.
correlation_df = df.select_dtypes(include=np.number).corr()
# Hide the diagonal and the upper triangle (they mirror the lower triangle).
# A boolean mask is used so that cells whose correlation is exactly 0 are hidden too.
mask = np.triu(np.ones_like(correlation_df, dtype=bool))
sns.heatmap(correlation_df,mask = mask,cmap='coolwarm',annot=True,square = True,fmt='.1f',linewidths = 1)
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the categorical columns on a copy of df so they can take part
# in the correlation analysis. Note that cor_df is also the frame the later
# modelling cell draws X and y from.
cor_df = df.copy()
le = LabelEncoder()
for column in (
    'meal',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'customer_type',
    'reservation_status',
    'market_segment',
):
    # The encoder is refitted for every column; codes are independent per column.
    cor_df[column] = le.fit_transform(cor_df[column])

In [None]:
# Correlation analysis (heat map) including the label-encoded columns.
plt.figure(figsize = (15,15))
# cor_df still contains string columns that were not encoded (e.g. country);
# DataFrame.corr() raises on them in pandas >= 2.0, so keep numeric dtypes only.
correlation_df = cor_df.select_dtypes(include=np.number).corr()
# Hide the diagonal and the upper triangle (they mirror the lower triangle).
# A boolean mask is used so that cells whose correlation is exactly 0 are hidden too.
mask = np.triu(np.ones_like(correlation_df, dtype=bool))
sns.heatmap(correlation_df,mask = mask,cmap='coolwarm',annot=True,square = True,fmt='.1f',linewidths = 1)
plt.show()

In [None]:
# Correlation of every column with is_canceled, sorted in ascending order.
correlation_df_sorted = correlation_df["is_canceled"].sort_values()

In [None]:
# Keep the columns whose absolute correlation with the target exceeds 0.1.
target = 'is_canceled'
is_relevant = correlation_df_sorted.abs() > 0.1
important_feature_series = correlation_df_sorted[is_relevant]

# The target itself and reservation_status are excluded from the feature list.
# NOTE(review): reservation_status presumably encodes the booking outcome and
# would leak the label; confirm against the dataset description.
excluded = [target, 'reservation_status']
important_features = important_feature_series.index.drop(excluded)
important_features

In [None]:
# Final modelling data: the selected features as X and the target as y,
# with 20% of the rows held out for testing (fixed seed).
y = cor_df[target]
X = cor_df[important_features]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

In [None]:
from sklearn.metrics import roc_curve, auc
# Shared helper for plotting ROC curves, so every model cell can reuse it instead of repeating the plotting code.
def get_roc_curve(y_test, y_prob):
    """Plot a ROC curve on a new figure and show its AUC in the legend.

    y_test : true labels, passed to ``roc_curve`` as the ground truth.
    y_prob : scores for the positive class, passed to ``roc_curve``.

    Nothing is returned; the curve is drawn with matplotlib.
    """
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    auc_label = 'AUC = %0.2f' % roc_auc

    plt.figure(figsize=(10, 10))
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, color='red', label=auc_label)
    plt.legend(loc='lower right')
    # Dashed diagonal (TPR == FPR) as a visual reference line.
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.axis('tight')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

# 3. Model Evaluation
## 3.1 Decision Tree

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
# TODO: use graphviz to visualize the tree
# TODO: tune the hyperparameters with a grid search (presumably sklearn's GridSearchCV)
# TODO: if this cell is too slow, drop the last line of the report
#       (cross_val_score(clf, X, y, cv=10, scoring='f1').mean()), which computes
#       the 10-fold cross-validated F1-score.
clf = DecisionTreeClassifier(max_depth=20)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Second column of predict_proba: the score used for the ROC curve.
y_pos_prob = clf.predict_proba(X_test)[:, 1]
print("Decision Tree Model:")
print(
    f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}\n"
    f"Accuracy Score:{accuracy_score(y_test, y_pred)}\n"
    f"Recall score:{recall_score(y_test, y_pred)}\n"
    f"Precision Score:{precision_score(y_test, y_pred)}\n"
    f"F1-score:{f1_score(y_test, y_pred)}\n"
    f"Cross-validation F1-score:{cross_val_score(clf, X, y, cv=10, scoring='f1').mean()}"
)
get_roc_curve(y_test, y_pos_prob)

## 3.2 SVM

In [None]:
from sklearn.svm import SVC
# TODO: tune the hyperparameters with a grid search (presumably sklearn's GridSearchCV)
# TODO: unlike the other model cells, no cross-validated F1-score
#       (cross_val_score(clf, X, y, cv=10, scoring='f1').mean()) is reported here;
#       add it only if the runtime allows.
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# SVC only provides predict_proba when constructed with probability=True (which
# adds an internal cross-validation). The ROC curve only needs a ranking score,
# so the decision_function output is used instead. The variable keeps the name
# used by the other model cells, but it holds decision scores, not probabilities.
y_pos_prob = clf.decision_function(X_test)
print("SVM Model:")
print(
    f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}\n"
    f"Accuracy Score:{accuracy_score(y_test, y_pred)}\n"
    f"Recall score:{recall_score(y_test, y_pred)}\n"
    f"Precision Score:{precision_score(y_test, y_pred)}\n"
    f"F1-score:{f1_score(y_test, y_pred)}"
)
get_roc_curve(y_test, y_pos_prob)

## 3.3 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# TODO: tune the hyperparameters with a grid search (presumably sklearn's GridSearchCV)
# TODO: if this cell is too slow, drop the last line of the report
#       (cross_val_score(clf, X, y, cv=10, scoring='f1').mean()), which computes
#       the 10-fold cross-validated F1-score.
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Second column of predict_proba: the score used for the ROC curve.
y_pos_prob = clf.predict_proba(X_test)[:, 1]
print("Random Forest Model:")
# The scorer name must be 'f1'; 'f1-score' is not a registered scoring string
# and makes cross_val_score raise.
print(
    f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}\n"
    f"Accuracy Score:{accuracy_score(y_test, y_pred)}\n"
    f"Recall score:{recall_score(y_test, y_pred)}\n"
    f"Precision Score:{precision_score(y_test, y_pred)}\n"
    f"F1-score:{f1_score(y_test, y_pred)}\n"
    f"Cross-validation F1-score:{cross_val_score(clf, X, y, cv=10, scoring='f1').mean()}"
)
get_roc_curve(y_test, y_pos_prob)

## 3.4 XGBoost

In [None]:
from xgboost import XGBClassifier
# TODO: tune the hyperparameters with a grid search (presumably sklearn's GridSearchCV)
# TODO: if this cell is too slow, drop the last line of the report
#       (cross_val_score(clf, X, y, cv=10, scoring='f1').mean()), which computes
#       the 10-fold cross-validated F1-score.
clf = XGBClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Second column of predict_proba: the score used for the ROC curve.
y_pos_prob = clf.predict_proba(X_test)[:, 1]
# Header names the model actually evaluated in this cell.
print("XGBoost Model:")
# The scorer name must be 'f1'; 'f1-score' is not a registered scoring string
# and makes cross_val_score raise.
print(
    f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}\n"
    f"Accuracy Score:{accuracy_score(y_test, y_pred)}\n"
    f"Recall score:{recall_score(y_test, y_pred)}\n"
    f"Precision Score:{precision_score(y_test, y_pred)}\n"
    f"F1-score:{f1_score(y_test, y_pred)}\n"
    f"Cross-validation F1-score:{cross_val_score(clf, X, y, cv=10, scoring='f1').mean()}"
)
get_roc_curve(y_test, y_pos_prob)