# Data Preparation and Quality Assessment

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
import os

# Make sure a 'data' folder exists in the current working directory.
# exist_ok=True keeps the call from raising when the folder is already there,
# so this cell can be re-run safely.
os.makedirs(name='data', exist_ok=True)


In [7]:

# Read the booking records from the local data folder, decoding the file as ISO-8859-1.
df = pd.read_csv(
    filepath_or_buffer="data/customer_booking.csv",
    encoding="ISO-8859-1",
)

# Preview the first rows.
df.head()

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,5.52,0


In [9]:
import numpy as np

In [11]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   num_passengers         50000 non-null  int64  
 1   sales_channel          50000 non-null  object 
 2   trip_type              50000 non-null  object 
 3   purchase_lead          50000 non-null  int64  
 4   length_of_stay         50000 non-null  int64  
 5   flight_hour            50000 non-null  int64  
 6   flight_day             50000 non-null  object 
 7   route                  50000 non-null  object 
 8   booking_origin         50000 non-null  object 
 9   wants_extra_baggage    50000 non-null  int64  
 10  wants_preferred_seat   50000 non-null  int64  
 11  wants_in_flight_meals  50000 non-null  int64  
 12  flight_duration        50000 non-null  float64
 13  booking_complete       50000 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.3+ 

In [13]:
# List the distinct labels in flight_day before encoding them.
df["flight_day"].unique()

array(['Sat', 'Wed', 'Thu', 'Mon', 'Sun', 'Tue', 'Fri'], dtype=object)

In [15]:
# Encode the weekday abbreviation as an integer: Monday = 1 through Sunday = 7.
mapping = {
    "Mon": 1,
    "Tue": 2,
    "Wed": 3,
    "Thu": 4,
    "Fri": 5,
    "Sat": 6,
    "Sun": 7,
}

# Only encode while the column still holds labels. Re-running this cell on the
# already-encoded integers would otherwise map every value to NaN.
if not pd.api.types.is_numeric_dtype(df["flight_day"]):
    flight_day_codes = df["flight_day"].map(mapping)

    # .map() turns any label missing from `mapping` into NaN; fail loudly instead
    # of letting NaNs flow silently into the model.
    unmapped_days = df.loc[flight_day_codes.isna(), "flight_day"].unique()
    if len(unmapped_days) > 0:
        raise ValueError(f"Unmapped flight_day values: {list(unmapped_days)}")

    df["flight_day"] = flight_day_codes.astype("int64")

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,num_passengers,purchase_lead,length_of_stay,flight_hour,flight_day,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,1.59124,84.94048,23.04456,9.06634,3.81442,0.66878,0.29696,0.42714,7.277561,0.14956
std,1.020165,90.451378,33.88767,5.41266,1.992792,0.470657,0.456923,0.494668,1.496863,0.356643
min,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4.67,0.0
25%,1.0,21.0,5.0,5.0,2.0,0.0,0.0,0.0,5.62,0.0
50%,1.0,51.0,17.0,9.0,4.0,1.0,0.0,0.0,7.57,0.0
75%,2.0,115.0,28.0,13.0,5.0,1.0,1.0,1.0,8.83,0.0
max,9.0,867.0,778.0,23.0,7.0,1.0,1.0,1.0,9.5,1.0


In [22]:
# Encode the trip type label as an integer code.
mapping = {
    'RoundTrip': 1,
    'CircleTrip': 2,
    'OneWay': 3,
}

# Only encode while the column still holds labels. Re-running this cell on the
# already-encoded integers would otherwise map every value to NaN.
if not pd.api.types.is_numeric_dtype(df["trip_type"]):
    trip_type_codes = df["trip_type"].map(mapping)

    # .map() turns any label missing from `mapping` into NaN; fail loudly instead
    # of letting NaNs flow silently into the model.
    unmapped_trip_types = df.loc[trip_type_codes.isna(), "trip_type"].unique()
    if len(unmapped_trip_types) > 0:
        raise ValueError(f"Unmapped trip_type values: {list(unmapped_trip_types)}")

    df["trip_type"] = trip_type_codes.astype("int64")

In [24]:
# Encode the sales channel label as an integer code.
mapping = {
    'Internet': 1,
    'Mobile': 2,
}

# Only encode while the column still holds labels. Re-running this cell on the
# already-encoded integers would otherwise map every value to NaN.
if not pd.api.types.is_numeric_dtype(df["sales_channel"]):
    sales_channel_codes = df["sales_channel"].map(mapping)

    # .map() turns any label missing from `mapping` into NaN; fail loudly instead
    # of letting NaNs flow silently into the model.
    unmapped_channels = df.loc[sales_channel_codes.isna(), "sales_channel"].unique()
    if len(unmapped_channels) > 0:
        raise ValueError(f"Unmapped sales_channel values: {list(unmapped_channels)}")

    df["sales_channel"] = sales_channel_codes.astype("int64")

In [26]:
# Label-encode booking_origin: every distinct value gets an integer code, starting
# from 0, in order of first appearance in the column. unique() is evaluated once.
booking_mapping = {
    origin: code for code, origin in enumerate(df["booking_origin"].unique())
}

# Vectorised dictionary lookup instead of calling a Python lambda for every row.
# Every value is a key of booking_mapping by construction, so nothing is left unmapped.
df["booking_origin"] = df["booking_origin"].map(booking_mapping)

In [30]:
# Count the distinct values in the route column (missing values are not counted).
num_unique_items = df["route"].nunique(dropna=True)
print(f"Number of unique items: {num_unique_items}")

Number of unique items: 799


In [34]:
# Label-encode route: every distinct value gets an integer code, starting from 0,
# in order of first appearance in the column. unique() is evaluated once.
route_mapping = {
    route: code for code, route in enumerate(df["route"].unique())
}

# Vectorised dictionary lookup instead of calling a Python lambda for every row.
# Every value is a key of route_mapping by construction, so nothing is left unmapped.
df["route"] = df["route"].map(route_mapping)

In [38]:
# Inspect the route column after it has been replaced by integer codes.
df["route"]

0          0
1          0
2          0
3          0
4          0
        ... 
49995    638
49996    638
49997    638
49998    638
49999    638
Name: route, Length: 50000, dtype: int64

In [44]:
# NOTE: disabled code, kept for reference only. It slices the route string
# (e.g. 'AKLDEL') into its first three and remaining characters. By this point
# df['route'] already holds integer codes, so enabling it here would fail; it
# would have to run before the route-encoding cell.
# # Add new columns for departure and arrival city
# df['departure_city'] = df['route'].apply(lambda x: x[:3])
# df['arrival_city'] = df['route'].apply(lambda x: x[3:])

In [53]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale purchase_lead. The double brackets select a one-column DataFrame,
# so the scaler receives 2-D input and returns an (n_rows, 1) array.
# NOTE(review): the scaler is fit on the full dataset, before the train/test split
# further down - confirm this is intended.
scaler = MinMaxScaler()
purchase_lead_scaled = scaler.fit(df[["purchase_lead"]]).transform(df[["purchase_lead"]])

In [55]:
# Collapse the scaler's (n_rows, 1) output to 1-D and write it back over the raw column.
df["purchase_lead"] = purchase_lead_scaled.flatten()

In [57]:
# Min-max scale length_of_stay with a fresh scaler; the double brackets select a
# one-column DataFrame, so the result is an (n_rows, 1) array.
scaler = MinMaxScaler()
length_of_stay_scaled = scaler.fit(df[["length_of_stay"]]).transform(df[["length_of_stay"]])

In [59]:
# The scaler returns an (n_rows, 1) array; flatten it to 1-D before assigning, the
# same way purchase_lead is handled, rather than relying on pandas to accept a
# 2-D array for a single column.
df["length_of_stay"] = np.array(length_of_stay_scaled).flatten()

In [61]:
# Min-max scale flight_hour with a fresh scaler; the double brackets select a
# one-column DataFrame, so the result is an (n_rows, 1) array.
scaler = MinMaxScaler()
flight_hour_scaled = scaler.fit(df[["flight_hour"]]).transform(df[["flight_hour"]])

In [65]:
# The scaler returns an (n_rows, 1) array; flatten it to 1-D before assigning, the
# same way purchase_lead is handled, rather than relying on pandas to accept a
# 2-D array for a single column.
df["flight_hour"] = np.array(flight_hour_scaled).flatten()

# Importing Random Forest Classification

In [67]:
# Display the frame after encoding and scaling, just before modelling.
df

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,1,1,0.302191,0.024422,0.304348,6,0,0,1,0,0,5.52,0
1,1,1,1,0.129181,0.025707,0.130435,6,0,0,0,0,0,5.52,0
2,2,1,1,0.280277,0.028278,0.739130,3,0,1,1,1,0,5.52,0
3,1,1,1,0.110727,0.039846,0.173913,6,0,0,0,0,1,5.52,0
4,2,1,1,0.078431,0.028278,0.652174,3,0,1,1,0,1,5.52,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,1,1,0.031142,0.007712,0.391304,6,638,36,1,0,1,5.62,0
49996,1,1,1,0.128028,0.007712,0.173913,7,638,36,0,0,0,5.62,0
49997,1,1,1,0.027682,0.007712,0.956522,6,638,36,0,0,1,5.62,0
49998,1,1,1,0.017301,0.007712,0.478261,1,638,36,1,0,1,5.62,0


In [69]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

# Dividing the data into features and label (x holds the features, y holds the label)

In [None]:
# The prepared data lives in `df`; no `customer_booking` variable is ever defined
# in this notebook, so referencing it raised NameError.
# x holds every column except the target; y is the target column booking_complete.
x = df.drop("booking_complete", axis=1)
y = df["booking_complete"]

# Splitting data between train and test data set

In [None]:
from sklearn.model_selection import train_test_split

# Seed NumPy's global random generator so the shuffle below is repeatable.
np.random.seed(42)

# Hold out 20% of the rows for testing; the remaining 80% are used for training.
splits = train_test_split(x, y, test_size=0.2)
x_train, x_test, y_train, y_test = splits

In [None]:
# Random forest with 1000 trees.
# random_state pins the forest's own randomness so the result no longer depends on
# whatever global NumPy RNG state earlier cells left behind; 42 matches the seed
# used for the train/test split.
# n_jobs=-1 builds the trees on all available CPU cores.
model = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)
model.fit(x_train, y_train)

# Mean accuracy on the held-out test set.
model.score(x_test, y_test)

In [None]:
# Predicted class labels for the held-out test features.
y_predicted = model.predict(X=x_test)
y_predicted

In [None]:


# Importance score of each feature according to the fitted forest, in the column order of x_train.
model.feature_importances_

In [None]:
# Wrap the predicted labels in a single-column DataFrame for tabular display.
y_pred = pd.DataFrame(data=y_predicted)
y_pred

Confusion matrix: the values on the diagonal are the correctly classified samples (here 8405 and 165), and the off-diagonal values are the misclassified samples.

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_predicted)
cm

In [None]:
%matplotlib inline
plt.figure(figsize=(10,7))
sns.heatmap(cm,annot=True)
plt.xlabel("predicted")
plt.ylabel("Truth")
plt.savefig('confusion.png',dpi=100)

# Accuracy of the model on the test data

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

# Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validation of the model on the training split; returns one score per fold.
cross_val_score(estimator=model, X=x_train, y=y_train, cv=3)

# Classification report

In [None]:
from sklearn.metrics import classification_report

# classification_report takes (y_true, y_pred) in that order. Passing the
# predictions first swaps the roles of truth and prediction, which exchanges the
# precision and recall columns and reports the wrong support counts.
report = classification_report(y_test, y_pred)

# print() renders the line breaks; a bare string would be shown with literal \n.
print(report)

In [None]:
import pandas as pd

# Same report as a nested dictionary so it can be displayed as a table.
# Argument order is (y_true, y_pred); swapping them exchanges precision and recall.
report_dict = classification_report(y_test, y_pred, output_dict=True)

# Store the table under its own name: assigning it to `df` would overwrite the
# prepared booking data that the rest of the notebook relies on.
report_df = pd.DataFrame(report_dict).transpose()
report_df

In [None]:
# Feature names must line up one-to-one with model.feature_importances_, so take
# them from the feature matrix x (the target column is excluded). The previously
# used `customer_booking` is not defined anywhere in this notebook.
features = x.columns
importances = model.feature_importances_

# Sort ascending so the most important feature ends up at the top of the bar chart.
indices = np.argsort(importances)

plt.title("feature importance")
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel("relative importance")
# plt.show must be called; the bare attribute reference did nothing.
plt.show()