In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('customer_booking.csv', encoding='latin')

In [3]:
df.head()

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,5.52,0


Check for missing data

In [4]:
df.isnull().sum()

num_passengers           0
sales_channel            0
trip_type                0
purchase_lead            0
length_of_stay           0
flight_hour              0
flight_day               0
route                    0
booking_origin           0
wants_extra_baggage      0
wants_preferred_seat     0
wants_in_flight_meals    0
flight_duration          0
booking_complete         0
dtype: int64

In [5]:
df.dtypes

num_passengers             int64
sales_channel             object
trip_type                 object
purchase_lead              int64
length_of_stay             int64
flight_hour                int64
flight_day                object
route                     object
booking_origin            object
wants_extra_baggage        int64
wants_preferred_seat       int64
wants_in_flight_meals      int64
flight_duration          float64
booking_complete           int64
dtype: object

ENCODING CATEGORICAL VARIABLE (Encode columns with data type of objects)

In [6]:
def ConvertCat(output):
    columns = ['sales_channel', 'trip_type', 'flight_day', 'route', 
              'booking_origin']
    for col in columns :
        output[col] = LabelEncoder().fit_transform(output[col])
    return output

In [7]:
df = ConvertCat(df)

In [8]:
df

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,0,2,262,19,7,2,0,61,1,0,0,5.52,0
1,1,0,2,112,20,3,2,0,61,0,0,0,5.52,0
2,2,0,2,243,22,17,6,0,36,1,1,0,5.52,0
3,1,0,2,96,31,4,2,0,61,0,0,1,5.52,0
4,2,0,2,68,22,15,6,0,36,1,0,1,5.52,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,0,2,27,6,9,2,720,4,1,0,1,5.62,0
49996,1,0,2,111,6,4,3,720,4,0,0,0,5.62,0
49997,1,0,2,24,6,22,2,720,4,0,0,1,5.62,0
49998,1,0,2,15,6,11,1,720,4,1,0,1,5.62,0


SPLITTING THE DATA INTO X AND Y

In [9]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [10]:
print(X)

[[2.   0.   2.   ... 0.   0.   5.52]
 [1.   0.   2.   ... 0.   0.   5.52]
 [2.   0.   2.   ... 1.   0.   5.52]
 ...
 [1.   0.   2.   ... 0.   1.   5.62]
 [1.   0.   2.   ... 0.   1.   5.62]
 [1.   0.   2.   ... 1.   0.   5.62]]


In [11]:
print(y)

[0 0 0 ... 0 0 0]


SPLITTING THE DATA INTO TRAINING AND TEST SETS

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=2)

In [13]:
print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

40000
10000
40000
10000


FEATURE SCALING

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

df2 = pd.DataFrame(X_train, columns = ['num_passengers', 'sales_channel', 'trip_type','purchase_lead','length_of_stay','flight_hour','flight_day', 'route', 'booking_origin','wants_extra_baggage','wants_preferred_seat', 'wants_in_flight_meals','flight_duration'] )
scaled_features = df2

# col_names = ['num_passengers', 'sales_channel', 'trip_type','purchase_lead','length_of_stay','flight_hour','flight_day', 'route', 'booking_origin','wants_extra_baggage','wants_preferred_seat', 'wants_in_flight_meals','flight_duration']
col_standard_names = ['purchase_lead', 'length_of_stay','flight_hour','flight_duration']
features = scaled_features[col_standard_names]
scaler = StandardScaler().fit_transform(features.values)
# features = scaler.transform(features)

In [15]:
df2[col_standard_names] = scaler
print(df2)

       num_passengers  sales_channel  trip_type  purchase_lead  \
0                 1.0            0.0        2.0       1.139251   
1                 1.0            0.0        2.0      -0.672799   
2                 1.0            1.0        2.0      -0.728045   
3                 1.0            1.0        2.0       1.603313   
4                 1.0            1.0        2.0      -0.838536   
...               ...            ...        ...            ...   
39995             1.0            0.0        2.0       0.034342   
39996             3.0            0.0        2.0       2.907105   
39997             2.0            0.0        2.0       0.343717   
39998             1.0            0.0        2.0      -0.562308   
39999             1.0            0.0        2.0      -0.816437   

       length_of_stay  flight_hour  flight_day  route  booking_origin  \
0           -0.028010    -1.118072         3.0  154.0             4.0   
1            1.586692     1.834920         2.0  693.0        

In [16]:
X_train = df2.to_numpy()

In [17]:
len(X_train)

40000

In [18]:
X_test

array([[1.  , 0.  , 2.  , ..., 1.  , 1.  , 5.62],
       [4.  , 0.  , 2.  , ..., 0.  , 0.  , 7.  ],
       [2.  , 0.  , 2.  , ..., 1.  , 1.  , 7.  ],
       ...,
       [1.  , 0.  , 2.  , ..., 0.  , 0.  , 8.83],
       [2.  , 1.  , 2.  , ..., 0.  , 0.  , 8.83],
       [1.  , 0.  , 2.  , ..., 0.  , 1.  , 8.83]])

In [19]:
type(X_test)

numpy.ndarray

In [20]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=100, random_state = 0)
regressor.fit(X_train, y_train)

RandomForestRegressor(random_state=0)

In [21]:
y_pred = np.round(regressor.predict(X_test))
print(y_pred)

[1. 0. 0. ... 0. 1. 0.]


In [22]:
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)),1))

[[1. 0.]
 [0. 1.]
 [0. 0.]
 ...
 [0. 0.]
 [1. 0.]
 [0. 0.]]


In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import metrics
print('Confusion marix = ')
print(metrics.confusion_matrix(y_test, y_pred))
print('Accuracy score = ', round(accuracy_score(y_test, y_pred),2))

Confusion marix = 
[[5559 2965]
 [1074  402]]
Accuracy score =  0.6
