In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [38]:
df = pd.read_csv('customer_booking.csv', encoding='latin')

In [39]:
df.head()

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,5.52,0


Check for missing data

In [40]:
df.isnull().sum()

num_passengers           0
sales_channel            0
trip_type                0
purchase_lead            0
length_of_stay           0
flight_hour              0
flight_day               0
route                    0
booking_origin           0
wants_extra_baggage      0
wants_preferred_seat     0
wants_in_flight_meals    0
flight_duration          0
booking_complete         0
dtype: int64

In [41]:
df.dtypes

num_passengers             int64
sales_channel             object
trip_type                 object
purchase_lead              int64
length_of_stay             int64
flight_hour                int64
flight_day                object
route                     object
booking_origin            object
wants_extra_baggage        int64
wants_preferred_seat       int64
wants_in_flight_meals      int64
flight_duration          float64
booking_complete           int64
dtype: object

ENCODING CATEGORICAL VARIABLE (Encode columns with data type of objects)

In [42]:
def ConvertCat(output):
    columns = ['sales_channel', 'trip_type', 'flight_day', 'route', 
              'booking_origin']
    for col in columns :
        output[col] = LabelEncoder().fit_transform(output[col])
    return output

In [43]:
df = ConvertCat(df)

In [44]:
df

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,0,2,262,19,7,2,0,61,1,0,0,5.52,0
1,1,0,2,112,20,3,2,0,61,0,0,0,5.52,0
2,2,0,2,243,22,17,6,0,36,1,1,0,5.52,0
3,1,0,2,96,31,4,2,0,61,0,0,1,5.52,0
4,2,0,2,68,22,15,6,0,36,1,0,1,5.52,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,0,2,27,6,9,2,720,4,1,0,1,5.62,0
49996,1,0,2,111,6,4,3,720,4,0,0,0,5.62,0
49997,1,0,2,24,6,22,2,720,4,0,0,1,5.62,0
49998,1,0,2,15,6,11,1,720,4,1,0,1,5.62,0


SPLITTING THE DATA INTO X AND Y

In [45]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [46]:
print(X)

[[2.   0.   2.   ... 0.   0.   5.52]
 [1.   0.   2.   ... 0.   0.   5.52]
 [2.   0.   2.   ... 1.   0.   5.52]
 ...
 [1.   0.   2.   ... 0.   1.   5.62]
 [1.   0.   2.   ... 0.   1.   5.62]
 [1.   0.   2.   ... 1.   0.   5.62]]


In [47]:
print(y)

[0 0 0 ... 0 0 0]


SPLITTING THE DATA INTO TRAINING AND TEST SETS

In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1, random_state=2)

In [49]:
print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

45000
5000
45000
5000


FEATURE SCALING

In [50]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

df2 = pd.DataFrame(X_train, columns = ['num_passengers', 'sales_channel', 'trip_type','purchase_lead','length_of_stay','flight_hour','flight_day', 'route', 'booking_origin','wants_extra_baggage','wants_preferred_seat', 'wants_in_flight_meals','flight_duration'] )
scaled_features = df2

# col_names = ['num_passengers', 'sales_channel', 'trip_type','purchase_lead','length_of_stay','flight_hour','flight_day', 'route', 'booking_origin','wants_extra_baggage','wants_preferred_seat', 'wants_in_flight_meals','flight_duration']
col_standard_names = ['purchase_lead', 'length_of_stay','flight_hour','flight_duration']
features = scaled_features[col_standard_names]
scaler = StandardScaler().fit_transform(features.values)
# features = scaler.transform(features)

In [51]:
df2[col_standard_names] = scaler
print(df2)

       num_passengers  sales_channel  trip_type  purchase_lead  \
0                 1.0            0.0        2.0      -0.529653   
1                 1.0            0.0        2.0      -0.374833   
2                 1.0            0.0        2.0      -0.430126   
3                 2.0            0.0        2.0      -0.695531   
4                 5.0            1.0        2.0      -0.861409   
...               ...            ...        ...            ...   
44995             1.0            0.0        2.0       0.034334   
44996             3.0            0.0        2.0       2.909558   
44997             2.0            0.0        2.0       0.343973   
44998             1.0            0.0        2.0      -0.562828   
44999             1.0            0.0        2.0      -0.817175   

       length_of_stay  flight_hour  flight_day  route  booking_origin  \
0            0.117982    -0.566712         3.0  770.0             4.0   
1            0.117982    -0.382186         3.0  621.0        

In [52]:
X_train = df2.to_numpy()

In [53]:
len(X_train)

45000

In [54]:
X_test

array([[1.  , 0.  , 2.  , ..., 1.  , 1.  , 5.62],
       [4.  , 0.  , 2.  , ..., 0.  , 0.  , 7.  ],
       [2.  , 0.  , 2.  , ..., 1.  , 1.  , 7.  ],
       ...,
       [1.  , 0.  , 2.  , ..., 1.  , 1.  , 6.62],
       [1.  , 0.  , 2.  , ..., 1.  , 1.  , 8.83],
       [1.  , 0.  , 2.  , ..., 0.  , 0.  , 8.58]])

In [55]:
type(X_test)

numpy.ndarray

In [56]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=10, random_state = 2)
regressor.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, random_state=2)

In [61]:
y_pred = regressor.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)),1))


[[0.7 0. ]
 [0.2 1. ]
 [0.2 0. ]
 ...
 [0.1 1. ]
 [0.8 0. ]
 [0.8 0. ]]


In [62]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

ValueError: Classification metrics can't handle a mix of binary and continuous targets