In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import streamlit as st
from pycaret.regression import *

In [3]:
# Module-level slot for the LabelEncoder instance; it starts empty and is
# (re)assigned every time create_label_encoding() runs.
LABEL_ENCODER = None

def remove_null(data):
    """Drop every row of ``data`` that contains at least one missing value.

    The frame is modified in place; the same object is returned so the call
    can also be used as an expression.
    """
    data.dropna(axis=0, how='any', inplace=True)
    return data

def convert_dates_to_one_format(data, dayfirst=True):
    """Rewrite the 'Travel Date' column as ``YYYY-MM-DD`` strings, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a 'Travel Date' column of date strings.
    dayfirst : bool, default True
        Forwarded to ``pd.to_datetime``. The dataset stores dates as
        day-month-year (e.g. ``30-07-2021``), so ambiguous values such as
        ``08-12-2021`` must be read as 8 December, not 12 August.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with 'Travel Date' replaced.

    Values that cannot be parsed are coerced to missing (``errors='coerce'``)
    rather than raising.
    """
    parsed = pd.to_datetime(data['Travel Date'], errors='coerce', dayfirst=dayfirst)
    data['Travel Date'] = parsed.dt.strftime('%Y-%m-%d')
    return data

def replace_space_with_underscore(name):
    """Return ``name`` with each space character swapped for an underscore."""
    pieces = name.split(' ')
    return '_'.join(pieces)

def create_label_encoding(data):
    """Integer-encode every object-dtype column of ``data`` in place.

    Each object column is fitted and transformed with the module-level
    ``LABEL_ENCODER``. The classes learned for that column are then written
    to ``classes_<column>.npy``, with spaces in the column name replaced by
    underscores. Nothing is returned.

    NOTE(review): a single encoder instance is refitted for each column, so
    after the call ``LABEL_ENCODER`` only reflects the last column processed.
    Every call also rewrites the ``classes_*.npy`` files for the columns it
    encodes.
    """
    global LABEL_ENCODER
    object_columns = data.select_dtypes(include=['object']).columns
    LABEL_ENCODER = LabelEncoder()
    for column in object_columns:
        encoded_values = LABEL_ENCODER.fit_transform(data[column])
        data[column] = encoded_values
        # Persist the fitted classes so the integer codes can be mapped back later.
        classes_file = 'classes_{0}.npy'.format(replace_space_with_underscore(column))
        np.save(classes_file, LABEL_ENCODER.classes_, allow_pickle=True)

def change_int32_to_int64(data):
    """Cast every int64 column of ``data`` to int32, in place.

    Columns of any other dtype are left untouched and nothing is returned.

    NOTE(review): the function name reads "int32 to int64", but the code
    narrows int64 columns to int32. Confirm which direction is intended
    before relying on the name.
    """
    int64_columns = [name for name in data.columns if data[name].dtype == 'int64']
    for name in int64_columns:
        data[name] = data[name].astype('int32')

def data_preprocessing(data):
    """Run the whole cleaning pipeline on ``data``, mutating it in place.

    Steps, in order: drop the 'Uniq Id' column, drop rows with missing
    values, cast int64 columns to int32, normalise 'Travel Date', and
    label-encode the object columns. Nothing is returned.

    NOTE(review): null removal runs before date parsing, and the date step
    coerces unparseable values to missing. Such values would therefore reach
    the label encoder; confirm whether the order should be swapped.
    """
    # The identifier column is dropped first; it must be present in the frame.
    data.drop(columns=['Uniq Id'], inplace=True)
    pipeline_steps = (
        remove_null,
        change_int32_to_int64,
        convert_dates_to_one_format,
        create_label_encoding,
    )
    for apply_step in pipeline_steps:
        apply_step(data)

def find_best_models(train, session_id=None):
    """Initialise a PyCaret regression experiment and rank the candidate models.

    Parameters
    ----------
    train : pandas.DataFrame
        Preprocessed training frame containing the "Per Person Price" target.
    session_id : int, optional
        Seed forwarded to PyCaret's ``setup``. With the default ``None`` the
        call behaves as before and PyCaret chooses its own seed; pass an
        integer to make the comparison repeatable between runs.

    Returns
    -------
    The value returned by ``compare_models()``.
    """
    # setup() is called for its side effect of configuring the experiment;
    # its return value is not needed here.
    setup(data=train, target="Per Person Price", silent=True, session_id=session_id)
    return compare_models()

def create_best_model():
    """Create and return a model through PyCaret's ``create_model('rf')``.

    NOTE(review): presumably requires that the experiment was already
    initialised (see find_best_models) - confirm against the PyCaret docs.
    """
    # 'rf' = random forest, the estimator picked from the find_best_models() ranking.
    return create_model('rf')

In [5]:
# Load the training data. A forward slash is used because '\T' is not a valid
# string escape and a backslash separator only resolves on Windows; the
# forward-slash form works on every platform.
train = pd.read_csv('dataset/Train.csv')
display(train)

Unnamed: 0,Uniq Id,Package Name,Package Type,Destination,Itinerary,Places Covered,Travel Date,Hotel Details,Start City,Airline,Flight Stops,Meals,Sightseeing Places Covered,Cancellation Rules,Per Person Price
0,e788ab76d9d8cf1e6ed2f139645ca5d1,Best of Shimla and Manali Holiday from Delhi,Standard,New Delhi|Shimla|Manali|Chandigarh,1N New Delhi . 2N Shimla . 2N Manali . 1N Chan...,New Delhi|Shimla|Manali|Chandigarh,30-07-2021,Not Available,Mumbai,Not Available,2,3,Not Available,Not Available,11509.0
1,178f892630ce3e335a5a41d5d83937fd,Kashmir Valley vacation,Premium,Srinagar|Pahalgam|Srinagar,1N Srinagar . 2N Pahalgam . 1N Srinagar,Srinagar|Pahalgam|Srinagar,08-12-2021,The Orchard Retreat & Spa:4.6|WelcomHotel Pine...,New Delhi,IndiGo|IndiGo,0,5,Dal Lake | Avantipura Ruins | Mughal Gardens ...,Cancellation any time after making the 1st pay...,22485.5
2,f060f2954840503cc2fdaf495357b7df,Might of Mewar- Udaipur and Chittorgarh,Luxury,Udaipur|Chittorgarh,2N Udaipur . 1N Chittorgarh,Udaipur|Chittorgarh,26-04-2021,The Ananta:4.4|juSTa Lake Nahargarh Palace:4,New Delhi,IndiGo,0,4,Lake Pichola | Jag Mandir Palace | Saheliyon ...,Cancellation any time after making the 1st pay...,12421.5
3,32a19a6c171e67448f2346da46c619dc,Colorful Kerala ( Romantic Getaway ),Premium,Munnar|Kumarakom|Allepey|Kovalam and Poovar,2N Munnar . 1N Kumarakom . 1N Allepey . 2N Kov...,Munnar|Kumarakom|Allepey|Kovalam and Poovar,27-08-2021,Elixir Hills Suites Resort & Spa-MMT Holidays ...,New Delhi,IndiGo,0,5,Mattupetty Dam | Echo Point | Tata Tea Museum...,Cancellation any time after making the 1st pay...,35967.0
4,107b068aa0ca03bc6248966f594d105f,A Week In Bangkok & Pattaya,Premium,Pattaya|Bangkok,4N Pattaya . 3N Bangkok,Pattaya|Bangkok,12-12-2021,Dusit Thani Pattaya - MMT Special:4.5|Amari Wa...,New Delhi,Spicejet|Go Air,0,5,"Coral Island Tour with Indian Lunch, Join Spe...",Cancellation any time after making the 1st pay...,25584.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20995,ae3ae6ca299180d470145a0e154cbf2b,Shimla & Manali Retreat from Delhi,Deluxe,Shimla|Manali|New Delhi,2N Shimla . 3N Manali . 1N New Delhi,Shimla|Manali|New Delhi,15-04-2021,Summit Thistle Villa Resort & Spa:Four|Snow Va...,Mumbai,Not Available,1,3,Mall road | Indira Bunglow | Himalayan Zoo | ...,Cancellation any time after making the 1st pay...,16168.5
20996,6ccc538e3597a909f609fcb2eba9be5e,Bedazzling Kashmir Holiday,Luxury,Srinagar|Gulmarg|Pahalgam|Srinagar,1N Srinagar . 1N Gulmarg . 2N Pahalgam . 3N Sr...,Srinagar|Gulmarg|Pahalgam|Srinagar,30-08-2021,The Orchard Retreat & Spa:4.6|The Khyber Himal...,New Delhi,Vistara|Vistara,0,4,Dal Lake | Gondola Point | Avantipura Ruins |...,Cancellation any time after making the 1st pay...,33770.5
20997,d3b412fa94ce5503204024c6db2f60b7,A relaxing week in Colombo (Bentota and Galle ...,Deluxe,Colombo,5N Colombo,Colombo,27-02-2021,Mount Lavinia Hotel-MMT HOLIDAYS SPECIAL:4.0,New Delhi,IndiGo|IndiGo|IndiGo|IndiGo,1,3,Sigiriya Excursion | Nuwara Eliya Excursion,Cancellation any time after making the 1st pay...,29964.0
20998,51d843fb852ec7abde8299fe31111bd5,Awesome North East with Pelling,Deluxe,Darjeeling|Pelling|Gangtok,2N Darjeeling . 2N Pelling . 2N Gangtok,Darjeeling|Pelling|Gangtok,13-01-2022,Ramada by Wyndham Darjeeling Gandhi Road:4.4|T...,New Delhi,IndiGo,1,3,Tiger Hill | Himalayan Mountaineering Institu...,Cancellation any time after making the 1st pay...,28145.5


In [6]:
# Show the dtype pandas assigned to each column of the training frame.
types = train.dtypes
print(types)

Uniq Id                        object
Package Name                   object
Package Type                   object
Destination                    object
Itinerary                      object
Places Covered                 object
Travel Date                    object
Hotel Details                  object
Start City                     object
Airline                        object
Flight Stops                    int64
Meals                           int64
Sightseeing Places Covered     object
Cancellation Rules             object
Per Person Price              float64
dtype: object


In [5]:
# Load Test.csv as the validation frame. A forward slash is used because '\T'
# is not a valid string escape and a backslash separator only resolves on
# Windows; the forward-slash form works on every platform.
validation = pd.read_csv('dataset/Test.csv')

In [6]:
# Clean both frames in place. data_preprocessing() drops 'Uniq Id', so this
# cell is meant to run once per freshly loaded frame.
# NOTE(review): each call refits the label encoders and rewrites the
# classes_*.npy files. The validation call therefore presumably replaces the
# files written for train, and its integer codes need not match train's.
# Confirm this is intended.
data_preprocessing(train)
data_preprocessing(validation)

In [7]:
# Set up the PyCaret experiment on the preprocessed training frame and keep
# whatever compare_models() returned as `best`.
best = find_best_models(train)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,3580.1378,40102586.8843,6307.5632,0.7081,0.255,0.1948,1.603
lightgbm,Light Gradient Boosting Machine,4048.1617,42818570.1881,6520.4599,0.6887,0.2788,0.2273,0.091
et,Extra Trees Regressor,3735.7401,43120615.2324,6540.8535,0.686,0.2652,0.2037,1.262
gbr,Gradient Boosting Regressor,5052.491,61336738.7032,7818.7912,0.5532,0.3413,0.2951,0.804
dt,Decision Tree Regressor,4607.2288,70763591.0109,8396.4619,0.4817,0.3356,0.2402,0.059
knn,K Neighbors Regressor,5745.1354,81123768.0,8996.9377,0.4071,0.4108,0.3506,0.041
lr,Linear Regression,7190.2584,112998247.2039,10620.9904,0.1761,0.4775,0.4493,1.041
lasso,Lasso Regression,7190.2552,112998408.2735,10621.0031,0.1761,0.4775,0.4493,0.026
ridge,Ridge Regression,7190.2525,112998211.9705,10620.9895,0.1761,0.4775,0.4493,0.018
br,Bayesian Ridge,7190.5334,113009923.228,10621.5874,0.176,0.4773,0.4494,0.019


2023-01-05 17:42:30.836 INFO    logs: create_model_container: 18
2023-01-05 17:42:30.838 INFO    logs: master_model_container: 18
2023-01-05 17:42:30.839 INFO    logs: display_container: 2
2023-01-05 17:42:30.841 INFO    logs: RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=5503, verbose=0, warm_start=False)
2023-01-05 17:42:30.842 INFO    logs: compare_models() succesfully completed......................................


In [8]:
# Keep the value returned by PyCaret's pull() (presumably the latest results
# table - confirm).
results = pull()
print(best)
# Persist `best` as random_forest_model.pkl in the working directory.
# NOTE(review): the file name is hard-coded to "random_forest" although `best`
# is whatever compare_models() returned.
save_model(best, 'random_forest_model')

2023-01-05 17:44:56.977 INFO    logs: Initializing save_model()
2023-01-05 17:44:56.985 INFO    logs: save_model(model=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=5503, verbose=0, warm_start=False), model_name=random_forest_model, prep_pipe_=Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=False, features_todrop=[],
                                      id_columns=[], ml_usecase='regression',
                                      numerical_features=[],
            

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=5503, verbose=0, warm_start=False)


2023-01-05 17:44:57.249 INFO    logs: random_forest_model.pkl saved in current working directory
2023-01-05 17:44:57.253 INFO    logs: Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=False, features_todrop=[],
                                      id_columns=[], ml_usecase='regression',
                                      numerical_features=[],
                                      target='Per Person Price',
                                      time_features=[])),
                ('imputer',
                 Simple_Imputer(categorical_strategy='not_available',
                                fill_value_categorical=None,
                                fill_value_numerical=None,
                                numeric_...
                 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                       criterion='mse', max_depth=None,
        

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=False, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[],
                                       target='Per Person Price',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_...
                  RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                        criterion='mse', max_depth=None,
                                        max_features='auto', max_leaf_nodes=None,
                                        max_sa