In [None]:
## Content for loading data

In [1]:
import pandas as pd
import numpy as np

def load_data(path):
    df = pd.read_excel(path)
    return df

In [None]:
## Content for preprocessing.py

In [2]:
def convert_duration_to_minutes(time):
    '''
    This function converts duration in h m to minutes:
    input : hh:mm ,hh, mm
    return:
        min
    '''
    if len(time.split(' ')) >1 :
        hh,mm = time.split(' ')
        hh,mm = int(hh[:-1]),int(mm[:-1])
        duration = hh*60+mm
    else:
        if 'h' in time:
            duration = int(time[:-1])*60
        else:
            duration= int(time[:-1])
            
    return duration


def create_preprocess_date_time(df):
    '''
    This Function preprocess date_of_journey and duration to create departure and arrival date time.
    '''
    df['DepartureDateTime'] = df['Date_of_Journey'] + " "+ df['Dep_Time']
    df['DepartureDateTime'] = pd.to_datetime(df['DepartureDateTime'],infer_datetime_format=True)
    df['Duration_min'] = df['Duration'].apply(lambda x: convert_duration_to_minutes(x))
    df['Duration_timedelta'] = pd.to_timedelta(df['Duration_min'], unit='m')
    df["ArrivalDateTime"] = df['DepartureDateTime'] + df['Duration_timedelta']
    return df

stops_dict = {
    'non-stop':0,
    '2 stops':2,
    '1 stop':1,
    '3 stops':3,
    '4 stops':4
}

## Running the process of prediction is also referred as inference:
def sanity_check(df,train=True):
    '''
    This function performs sanity check on the airline data.
    inputs:
        df: dataframe that we need to perform sanity check
        train: This is used for process of training and inference.
            train is having default value of True and can be set as False if we are running inference.
            
        ## process for training    
        sanity_check(df)
        # process of prediction
        sanity_check(df,train=False)
    returns:
        df
    '''
    if train:
        df.drop_duplicates(inplace=True)
    
    create_preprocess_date_time(df)
    df['Total_Stops'] = df['Total_Stops'].replace(stops_dict)
    df.drop(columns=['Date_of_Journey','Dep_Time','Arrival_Time','Duration','Additional_Info'],axis=1,inplace=True)
    
    return df

In [3]:
def handle_missing_value(df,train=True):
    """
    This function helps to handle missing value.
    Since for Airline data there is just one missing value we can choose to drop missing value.
    inputs:
         df: dataframe which requires imputation.
         
    returns:
        df
    
    """
    df.dropna(inplace=True)
    return df

In [4]:
def frequency_encoder(df,col):
    """
    This function encodes a categorical column based on the frequency of their occurence.
    input:
        df : Input DataFrame in which encoding has to be created 
        col : Column name which has to be encoded
    return: 
          frequency encoded dictionary for columns
    """
    freq_value = df.groupby(col).size()/len(df)
    freq_dict = freq_value.to_dict()
    df["Freq_encoded_"+col] = df[col].replace(freq_dict)
    return freq_dict


def mean_encoder(df,col,target_col):
    """
    This function encodes a categorical column based on the frequency of their occurence.
    input:
        df : Input DataFrame in which encoding has to be created 
        col : Column name which has to be encoded
    return: 
          Mean encoded dict for column
    """
    mean_value = df.groupby(col)[target_col].mean()
    mean_dict = mean_value.to_dict()
    df["Mean_encoded_"+col] = df[col].replace(mean_dict)
    return mean_dict


from sklearn.preprocessing import LabelEncoder

def label_encoder(df,col):
    """
    This function encodes a categorical column based on the basis of their order label.
    input:
        df : Input DataFrame in which encoding has to be created 
        col : Column name which has to be encoded
    return: 
          label encoded dict for column
    """
    le = LabelEncoder()
    le.fit(df[col])
    label_dict = dict(zip((le.classes_),le.transform(le.classes_)))
    df["Label_encoded_"+col] = df[col].replace(label_dict)
    return label_dict


## Create a function to handle categorical value
def handle_categorical_values(df,target):
    '''
      This function handles categorical value and create a dataframe.
      Input:
        df : Dataframe which require categorical value treatment
      returns :
         Dataframe with all categorical value handled.
    '''
    encoded_dict = {}
    # Getting all object columns
    object_columns = df.select_dtypes(object).columns

    ## generate frequency encoded categorical values
    frequency_encoded_dict ={} 
    for col in object_columns:
        freq_dict = frequency_encoder(df,col)
        frequency_encoded_dict[col] = freq_dict

    ## generate target mean encoded categorical values
    mean_encoded_dict ={} 
    for col in object_columns:
        mean_dict = mean_encoder(df,col,target)
        mean_encoded_dict[col] = mean_dict

    
    ## generate label encoded categorical values
    label_encoded_dict ={} 
    for col in object_columns:
        label_dict = label_encoder(df,col)
        label_encoded_dict[col] = label_dict
    
    encoded_dict["Frequency"] = frequency_encoded_dict
    encoded_dict["Mean"] = mean_encoded_dict
    encoded_dict["Label"] = label_encoded_dict

    return df, encoded_dict


def airline_handle_categorical_data(df,target):
    df['Destination'] = df['Destination'].replace({'New Delhi':'Delhi'})
    df, encoded_dict = handle_categorical_values(df,target)
    categorical_cols = df.select_dtypes(object).columns
    df.drop(columns=categorical_cols,inplace=True)
    return df, encoded_dict


In [5]:
## Content for feature Engineering

In [10]:
def time_of_day(hr):
    '''
    This function gives the time of day based on logic:
        # 3-8 early_morning or 1
        # 8-12 morning or 2
        # 12-16 afternoon or 3
        # 16-20 evening or 4
        # 20-00 night or 5
        # 00-3 late_night or 6
        # invalid or 0
    input:
        hr
    return: tuple
        (timeOfDay,timeOfDay_encoded
    '''
    if hr in range(0,3) :
        str_val = 'late_night'
        val = 6
    elif hr in range(20,23):
        str_val = 'night'
        val = 5
    elif hr in range(16,20):
        str_val = 'evening'
        val = 4
    elif hr in range(12,26):
        str_val = 'after_noon'
        val = 3
    elif hr in range(8,12):
        str_val = 'morning'
        val = 2
    elif hr in range(3,8):
        str_val = 'early_morning'
        val = 1
    else:
        str_val = 'invalid'
        val = 0
    return (str_val, val)


def time_based_feature_Engineering(df):

    df['dep_hr'] = df['DepartureDateTime'].dt.hour
    df['arr_hr'] = df['ArrivalDateTime'].dt.hour

    df['dep_month'] = df['DepartureDateTime'].dt.month
    df['dep_day_of_month'] = df['DepartureDateTime'].dt.day

    df['arr_month'] = df['ArrivalDateTime'].dt.month
    df['arr_day_of_month'] = df['ArrivalDateTime'].dt.day

    df['dep_day_of_week'] = df['DepartureDateTime'].dt.day_of_week 
    df['arr_day_of_week'] = df['ArrivalDateTime'].dt.day_of_week 

    df['dep_weekday'] = np.where(df["dep_day_of_week"].isin([5,6]),0,1)
    df['arr_weekday'] = np.where(df["arr_day_of_week"].isin([5,6]),0,1)

    df['departure_timeOfDay_encoded'] = df['dep_hr'].apply(lambda x: time_of_day(x)[1])
    df['arrival_timeOfDay_encoded'] = df['arr_hr'].apply(lambda x: time_of_day(x)[1])

#     df['departure_timeOfDay'] = df['dep_hr'].apply(lambda x: time_of_day(x)[0])
#     df['arrival_timeOfDay'] = df['arr_hr'].apply(lambda x: time_of_day(x)[0])
#     one_hot_cols = ['departure_timeOfDay','arrival_timeOfDay']
#     df_oneHotEncoded = pd.get_dummies(df[one_hot_cols])
#     new_df = pd.concat([df,df_oneHotEncoded],axis=1)
    
    new_df = df
    
    drop_cols = [
                'DepartureDateTime',
                'Duration_timedelta',
                'ArrivalDateTime',
                'Freq_encoded_Source',
                'Mean_encoded_Source',
                'Mean_encoded_Destination',
#                 'departure_timeOfDay_encoded',
                'arr_month', 
                'arr_day_of_month',
#                 'departure_timeOfDay_early_morning',
#                 'departure_timeOfDay_evening',
#                 'arrival_timeOfDay_late_night',
#                 'arrival_timeOfDay_morning', 
#                 'arrival_timeOfDay_night'
    ]
    
#     new_df.drop(columns=one_hot_cols,inplace=True)
    new_df.drop(columns=drop_cols,inplace=True)
    return new_df

In [11]:
## content for model model.py

In [12]:
from sklearn.model_selection import train_test_split

from sklearn import linear_model
from sklearn import metrics

from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import SplineTransformer

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor


def regression(X_train, X_test, y_train, y_test):
    ## Baseline model
    reg = linear_model.LinearRegression()
    reg.fit(X_train,y_train)
    print(metrics.r2_score(y_train,reg.predict(X_train)), metrics.r2_score(y_test,reg.predict(X_test)))
    return reg

def knearestneighbour(X_train, X_test, y_train, y_test):
    knn_regressor = KNeighborsRegressor(n_neighbors=10)
    knn_regressor.fit(X_train, y_train)
    print(metrics.r2_score(y_train,knn_regressor.predict(X_train)),metrics.r2_score(y_test,knn_regressor.predict(X_test)))
    return knn_regressor

def decisiontree(X_train, X_test, y_train, y_test):
    tree = DecisionTreeRegressor(max_depth=10)
    tree.fit(X_train,y_train)
    print(metrics.r2_score(y_train,tree.predict(X_train)),metrics.r2_score(y_test,tree.predict(X_test)))
    return tree

def randomForest(X_train, X_test, y_train, y_test):
    random_regressor = RandomForestRegressor(n_estimators=500, min_samples_split=10,min_samples_leaf=2)
    random_regressor.fit(X_train, y_train)
    print(metrics.r2_score(y_train,random_regressor.predict(X_train)),metrics.r2_score(y_test,random_regressor.predict(X_test)))
    return random_regressor

from xgboost import XGBRegressor
def xgboost(X_train, X_test, y_train, y_test):
    xgboost_regressor = XGBRegressor()
    xgboost_regressor.fit(X_train, y_train)
    print(metrics.r2_score(y_train,xgboost_regressor.predict(X_train)),metrics.r2_score(y_test,xgboost_regressor.predict(X_test)))
    return xgboost_regressor

from catboost import  CatBoostRegressor
def catboost(X_train, X_test, y_train, y_test):
    catboost_regressor = CatBoostRegressor(verbose=False)
    catboost_regressor.fit(X_train, y_train)
    print(metrics.r2_score(y_train,catboost_regressor.predict(X_train)),metrics.r2_score(y_test,catboost_regressor.predict(X_test)))
    return catboost_regressor

from sklearn.ensemble import GradientBoostingRegressor
def gboost(X_train, X_test, y_train, y_test):
    gboost_regressor = GradientBoostingRegressor()
    gboost_regressor.fit(X_train, y_train)
    print(metrics.r2_score(y_train,gboost_regressor.predict(X_train)),metrics.r2_score(y_test,gboost_regressor.predict(X_test)))
    return gboost_regressor


def train(X,y,modelType):
    # Split your dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    model = modelType(X_train, X_test, y_train, y_test)
    return model

import pickle

def save_pickle(file_path,obj):
    with open(file_path,'wb') as file:
            pickle.dump(obj,file)
        
def load_pickle(file_path):
    with open(file_path,'rb') as file:
            obj = pickle.load(file)
    return obj

In [13]:
## train.py

In [14]:
target ='Price'

train_data_path = "../data/Train.xlsx"
model_path = '../models/xgboost_demo.pickle'
encoded_path = '../models/encoded_dict_demo.pickle'

## process for training   
df = load_data(train_data_path)
sanity_check(df)
handle_missing_value(df)

df, encoded_dict = airline_handle_categorical_data(df,target)
final_df = time_based_feature_Engineering(df)
X = final_df.drop(columns=target)
y = final_df[target]

In [15]:
X.columns

Index(['Total_Stops', 'Duration_min', 'Freq_encoded_Airline',
       'Freq_encoded_Destination', 'Freq_encoded_Route',
       'Mean_encoded_Airline', 'Mean_encoded_Route', 'Label_encoded_Airline',
       'Label_encoded_Source', 'Label_encoded_Destination',
       'Label_encoded_Route', 'dep_hr', 'arr_hr', 'dep_month',
       'dep_day_of_month', 'dep_day_of_week', 'arr_day_of_week', 'dep_weekday',
       'arr_weekday', 'departure_timeOfDay_encoded',
       'arrival_timeOfDay_encoded'],
      dtype='object')

In [16]:
model = train(X,y,xgboost)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


0.9464148626879361 0.8435327728663693


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [17]:
save_pickle(encoded_path,encoded_dict)
save_pickle(model_path,model)

In [18]:
len(X.columns)

21

In [43]:
def encode_predict_input(df,encoded_dict):
    '''
    This function encodes categorical values with same values as training encoded values.
    Input:
      df : DataFrame
      encoded_dict : Category encoded dictionary
    returns :None
    '''
    encoded_cols = ['Airline', 'Source', 'Destination', 'Route']
    
    frequency_dict = encoded_dict['Frequency']
    mean_dict = encoded_dict['Mean']
    label_dict = encoded_dict['Label']
    for col in encoded_cols:
        df["Freq_encoded_"+col] = df[col].replace(frequency_dict[col])
        df["Mean_encoded_"+col] = df[col].replace(mean_dict[col])
        df["Label_encoded_"+col] = df[col].replace(label_dict[col])
    df.drop(columns=encoded_cols,inplace=True)

    return df


def predict_price(df, encoded_path, model_path):
    sanity_check(df,train=False)

    loaded_dict = load_pickle(encoded_path)

    # transform categorical values 

    df['Destination'] = df['Destination'].replace({'New Delhi':'Delhi'})
    df = encode_predict_input(df,loaded_dict)

    test_X = time_based_feature_Engineering(df)
    loaded_model = load_pickle(model_path)
    result = loaded_model.predict(test_X)
    return result

test_data_path = "../data/Test.xlsx"
model_path = '../models/xgboost_demo.pickle'
encoded_path = '../models/encoded_dict_demo.pickle'

# process of prediction
df = load_data(test_data_path)
df= df.iloc[:5,:]

output = predict_price(df, encoded_path, model_path)
print(output)

[4833.053  6014.1943 3824.391  5154.1626 2270.1477]


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [None]:
## to predict.py

In [19]:
test_data_path = "../data/Test.xlsx"
model_path = '../models/xgboost_demo.pickle'
encoded_path = '../models/encoded_dict_demo.pickle'

# process of prediction
df = load_data(test_data_path)
df= df.iloc[:5,:]

In [20]:
df.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info'],
      dtype='object')

In [21]:
df

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [22]:
sanity_check(df,train=False)
df.columns

Index(['Airline', 'Source', 'Destination', 'Route', 'Total_Stops',
       'DepartureDateTime', 'Duration_min', 'Duration_timedelta',
       'ArrivalDateTime'],
      dtype='object')

In [23]:
loaded_dict = load_pickle(encoded_path)

# transform categorical values 
## write down a function.
def encode_predict_input(df,encoded_dict):
    '''
    This function encodes categorical values with same values as training encoded values.
    Input:
      df : DataFrame
      encoded_dict : Category encoded dictionary
    returns :None
    '''
    encoded_cols = ['Airline', 'Source', 'Destination', 'Route']
    
    frequency_dict = encoded_dict['Frequency']
    mean_dict = encoded_dict['Mean']
    label_dict = encoded_dict['Label']
    for col in encoded_cols:
        df["Freq_encoded_"+col] = df[col].replace(frequency_dict[col])
        df["Mean_encoded_"+col] = df[col].replace(mean_dict[col])
        df["Label_encoded_"+col] = df[col].replace(label_dict[col])
    df.drop(columns=encoded_cols,inplace=True)

    return df
        

In [24]:
df['Destination'] = df['Destination'].replace({'New Delhi':'Delhi'})
df = encode_predict_input(df,loaded_dict)

In [25]:
df.columns

Index(['Total_Stops', 'DepartureDateTime', 'Duration_min',
       'Duration_timedelta', 'ArrivalDateTime', 'Freq_encoded_Airline',
       'Mean_encoded_Airline', 'Label_encoded_Airline', 'Freq_encoded_Source',
       'Mean_encoded_Source', 'Label_encoded_Source',
       'Freq_encoded_Destination', 'Mean_encoded_Destination',
       'Label_encoded_Destination', 'Freq_encoded_Route', 'Mean_encoded_Route',
       'Label_encoded_Route'],
      dtype='object')

In [26]:
df

Unnamed: 0,Total_Stops,DepartureDateTime,Duration_min,Duration_timedelta,ArrivalDateTime,Freq_encoded_Airline,Mean_encoded_Airline,Label_encoded_Airline,Freq_encoded_Source,Mean_encoded_Source,Label_encoded_Source,Freq_encoded_Destination,Mean_encoded_Destination,Label_encoded_Destination,Freq_encoded_Route,Mean_encoded_Route,Label_encoded_Route
0,1,2019-06-06 17:30:00,655,0 days 10:55:00,2019-06-07 04:25:00,0.353661,11599.021081,4,0.415313,10461.60069,2,0.415313,10461.60069,1,0.227108,10954.205808,104
1,1,2019-12-05 06:20:00,240,0 days 04:00:00,2019-12-05 10:20:00,0.195278,5668.469897,3,0.27337,9143.083566,3,0.27337,9143.083566,0,0.006213,5240.876923,90
2,1,2019-05-21 19:15:00,1425,0 days 23:45:00,2019-05-22 19:00:00,0.353661,11599.021081,4,0.415313,10461.60069,2,0.415313,10461.60069,1,0.227108,10954.205808,104
3,1,2019-05-21 08:00:00,780,0 days 13:00:00,2019-05-21 21:00:00,0.114318,10902.678094,6,0.415313,10461.60069,2,0.415313,10461.60069,1,0.227108,10954.205808,104
4,0,2019-06-24 23:55:00,170,0 days 02:50:00,2019-06-25 02:45:00,0.030491,5590.260188,0,0.208278,8022.872877,0,0.208278,8022.872877,2,0.146817,5552.235677,18


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype          
---  ------                     --------------  -----          
 0   Total_Stops                5 non-null      int64          
 1   DepartureDateTime          5 non-null      datetime64[ns] 
 2   Duration_min               5 non-null      int64          
 3   Duration_timedelta         5 non-null      timedelta64[ns]
 4   ArrivalDateTime            5 non-null      datetime64[ns] 
 5   Freq_encoded_Airline       5 non-null      float64        
 6   Mean_encoded_Airline       5 non-null      float64        
 7   Label_encoded_Airline      5 non-null      int32          
 8   Freq_encoded_Source        5 non-null      float64        
 9   Mean_encoded_Source        5 non-null      float64        
 10  Label_encoded_Source       5 non-null      int32          
 11  Freq_encoded_Destination   5 non-null      float64        
 12

In [28]:
test_X = time_based_feature_Engineering(df)

In [29]:
df.columns

Index(['Total_Stops', 'Duration_min', 'Freq_encoded_Airline',
       'Mean_encoded_Airline', 'Label_encoded_Airline', 'Label_encoded_Source',
       'Freq_encoded_Destination', 'Label_encoded_Destination',
       'Freq_encoded_Route', 'Mean_encoded_Route', 'Label_encoded_Route',
       'dep_hr', 'arr_hr', 'dep_month', 'dep_day_of_month', 'dep_day_of_week',
       'arr_day_of_week', 'dep_weekday', 'arr_weekday',
       'departure_timeOfDay_encoded', 'arrival_timeOfDay_encoded'],
      dtype='object')

In [30]:
len(test_X.columns)

21

In [31]:
X.columns

Index(['Total_Stops', 'Duration_min', 'Freq_encoded_Airline',
       'Freq_encoded_Destination', 'Freq_encoded_Route',
       'Mean_encoded_Airline', 'Mean_encoded_Route', 'Label_encoded_Airline',
       'Label_encoded_Source', 'Label_encoded_Destination',
       'Label_encoded_Route', 'dep_hr', 'arr_hr', 'dep_month',
       'dep_day_of_month', 'dep_day_of_week', 'arr_day_of_week', 'dep_weekday',
       'arr_weekday', 'departure_timeOfDay_encoded',
       'arrival_timeOfDay_encoded'],
      dtype='object')

In [32]:
test_X.columns

Index(['Total_Stops', 'Duration_min', 'Freq_encoded_Airline',
       'Mean_encoded_Airline', 'Label_encoded_Airline', 'Label_encoded_Source',
       'Freq_encoded_Destination', 'Label_encoded_Destination',
       'Freq_encoded_Route', 'Mean_encoded_Route', 'Label_encoded_Route',
       'dep_hr', 'arr_hr', 'dep_month', 'dep_day_of_month', 'dep_day_of_week',
       'arr_day_of_week', 'dep_weekday', 'arr_weekday',
       'departure_timeOfDay_encoded', 'arrival_timeOfDay_encoded'],
      dtype='object')

In [33]:
list_A = ['Total_Stops', 'Duration_min', 'Freq_encoded_Airline',
       'Freq_encoded_Destination', 'Freq_encoded_Route',
       'Mean_encoded_Airline', 'Mean_encoded_Route', 'Label_encoded_Airline',
       'Label_encoded_Source', 'Label_encoded_Destination',
       'Label_encoded_Route', 'dep_hr', 'arr_hr', 'dep_month',
       'dep_day_of_month', 'dep_day_of_week', 'arr_day_of_week', 'dep_weekday',
       'arr_weekday', 'departure_timeOfDay_encoded',
       'arrival_timeOfDay_encoded'] 


list_B = ['Total_Stops', 'Duration_min', 'Freq_encoded_Airline',
       'Mean_encoded_Airline', 'Label_encoded_Airline', 'Label_encoded_Source',
       'Freq_encoded_Destination', 'Label_encoded_Destination',
       'Freq_encoded_Route', 'Mean_encoded_Route', 'Label_encoded_Route',
       'dep_hr', 'arr_hr', 'dep_month', 'dep_day_of_month', 'dep_day_of_week',
       'arr_day_of_week', 'dep_weekday', 'arr_weekday',
       'departure_timeOfDay_encoded', 'arrival_timeOfDay_encoded']

set(list_A) - set(list_B)

set()

In [42]:
loaded_model = load_pickle(model_path)
loaded_model.predict(test_X)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


array([4833.053 , 6014.1943, 3824.391 , 5154.1626, 2270.1477],
      dtype=float32)

In [None]:
def predict_price(df, encoded_path, model_path):
    sanity_check(df,train=False)

    loaded_dict = load_pickle(encoded_path)

    # transform categorical values 

    df['Destination'] = df['Destination'].replace({'New Delhi':'Delhi'})
    df = encode_predict_input(df,loaded_dict)

    test_X = time_based_feature_Engineering(df)
    loaded_model = load_pickle(model_path)
    result = loaded_model.predict(test_X)
    return result


In [80]:
# from load import load_data, train_path, test_path

# from freatureengineering import time_based_feature_Engineering

# from preprocessing import sanity_check, handle_missing_value, airline_handle_categorical_data

# from model import train, xgboost, load_pickle, save_pickle

# from check import list_A


# import pandas as pd
# import numpy as np


def encode_predict_input(df,encoded_dict, error_handle=True):
    '''
    This function encodes categorical values with same values as training encoded values.
    Input:
      df : DataFrame
      encoded_dict : Category encoded dictionary
    returns :None
    '''
    encoded_cols = ['Airline', 'Source', 'Destination', 'Route']
    
    frequency_dict = encoded_dict['Frequency']
    mean_dict = encoded_dict['Mean']
    label_dict = encoded_dict['Label']
    for col in encoded_cols:
        df["Freq_encoded_"+col] = df[col].replace(frequency_dict[col])
        df["Mean_encoded_"+col] = df[col].replace(mean_dict[col])
        df["Label_encoded_"+col] = df[col].replace(label_dict[col])
    df.drop(columns=encoded_cols,inplace=True)

    # this is to handle the unseen routes in test files.
    # replacing unseen value with -1. 
    #  df[df['Freq_encoded_Route'].str.contains('→',na=False)].index.to_list() -->  [6, 72, 484, 966, 1838, 1980]
    if error_handle:
        for index in [6, 72, 484, 966, 1838, 1980]:
            df.loc[index,'Freq_encoded_Route'] = -1
            df.loc[index,'Mean_encoded_Route'] = -1
            df.loc[index,'Label_encoded_Route'] = -1

        df['Freq_encoded_Route'] = df['Freq_encoded_Route'].astype('float')
        df['Mean_encoded_Route'] = df['Mean_encoded_Route'].astype('float')
        df['Label_encoded_Route'] = df['Label_encoded_Route'].astype('float')

    return df


def predict_price(df, encoded_path, model_path,error_handle=True):
    sanity_check(df,train=False)

    loaded_dict = load_pickle(encoded_path)

    # transform categorical values 

    df['Destination'] = df['Destination'].replace({'New Delhi':'Delhi'})
    df = encode_predict_input(df,loaded_dict,error_handle=True)
    
    print(df.info())

    test_X = time_based_feature_Engineering(df)
    loaded_model = load_pickle(model_path)
    # this is to handle feature mismatch in xgboost for inference.
    # https://stackoverflow.com/questions/42338972/valueerror-feature-names-mismatch-in-xgboost-in-the-predict-function
    test_X = test_X[list_A] 
    result = loaded_model.predict(test_X)
    return result


test_path = "../data/Test.xlsx"
model_path = '../models/xgboost_demo.pickle'
encoded_path = '../models/encoded_dict_demo.pickle'

# process of prediction
df = load_data(test_path)
df= df.iloc[:,:]


output = predict_price(df, encoded_path, model_path, error_handle=True)
print(output)

# output_path = "../output/results.pickle"
# save_pickle(output_path,output)
    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2671 entries, 0 to 2670
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype          
---  ------                     --------------  -----          
 0   Total_Stops                2671 non-null   int64          
 1   DepartureDateTime          2671 non-null   datetime64[ns] 
 2   Duration_min               2671 non-null   int64          
 3   Duration_timedelta         2671 non-null   timedelta64[ns]
 4   ArrivalDateTime            2671 non-null   datetime64[ns] 
 5   Freq_encoded_Airline       2671 non-null   float64        
 6   Mean_encoded_Airline       2671 non-null   float64        
 7   Label_encoded_Airline      2671 non-null   int32          
 8   Freq_encoded_Source        2671 non-null   float64        
 9   Mean_encoded_Source        2671 non-null   float64        
 10  Label_encoded_Source       2671 non-null   int32          
 11  Freq_encoded_Destination   2671 non-null   float64      

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [None]:
## Error Encountered because of Routes not present in training set.

In [71]:
# process of prediction
df = load_data(test_data_path)
sanity_check(df,train=False)
loaded_dict = load_pickle(encoded_path)

In [72]:
encoded_cols = ['Route']
frequency_dict = encoded_dict['Frequency']
mean_dict = encoded_dict['Mean']
label_dict = encoded_dict['Label']
for col in encoded_cols:
    df["Freq_encoded_"+col] = df[col].replace(frequency_dict[col])
    df["Mean_encoded_"+col] = df[col].replace(mean_dict[col])
    df["Label_encoded_"+col] = df[col].replace(label_dict[col])

df['Destination'] = df['Destination'].replace({'New Delhi':'Delhi'})
df = encode_predict_input(df,loaded_dict)

In [73]:
df

Unnamed: 0,Total_Stops,DepartureDateTime,Duration_min,Duration_timedelta,ArrivalDateTime,Freq_encoded_Route,Mean_encoded_Route,Label_encoded_Route,Freq_encoded_Airline,Mean_encoded_Airline,Label_encoded_Airline,Freq_encoded_Source,Mean_encoded_Source,Label_encoded_Source,Freq_encoded_Destination,Mean_encoded_Destination,Label_encoded_Destination
0,1,2019-06-06 17:30:00,655,0 days 10:55:00,2019-06-07 04:25:00,0.227108,10954.205808,104,0.353661,11599.021081,4,0.415313,10461.600690,2,0.415313,10461.600690,1
1,1,2019-12-05 06:20:00,240,0 days 04:00:00,2019-12-05 10:20:00,0.006213,5240.876923,90,0.195278,5668.469897,3,0.273370,9143.083566,3,0.273370,9143.083566,0
2,1,2019-05-21 19:15:00,1425,0 days 23:45:00,2019-05-22 19:00:00,0.227108,10954.205808,104,0.353661,11599.021081,4,0.415313,10461.600690,2,0.415313,10461.600690,1
3,1,2019-05-21 08:00:00,780,0 days 13:00:00,2019-05-21 21:00:00,0.227108,10954.205808,104,0.114318,10902.678094,6,0.415313,10461.600690,2,0.415313,10461.600690,1
4,0,2019-06-24 23:55:00,170,0 days 02:50:00,2019-06-25 02:45:00,0.146817,5552.235677,18,0.030491,5590.260188,0,0.208278,8022.872877,0,0.208278,8022.872877,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2666,1,2019-06-06 20:30:00,1435,0 days 23:55:00,2019-06-07 20:25:00,0.054005,10763.258407,73,0.161919,9556.608028,1,0.273370,9143.083566,3,0.273370,9143.083566,0
2667,0,2019-03-27 14:20:00,155,0 days 02:35:00,2019-03-27 16:55:00,0.069203,4556.055249,64,0.195278,5668.469897,3,0.273370,9143.083566,3,0.273370,9143.083566,0
2668,1,2019-06-03 21:50:00,395,0 days 06:35:00,2019-06-04 04:25:00,0.227108,10954.205808,104,0.353661,11599.021081,4,0.415313,10461.600690,2,0.415313,10461.600690,1
2669,1,2019-06-03 04:00:00,915,0 days 15:15:00,2019-06-03 19:15:00,0.227108,10954.205808,104,0.161919,9556.608028,1,0.415313,10461.600690,2,0.415313,10461.600690,1


In [74]:
df.head(10)

Unnamed: 0,Total_Stops,DepartureDateTime,Duration_min,Duration_timedelta,ArrivalDateTime,Freq_encoded_Route,Mean_encoded_Route,Label_encoded_Route,Freq_encoded_Airline,Mean_encoded_Airline,Label_encoded_Airline,Freq_encoded_Source,Mean_encoded_Source,Label_encoded_Source,Freq_encoded_Destination,Mean_encoded_Destination,Label_encoded_Destination
0,1,2019-06-06 17:30:00,655,0 days 10:55:00,2019-06-07 04:25:00,0.227108,10954.205808,104,0.353661,11599.021081,4,0.415313,10461.60069,2,0.415313,10461.60069,1
1,1,2019-12-05 06:20:00,240,0 days 04:00:00,2019-12-05 10:20:00,0.006213,5240.876923,90,0.195278,5668.469897,3,0.27337,9143.083566,3,0.27337,9143.083566,0
2,1,2019-05-21 19:15:00,1425,0 days 23:45:00,2019-05-22 19:00:00,0.227108,10954.205808,104,0.353661,11599.021081,4,0.415313,10461.60069,2,0.415313,10461.60069,1
3,1,2019-05-21 08:00:00,780,0 days 13:00:00,2019-05-21 21:00:00,0.227108,10954.205808,104,0.114318,10902.678094,6,0.415313,10461.60069,2,0.415313,10461.60069,1
4,0,2019-06-24 23:55:00,170,0 days 02:50:00,2019-06-25 02:45:00,0.146817,5552.235677,18,0.030491,5590.260188,0,0.208278,8022.872877,0,0.208278,8022.872877,2
5,1,2019-12-06 18:15:00,1100,0 days 18:20:00,2019-12-07 12:35:00,0.227108,10954.205808,104,0.353661,11599.021081,4,0.415313,10461.60069,2,0.415313,10461.60069,1
6,1,2019-12-03 07:30:00,905,0 days 15:05:00,2019-12-03 22:35:00,BLR → TRV → DEL,BLR → TRV → DEL,BLR → TRV → DEL,0.161919,9556.608028,1,0.208278,8022.872877,0,0.208278,8022.872877,2
7,1,2019-01-05 15:15:00,315,0 days 05:15:00,2019-01-05 20:30:00,0.005066,4910.018868,80,0.195278,5668.469897,3,0.27337,9143.083566,3,0.27337,9143.083566,0
8,0,2019-03-15 10:10:00,165,0 days 02:45:00,2019-03-15 12:55:00,0.069203,4556.055249,64,0.195278,5668.469897,3,0.27337,9143.083566,3,0.27337,9143.083566,0
9,1,2019-05-18 16:30:00,365,0 days 06:05:00,2019-05-18 22:35:00,0.093577,11487.78856,66,0.353661,11599.021081,4,0.27337,9143.083566,3,0.27337,9143.083566,0


In [75]:
mean_dict['Route']['BLR → TRV → DEL']

KeyError: 'BLR → TRV → DEL'

In [76]:
df[df['Freq_encoded_Route'].str.contains('→',na=False)]

Unnamed: 0,Total_Stops,DepartureDateTime,Duration_min,Duration_timedelta,ArrivalDateTime,Freq_encoded_Route,Mean_encoded_Route,Label_encoded_Route,Freq_encoded_Airline,Mean_encoded_Airline,Label_encoded_Airline,Freq_encoded_Source,Mean_encoded_Source,Label_encoded_Source,Freq_encoded_Destination,Mean_encoded_Destination,Label_encoded_Destination
6,1,2019-12-03 07:30:00,905,0 days 15:05:00,2019-12-03 22:35:00,BLR → TRV → DEL,BLR → TRV → DEL,BLR → TRV → DEL,0.161919,9556.608028,1,0.208278,8022.872877,0,0.208278,8022.872877,2
72,3,2019-01-04 05:50:00,960,0 days 16:00:00,2019-01-04 21:50:00,CCU → IXR → BBI → BOM → BLR,CCU → IXR → BBI → BOM → BLR,CCU → IXR → BBI → BOM → BLR,0.161919,9556.608028,1,0.27337,9143.083566,3,0.27337,9143.083566,0
484,3,2019-03-24 05:50:00,960,0 days 16:00:00,2019-03-24 21:50:00,CCU → IXR → BBI → BOM → BLR,CCU → IXR → BBI → BOM → BLR,CCU → IXR → BBI → BOM → BLR,0.161919,9556.608028,1,0.27337,9143.083566,3,0.27337,9143.083566,0
966,2,2019-06-03 08:00:00,610,0 days 10:10:00,2019-06-03 18:10:00,BOM → VGA → TIR → HYD,BOM → VGA → TIR → HYD,BOM → VGA → TIR → HYD,0.161919,9556.608028,1,0.066622,5059.708752,4,0.066622,5059.708752,3
1838,1,2019-03-03 07:30:00,905,0 days 15:05:00,2019-03-03 22:35:00,BLR → TRV → DEL,BLR → TRV → DEL,BLR → TRV → DEL,0.161919,9556.608028,1,0.208278,8022.872877,0,0.208278,8022.872877,2
1980,2,2019-12-03 07:15:00,560,0 days 09:20:00,2019-12-03 16:35:00,BOM → IXC → DEL → HYD,BOM → IXC → DEL → HYD,BOM → IXC → DEL → HYD,0.353661,11599.021081,4,0.066622,5059.708752,4,0.066622,5059.708752,3


In [77]:
df['Freq_encoded_Route'].str.contains('→',na=False)

0       False
1       False
2       False
3       False
4       False
        ...  
2666    False
2667    False
2668    False
2669    False
2670    False
Name: Freq_encoded_Route, Length: 2671, dtype: bool

In [65]:
df[df['Freq_encoded_Route'].str.contains('→',na=False)].index.to_list()

[6, 72, 484, 966, 1838, 1980]

In [54]:
df.loc[6,'Freq_encoded_Route'] = -1
df.iloc[3:7, :]

In [66]:
# for index in df[df['Freq_encoded_Route'].str.contains('→',na=False)].index.to_list():
#     df.loc[index,'Freq_encoded_Route'] = -1
#     df.loc[index,'Mean_encoded_Route'] = -1
#     df.loc[index,'Label_encoded_Route'] = -1

for index in df[df['Freq_encoded_Route'].str.contains('→',na=False)].index.to_list():
    df.loc[index,'Freq_encoded_Route'] = df['Freq_encoded_Route'].mode()
    df.loc[index,'Mean_encoded_Route'] = df['Mean_encoded_Route'].mode()
    df.loc[index,'Label_encoded_Route'] = df['Label_encoded_Route'].mode()

In [67]:
df[df['Freq_encoded_Route'].str.contains('→',na=False)]

Unnamed: 0,Total_Stops,DepartureDateTime,Duration_min,Duration_timedelta,ArrivalDateTime,Freq_encoded_Route,Mean_encoded_Route,Label_encoded_Route,Freq_encoded_Airline,Mean_encoded_Airline,Label_encoded_Airline,Freq_encoded_Source,Mean_encoded_Source,Label_encoded_Source,Freq_encoded_Destination,Mean_encoded_Destination,Label_encoded_Destination


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2671 entries, 0 to 2670
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype          
---  ------                     --------------  -----          
 0   Total_Stops                2671 non-null   int64          
 1   DepartureDateTime          2671 non-null   datetime64[ns] 
 2   Duration_min               2671 non-null   int64          
 3   Duration_timedelta         2671 non-null   timedelta64[ns]
 4   ArrivalDateTime            2671 non-null   datetime64[ns] 
 5   Freq_encoded_Route         2671 non-null   object         
 6   Mean_encoded_Route         2671 non-null   object         
 7   Label_encoded_Route        2671 non-null   object         
 8   Freq_encoded_Airline       2671 non-null   float64        
 9   Mean_encoded_Airline       2671 non-null   float64        
 10  Label_encoded_Airline      2671 non-null   int32          
 11  Freq_encoded_Source        2671 non-null   float64      

In [None]:
['Total_Stops', 'Duration_min', 'Freq_encoded_Airline', 'Freq_encoded_Destination', 'Freq_encoded_Route', 'Mean_encoded_Airline', 'Mean_encoded_Route', 'Label_encoded_Airline', 'Label_encoded_Source', 'Label_encoded_Destination', 'Label_encoded_Route', 'dep_hr', 'arr_hr', 'dep_month', 'dep_day_of_month', 'dep_day_of_week', 'arr_day_of_week', 'dep_weekday', 'arr_weekday', 'departure_timeOfDay_encoded', 'arrival_timeOfDay_encoded'] 

['Total_Stops', 'Duration_min', 'Freq_encoded_Airline', 'Mean_encoded_Airline', 'Label_encoded_Airline', 'Label_encoded_Source', 'Freq_encoded_Destination', 'Label_encoded_Destination', 'Freq_encoded_Route', 'Mean_encoded_Route', 'Label_encoded_Route', 'dep_hr', 'arr_hr', 'dep_month', 'dep_day_of_month', 'dep_day_of_week', 'arr_day_of_week', 'dep_weekday', 'arr_weekday', 'departure_timeOfDay_encoded', 'arrival_timeOfDay_encoded']