In [45]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

train_path = "../data/Train.xlsx"
test_path = '../data/Test.xlsx'

def load_data(path):
    '''
    This function loads the data from excel file into a dataframe.
    inputs:
        path: Path of data file
    returns:
        df : dataframe
    '''
    df = pd.read_excel(path)
    return df

if __name__ == "__main__":
    path = "../data/Train.xlsx"
    # Read the data 
    df = load_data(path)
    print(df.shape)

(10683, 11)


In [46]:
df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [48]:
df['Airline'].unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'], dtype=object)

In [49]:
df['Destination'].unique()

array(['New Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Delhi', 'Hyderabad'],
      dtype=object)

In [47]:
df['Source'].unique()

array(['Banglore', 'Kolkata', 'Delhi', 'Chennai', 'Mumbai'], dtype=object)

In [50]:
df['Route'].unique()

array(['BLR → DEL', 'CCU → IXR → BBI → BLR', 'DEL → LKO → BOM → COK',
       'CCU → NAG → BLR', 'BLR → NAG → DEL', 'CCU → BLR',
       'BLR → BOM → DEL', 'DEL → BOM → COK', 'DEL → BLR → COK',
       'MAA → CCU', 'CCU → BOM → BLR', 'DEL → AMD → BOM → COK',
       'DEL → PNQ → COK', 'DEL → CCU → BOM → COK', 'BLR → COK → DEL',
       'DEL → IDR → BOM → COK', 'DEL → LKO → COK',
       'CCU → GAU → DEL → BLR', 'DEL → NAG → BOM → COK',
       'CCU → MAA → BLR', 'DEL → HYD → COK', 'CCU → HYD → BLR',
       'DEL → COK', 'CCU → DEL → BLR', 'BLR → BOM → AMD → DEL',
       'BOM → DEL → HYD', 'DEL → MAA → COK', 'BOM → HYD',
       'DEL → BHO → BOM → COK', 'DEL → JAI → BOM → COK',
       'DEL → ATQ → BOM → COK', 'DEL → JDH → BOM → COK',
       'CCU → BBI → BOM → BLR', 'BLR → MAA → DEL',
       'DEL → GOI → BOM → COK', 'DEL → BDQ → BOM → COK',
       'CCU → JAI → BOM → BLR', 'CCU → BBI → BLR', 'BLR → HYD → DEL',
       'DEL → TRV → COK', 'CCU → IXR → DEL → BLR',
       'DEL → IXU → BOM → COK', 'CCU 

In [39]:
import pandas as pd
import numpy as np
## Applying Label Encoder to Categorical columns as Hit and Trail.
from sklearn.preprocessing import LabelEncoder

# import load module from load.py
# from loadData import load_data , train_path


# Duration converted to Minutes.
def to_minutes(x):
    if len(x.split(" ")) > 1:
        hour_value = int(x.split(" ")[0].replace("h",""))
        minute_value = int(x.split(" ")[1].replace("m",""))
    else:
        if x.endswith("h"):
            hour_value = int(x.replace("h",""))
            minute_value = 0
        else:
            hour_value = 0
            minute_value = int(x.replace("m",""))
    duration = hour_value*60 + minute_value
    return duration

# Total_Stops column converted to integer.
stop_map = {
    'non-stop': 0 ,
    '2 stops' : 2 ,
    '1 stop'  : 1 ,
    '3 stops' : 3 ,
    '4 stops' : 4
}

## Buildng a sanity_check function
def sanity_check(df,mode='train'):
    '''
      This function perform sanity and check create a dataframe.
      Input:
        df : Dataframe which require sanity-check
        mode : train or predict/inference
      return : None
    '''
    if mode == 'train':
        # Drop any duplicaties (check size before and after dropping duplicates.)
        df.drop_duplicates(inplace=True)

    # Date_of_Journey ,Arrival_Time and Dep_Time must be datetime object
    df['Date_of_Journey'] = pd.to_datetime(df['Date_of_Journey'])
    df['Arrival_Time'] = pd.to_datetime(df['Arrival_Time'])
    df['Dep_Time'] = pd.to_datetime(df['Dep_Time'])

    # Duration converted to Minutes.
    df['Duration'] = df['Duration'].apply(lambda x: to_minutes(x))

    # Total_Stops column converted to integer.
    df['Total_Stops'] = df['Total_Stops'].replace(stop_map)
    return df


## Building function to handle missing value
def handle_missing_value(df,mode='train'):
    '''
      This function handles missing value create a dataframe.
      Input:
        df : Dataframe which require missing value treatment
        mode : train or predict/Inference
      returns :
         Dataframe with all missing value handled.
    '''
    if mode == 'train':
        # Seems to be the same row and make sense  if you don't have route can't decide on stops.
        # It would be appropiate to drop them in this case.
        df.dropna(inplace=True)

    # Dropping Additional_Info columns as around 78% of values are not provided.
    df.drop('Additional_Info',axis=1,inplace=True)  
    return df

def frequency_encoder(df,col):
    """
    This function encodes a categorical column based on the frequency of their occurence.
    input:
        df : Input DataFrame in which encoding has to be created 
        col : Column name which has to be encoded
    return: 
          frequency encoded dictionary for columns
    """
    freq_value = df.groupby(col).size()/len(df)
    freq_dict = freq_value.to_dict()
    df["Freq_encoded_"+col] = df[col].replace(freq_dict)
    return freq_dict

def mean_encoder(df,col,target_col):
    """
    This function encodes a categorical column based on the frequency of their occurence.
    input:
        df : Input DataFrame in which encoding has to be created 
        col : Column name which has to be encoded
    return: 
          Mean encoded dict for column
    """
    mean_value = df.groupby(col)[target_col].mean()
    mean_dict = mean_value.to_dict()
    df["Mean_encoded_"+col] = df[col].replace(mean_dict)
    return mean_dict

## Label encoder for function and later usages:
def label_encoder(df,col):
    """
    This function encodes a categorical column based on the basis of their order label.
    input:
        df : Input DataFrame in which encoding has to be created 
        col : Column name which has to be encoded
    return: 
          label encoded dict for column
    """
    le = LabelEncoder()
    le.fit(df[col])
    label_dict = dict(zip((le.classes_),le.transform(le.classes_)))
    df["Label_encoded_"+col] = df[col].replace(label_dict)
    return label_dict


## Create a function to handle categorical value
def handle_categorical_values(df,target):
    '''
      This function handles categorical value and create a dataframe.
      Input:
        df : Dataframe which require categorical value treatment
      returns :
         Dataframe with all categorical value handled.
    '''
    encoded_dict = dict()
    # Getting all object columns
    object_columns = df.select_dtypes(object).columns

    ## generate frequency encoded categorical values
    frequency_encoded_dict =dict() 
    for col in object_columns:
        freq_dict = frequency_encoder(df,col)
        frequency_encoded_dict[col] = freq_dict

    ## generate target mean encoded categorical values
    mean_encoded_dict =dict()
    for col in object_columns:
        mean_dict = mean_encoder(df,col,target)
        mean_encoded_dict[col] = mean_dict

    
    ## generate label encoded categorical values
    label_encoded_dict =dict()
    for col in object_columns:
        label_dict = label_encoder(df,col)
        label_encoded_dict[col] = label_dict
    
    encoded_dict["Frequency"] = frequency_encoded_dict
    encoded_dict["Mean"] = mean_encoded_dict
    encoded_dict["Label"] = label_encoded_dict

    return df, encoded_dict


def generate_additional_features(df):
    '''
    This Function generates additional features.
    Input :
        df : DataFrame from which feature has to be genrated
    return None
    '''
    # Time based features can be genrated.
    # Day_of_week
    df['day_of_week'] = df['Date_of_Journey'].dt.day_of_week 
    # Day_of_month
    df['day_of_month'] = df['Date_of_Journey'].dt.day
    # Weekdays 
    df['weekday'] = np.where(df["day_of_week"].isin([5,6]),0,1)
    # Month of Travel
    df['month'] = df['Date_of_Journey'].dt.month
    # Hour of Departure etc.
    df['dep_hour'] = df['Dep_Time'].dt.hour

    return df

def filter_predictor_columns(df):
    '''
    This function filters predictor columns from the incoming Data
    '''
    predictor_columns = ['Duration', 'Total_Stops', 'Label_encoded_Airline',
                            'Label_encoded_Source', 'Label_encoded_Destination',
                            'Label_encoded_Route', 'Freq_encoded_Airline', 'Freq_encoded_Source',
                            'Freq_encoded_Destination', 'Freq_encoded_Route',
                            'Mean_encoded_Airline', 'Mean_encoded_Source',
                            'Mean_encoded_Destination', 'Mean_encoded_Route', 'day_of_week',
                            'day_of_month', 'weekday', 'month', 'dep_hour']
    return df[predictor_columns]

def pre_process(df,target):
    '''
      This function applies pre-processing on any incoming observations
    Input:
      df : DataFrame which require pre-processing
      target : dependent variable
    return clean_df : Cleaned Dataframe
    '''
    sanity_check(df)
    handle_missing_value(df)
    df,encoded_dict = handle_categorical_values(df,target)
    generate_additional_features(df)
    X = filter_predictor_columns(df)
    y = df[target]
    # robust_transformer = RobustScaler().fit(X)
    # robust_transformer.transform(X)
    return X,y,encoded_dict
    

if __name__ == "__main__": 

    df = load_data(train_path)
    print(df.shape)
    print(df.head())
    print("-"*72)
    print("Data Pre-Processing.")
    print("-"*72)
    target = 'Price'
    print(df.shape)
    print(df.head())
    X,y,encoded_dict = pre_process (df,target)
    print(encoded_dict)
    
    print(X.head())
    print(X.shape,y.shape)

(10683, 11)
       Airline Date_of_Journey    Source Destination                  Route  \
0       IndiGo      24/03/2019  Banglore   New Delhi              BLR → DEL   
1    Air India       1/05/2019   Kolkata    Banglore  CCU → IXR → BBI → BLR   
2  Jet Airways       9/06/2019     Delhi      Cochin  DEL → LKO → BOM → COK   
3       IndiGo      12/05/2019   Kolkata    Banglore        CCU → NAG → BLR   
4       IndiGo      01/03/2019  Banglore   New Delhi        BLR → NAG → DEL   

  Dep_Time  Arrival_Time Duration Total_Stops Additional_Info  Price  
0    22:20  01:10 22 Mar   2h 50m    non-stop         No info   3897  
1    05:50         13:15   7h 25m     2 stops         No info   7662  
2    09:25  04:25 10 Jun      19h     2 stops         No info  13882  
3    18:05         23:30   5h 25m      1 stop         No info   6218  
4    16:50         21:35   4h 45m      1 stop         No info  13302  
------------------------------------------------------------------------
Data Pre-Proce

In [40]:


from sklearn import linear_model
from sklearn import metrics

from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import SplineTransformer

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor


def regression(X_train, X_test, y_train, y_test):
    ## Baseline model
    reg = linear_model.LinearRegression()
    reg.fit(X_train,y_train)
    print(metrics.r2_score(y_train,reg.predict(X_train)), metrics.r2_score(y_test,reg.predict(X_test)))
    return reg

def knearestneighbour(X_train, X_test, y_train, y_test):
    k_range = list(range(1, 30))
    params = dict(n_neighbors = k_range)
    knn_regressor = GridSearchCV(KNeighborsRegressor(), params, cv =10, scoring = 'neg_mean_squared_error')
    knn_regressor.fit(X_train, y_train)
    print(metrics.r2_score(y_train,knn_regressor.predict(X_train)),metrics.r2_score(y_test,knn_regressor.predict(X_test)))
    return knn_regressor

def decisiontree(X_train, X_test, y_train, y_test):
    depth  =list(range(3,30))
    param_grid =dict(max_depth =depth)
    tree =GridSearchCV(DecisionTreeRegressor(),param_grid,cv =10)
    tree.fit(X_train,y_train)
    print(metrics.r2_score(y_train,tree.predict(X_train)),metrics.r2_score(y_test,tree.predict(X_test)))
    return tree

def randomForest(X_train, X_test, y_train, y_test):
    tuned_params = {'n_estimators': [100,  300,  500], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
    random_regressor = RandomizedSearchCV(RandomForestRegressor(), tuned_params, n_iter = 20, scoring = 'neg_mean_absolute_error', cv = 5, n_jobs = -1)
    random_regressor.fit(X_train, y_train)
    print(metrics.r2_score(y_train,random_regressor.predict(X_train)),metrics.r2_score(y_test,random_regressor.predict(X_test)))
    return random_regressor

In [41]:
import pandas as pd
import pickle
import joblib
import os
from sklearn.model_selection import train_test_split

# from loadData import load_data, train_path
# from preProcessing import pre_process
# from model import regression, decisiontree, knearestneighbour, randomForest

import warnings
warnings.filterwarnings("ignore")


def save_model(model,file_name):
    joblib.dump(model,file_name)

def save_pickle(model,file_name):
    with open(file_name, 'wb') as handle:
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_model(file_name):
    model = joblib.load(file_name)
    return model

def train(X,y,modelType):
    # Split your dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    model = modelType(X_train, X_test, y_train, y_test)
    return model

if __name__ == "__main__":
    
    target = "Price"
    print("Loading the Data.")
    df = load_data(train_path)

    print("Starting Pre-processing of Data")
    X,y,encoded_dict = pre_process(df,target)
    # Store data (serialize)
    with open('../models/encoded.pickle', 'wb') as handle:
        pickle.dump(encoded_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("Statrted Training the model.")
    regression_model = train(X,y,regression)
    print("Saving the model.")
    file_name = "../models/linearRegressionModel.pickle" 
    save_pickle(regression_model,file_name)

    print("Statrted Training the decisiontree model.")
    decisiontree_model = train(X,y,decisiontree)
    print("Saving the model.")
    file_name = "../models/decisiontreeModel.pickle" 
    save_pickle(decisiontree_model,file_name)

    print("Statrted Training the knearestneighbour model.")
    knearestneighbour_model = train(X,y,knearestneighbour)
    print("Saving the model.")
    file_name = "../models/knearestneighbourModel.pickle" 
    save_pickle(knearestneighbour_model,file_name)

    print("Statrted Training the randomForest model.")
    randomForest_model = train(X,y,randomForest)
    print("Saving the model.")
    file_name = "../models/randomForestModel.pickle" 
    save_pickle(randomForest_model,file_name)





Loading the Data.
Starting Pre-processing of Data
Statrted Training the model.
0.6607842122749666 0.6479600704779591
Saving the model.
Statrted Training the decisiontree model.
0.9126011673702669 0.8050430433690648
Saving the model.
Statrted Training the knearestneighbour model.
0.7771346821318521 0.6707353003128252
Saving the model.
Statrted Training the randomForest model.
0.9220609292428852 0.8471779226927647
Saving the model.


In [44]:
# from preProcessing import sanity_check, handle_missing_value, generate_additional_features, filter_predictor_columns
import pickle
# from loadData import load_data , test_path
import joblib

import warnings
warnings.filterwarnings("ignore")



def encode_predict_input(df,encoded_dict):
    '''
    This function encodes categorical values with same values as training encoded values.
    Input:
      df : DataFrame
      encoded_dict : Category encoded dictionary
    returns :None
    '''
    encoded_cols = ['Airline', 'Source', 'Destination', 'Route']
    frequency_dict = encoded_dict['Frequency']
    mean_dict = encoded_dict['Mean']
    label_dict = encoded_dict['Label']
    for col in encoded_cols:
        df["Freq_encoded_"+col] = df[col].replace(frequency_dict[col])
        df["Mean_encoded_"+col] = df[col].replace(mean_dict[col])
        df["Label_encoded_"+col] = df[col].replace(label_dict[col])


def preprocess_and_predict(df,encoded_dict):
    '''
      This function takes in new dataframe or row of observation and generate all features
    Input :
        df : DataFrame or row of observation
        encoded_dict : Dictonary created while training for Categorical Encoded Value.
    '''
    sanity_check(df,mode='predict')
    handle_missing_value(df,mode='predict')
    
    encode_predict_input(df,encoded_dict)
    generate_additional_features(df)
    X = filter_predictor_columns(df)
    return X

if __name__ == "__main__":

    print("Loading the TeatData.")
    # Load data (deserialize)
    with open('../models/encoded.pickle', 'rb') as handle:
        encoded_dict = pickle.load(handle)

    print(type(encoded_dict))
    
    model_path = "../models/randomForestModel.pickle"
    saved_model= joblib.load(model_path)

    test_df = load_data(test_path)
    test_input = preprocess_and_predict(test_df,encoded_dict)
    print(test_input.head())
    saved_model.predict(test_input.iloc[0:5,:])
    print(saved_model.predict(test_input.iloc[0:5,:]))
 

Loading the TeatData.
<class 'dict'>
   Duration  Total_Stops  Label_encoded_Airline  Label_encoded_Source  \
0       655            1                      4                     2   
1       240            1                      3                     3   
2      1425            1                      4                     2   
3       780            1                      6                     2   
4       170            0                      0                     0   

   Label_encoded_Destination Label_encoded_Route  Freq_encoded_Airline  \
0                          1                 104              0.353661   
1                          0                  90              0.195278   
2                          1                 104              0.353661   
3                          1                 104              0.114318   
4                          2                  18              0.030491   

   Freq_encoded_Source  Freq_encoded_Destination Freq_encoded_Route  \
0       