# Modeling

In [1]:
# Libraries
import warnings
warnings.filterwarnings(action = 'ignore')

## Basic libraries
import numpy as np
import pandas as pd
import seaborn as sns

import math

## Plotting
import matplotlib.pyplot as plt

# Basic cleaning
## Variance
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.metrics import mean_squared_log_error, make_scorer, mean_squared_error
from scipy.stats import iqr

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline, make_union
from sklearn.compose import make_column_transformer, TransformedTargetRegressor
from sklearn.linear_model import SGDRegressor, Lasso, ElasticNet, Perceptron,SGDRegressor,Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import QuantileTransformer 
from sklearn.neighbors import KNeighborsRegressor

In [2]:
def extract_cols_outliers(df):
        """
    Extract the columns with outliers.
    Args:
        df (DataFrame): Raw data
    Returns:
        - a list of numerical columns with outliers
        - a list of nnumerical columns without outliers
    """
        # Identify the columns with outliers
        numerical_columns_w_outliers = []
        numerical_columns_no_outliers = []
        
        num_df = df.select_dtypes(include='number')
        numerical_columns = num_df.columns
        
        for col in numerical_columns: 
            # Calculate IQR
            iqr_value = iqr(df[col],nan_policy='omit')        
            #Calculate 1st quartile
            q1 = np.percentile(df[col],25)        
            #Calculate 3rd quartile
            q3 = np.percentile(df[col],75)        
            #Calculate lower limit below which data point is considered an outlier
            outlier_lim_low = q1 - 1.5 * iqr_value
        
            #Calculate higher limit above which data point is considered an outlier
            outlier_lim_high = q3 + 1.5 * iqr_value
            
            #Calculate number of 'low' outliers
            outlier_condition_low = df[col] < outlier_lim_low
            number_outliers_low = len(df[outlier_condition_low][col])
            
            #Calculate number of 'high' outliers
            outlier_condition_high = df[col] > outlier_lim_high
            number_outliers_high = len(df[outlier_condition_high][col])
            
            #Calculate total number of outliers
            number_outliers_total = number_outliers_low + number_outliers_high
            
            #If any outliers in column, column is added to a list of columns with outliers
            if number_outliers_total > 0:
                numerical_columns_w_outliers.append(col)
            elif number_outliers_total == 0:
                numerical_columns_no_outliers.append(col)

        return numerical_columns_w_outliers, numerical_columns_no_outliers

In [3]:
def check_duplicates(df):
    # Check number of rows before removing duplicates
    print(f"Number of rows : {len(df)}")

    # Compute the number of duplicated rows
    num_dups = df.duplicated().sum()
    
    print(f"Number of duplicated rows : {num_dups}")

    if df.duplicated().any():
        # Remove duplicates
        df_no_duplicates = df.drop_duplicates()
        print(f"{num_dups} duplicated row(s) removed")
        return df_no_duplicates
    else:
        return "No duplicated rows found !"  

In [4]:
# load the dataset
def load_dataset(filename, target):
    # load the dataset
    data = pd.read_csv(filename, index_col='id') 

    #Check duplicates (Any duplicated rows are dropped)
    data_no_dups = check_duplicates(data)
    
    # split into input and output variables
    X = data_no_dups.drop(columns=[target])
    y = data_no_dups[[target]]

    # Display shapes
    display(f"Shape of X : {X.shape}")
    display(f"Shape of y : {y.shape}")
    
    return X, y

In [5]:
# Define the root mean squared logaritmic error
def root_mean_squared_log_error(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred, squared=True))

# Define a RMSLE scorer
rmsle_scorer = make_scorer(root_mean_squared_log_error, greater_is_better=False)

In [6]:
# def preprocess_input_data(X):
#     # Prepare features columns names
#     categorical_columns = X.select_dtypes(exclude='number').columns.tolist()
#     numerical_columns = X.select_dtypes(include='number').columns.tolist()

#     # Prepare Column Transformer (input features)
#     preproc = make_column_transformer((OneHotEncoder(drop='if_binary', sparse_output=False),categorical_columns),(MinMaxScaler(), numerical_columns))

#     return preproc

In [7]:
# def preprocess_input_data(X):
#     # Prepare features columns names
#     categorical_columns = X.select_dtypes(exclude='number').columns.tolist()
#     numerical_columns = X.select_dtypes(include='number').columns.tolist()

#     # Prepare Column Transformer (input features)
#     preproc = make_column_transformer((OneHotEncoder(drop='if_binary', sparse_output=False),categorical_columns),(RobustScaler(), numerical_columns))

#     return preproc

In [8]:
def preprocess_input_data(X):
    # Prepare features columns names
    categorical_columns = X.select_dtypes(exclude='number').columns.tolist()
    numerical_columns = X.select_dtypes(include='number').columns.tolist()

    # Prepare Column Transformer (input features)
    preproc = make_column_transformer((OneHotEncoder(drop='if_binary', sparse_output=False),categorical_columns),
                                      (QuantileTransformer(n_quantiles=100, output_distribution='uniform'), numerical_columns))
    return preproc

In [9]:
# def preprocess_input_data(X):
#     # Prepare features columns names
#     categorical_columns = X.select_dtypes(exclude='number').columns.tolist()
#     numerical_columns_w_outliers, numerical_columns_no_outliers = extract_cols_outliers(X)

#     # Prepare Column Transformer (input features)
#     preproc = make_column_transformer((OneHotEncoder(drop='if_binary', sparse_output=False),categorical_columns),
#                                       (MinMaxScaler(), numerical_columns_no_outliers),
#                                       (RobustScaler(), numerical_columns_w_outliers))

#     return preproc

In [10]:
# Prepare target
def preprocess_output_data(y):
    # Apply log1p transform to y
    y_log = np.log1p(y)
    return y_log

In [11]:
# evaluate a given model using cross-validation 
def evaluate_model(model, X, y):
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1) 
    scores = cross_val_score(model, X, y, scoring=rmsle_scorer, cv=cv, n_jobs=-1) 
    return scores

In [12]:
def report_model_performance(scores):
    mean_rmsle = -np.mean(n_scores)
    std_rmsle = np.std(n_scores)
    return mean_rmsle, std_rmsle

In [13]:
# Get model (preprocessing + actual model)
def get_models(X_train, models_dict):
    # Preprocess input data
    preproc = preprocess_input_data(X_train)

    # define and configure the model
    pipeline = make_pipeline(preproc,model)
    
    return pipeline

In [14]:
# # Get model (preprocessing + actual model)
# def get_models(X_train):
#     # Preprocess input data
#     preproc = preprocess_input_data(X_train)

#     # define and configure the model
#     model = LinearRegression()
#     pipeline = make_pipeline(preproc,model)
#     return pipeline

In [15]:
# # Paths
# calorie_train_data_path = "../data/raw_data/train.csv"
# target_name = "Calories"

# # Load dataset
# X,y = load_dataset(calorie_train_data_path,target_name)

# # split into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1) 

# # Preprocess output data
# y_train_log = preprocess_output_data(y_train)

# # Define pipeline
# models_dict = {
#                 'LinearRegression' : LinearRegression(),
#                 'Ridge': Ridge(random_state=1),
#                 'Lasso' : Lasso(random_state=1),            
#                 'ElasticNet' : ElasticNet(random_state=1),         
#                 'DecisionTreeRegressor' : DecisionTreeRegressor(random_state=1),
#                 'KNeighborsRegressor':KNeighborsRegressor()
#                 # 'SGDRegressor' : SGDRegressor(random_state=1),
#             }

# for model_name, model in models_dict.items():
#     # Get preprocessing and model chained
#     pipeline = get_models(X_train, model)

#     # evaluate a given model using cross-validation
#     n_scores = evaluate_model(pipeline, X_train, y_train_log)

#     # report model performance
#     mean_rmsle, std_rmsle = report_model_performance(n_scores)
#     print(f"> {model_name} RMSLE: {mean_rmsle:.7f} ({std_rmsle:.3f})")

I choose to use **DecisionTreeRegressor()** as it leads to the best results in this case.

In [16]:
# # Paths
# calorie_train_data_path = "../data/raw_data/train.csv"
# target_name = "Calories"

# # Load dataset
# X,y = load_dataset(calorie_train_data_path,target_name)

# # split into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1) 

# # Preprocess output data
# y_train_log = preprocess_output_data(y_train)

# # Prepare features columns names
# categorical_columns = X_train.select_dtypes(exclude='number').columns.tolist()
# numerical_columns = X_train.select_dtypes(include='number').columns.tolist()

# # n_quants = [10, 50, 100, 500, 1000]
# n_quants = np.arange(10, 1000, 100)

# for i, n in enumerate(n_quants):
#         # Prepare Column Transformer (input features)
#         preproc = make_column_transformer((OneHotEncoder(drop='if_binary', sparse_output=False),categorical_columns),
#                                           (QuantileTransformer(n_quantiles=n, output_distribution='uniform'), numerical_columns))

#         # define and configure the model
#         pipeline = make_pipeline(preproc,DecisionTreeRegressor(random_state=1))

#         # evaluate a given model using cross-validation
#         n_scores = evaluate_model(pipeline, X_train, y_train_log)

#         # report model performance
#         mean_rmsle, std_rmsle = report_model_performance(n_scores)
#         print(f"> {n} RMSLE: {mean_rmsle:.7f} ({std_rmsle:.3f})")

In [17]:
# Paths
calorie_train_data_path = "../data/raw_data/train.csv"
target_name = "Calories"

# Load dataset
X,y = load_dataset(calorie_train_data_path,target_name)

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1) 

# Preprocess output data
y_train_log = preprocess_output_data(y_train)

# Prepare features columns names
categorical_columns = X_train.select_dtypes(exclude='number').columns.tolist()
numerical_columns = X_train.select_dtypes(include='number').columns.tolist()

# Prepare Column Transformer (input features)
preproc = make_column_transformer((OneHotEncoder(drop='if_binary', sparse_output=False),categorical_columns),
                                  (QuantileTransformer(n_quantiles=10, output_distribution='uniform'), numerical_columns))

# define and configure the model
pipeline = make_pipeline(preproc,DecisionTreeRegressor(random_state=1))

# evaluate a given model using cross-validation
n_scores = evaluate_model(pipeline, X_train, y_train_log)

# report model performance
mean_rmsle, std_rmsle = report_model_performance(n_scores)
print(f"> RMSLE: {mean_rmsle:.7f} ({std_rmsle:.3f})")

Number of rows : 750000
Number of duplicated rows : 2841
2841 duplicated row(s) removed


'Shape of X : (747159, 7)'

'Shape of y : (747159, 1)'

> RMSLE: 0.0249189 (0.000)


I chose to use **n_quantiles=10** leading to best results in this case.

In [20]:
# Paths
calorie_train_data_path = "../data/raw_data/train.csv"
target_name = "Calories"

# Load dataset
X,y = load_dataset(calorie_train_data_path,target_name)

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1) 

# Preprocess output data
y_train_log = preprocess_output_data(y_train)

# Prepare features columns names
categorical_columns = X_train.select_dtypes(exclude='number').columns.tolist()
numerical_columns = X_train.select_dtypes(include='number').columns.tolist()

# Prepare Column Transformer (input features)
preproc = make_column_transformer((OneHotEncoder(drop='if_binary', sparse_output=False),categorical_columns),
                                  (QuantileTransformer(n_quantiles=10, output_distribution='uniform'), numerical_columns))

# define and configure the model
pipeline = make_pipeline(preproc,DecisionTreeRegressor(random_state=1))

# Fit pipeline
pipeline.fit(X_train, y_train_log)

# Predict and inverse log
y_pred_log = pipeline.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_pred = pd.DataFrame(y_pred, index=y_test.index)
display(y_pred)
display(y_test)

Number of rows : 750000
Number of duplicated rows : 2841
2841 duplicated row(s) removed


'Shape of X : (747159, 7)'

'Shape of y : (747159, 1)'

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
102083,64.0
135855,146.0
81092,86.0
112227,80.0
147816,178.0
...,...
299109,19.0
91423,53.0
104015,65.0
86823,21.0


Unnamed: 0_level_0,Calories
id,Unnamed: 1_level_1
102083,65.0
135855,147.0
81092,84.0
112227,75.0
147816,177.0
...,...
299109,18.0
91423,50.0
104015,66.0
86823,21.0


In [18]:
# # Fit pipeline
# pipeline.fit(X_train, y_train_log)

# # Predict and inverse log
# y_pred_log = pipeline.predict(X_test)
# y_pred = np.expm1(y_pred_log)
# y_pred = pd.DataFrame(y_pred, index=y_test.index)

In [19]:
# y_test