# Modeling

In [1]:
# Libraries
## Basic libraries
import numpy as np
import pandas as pd
import seaborn as sns

import math

## Plotting
import matplotlib.pyplot as plt

# Basic cleaning
## Variance
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.metrics import mean_squared_log_error, make_scorer, mean_squared_error
from scipy.stats import iqr

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline, make_union
from sklearn.compose import make_column_transformer, TransformedTargetRegressor

In [2]:
def check_duplicates(df):
    """Report and drop fully duplicated rows of a DataFrame.

    Prints the row count, the number of duplicated rows and, when any are
    found, how many were removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is never modified in place.

    Returns
    -------
    pandas.DataFrame
        A new frame keeping the first occurrence of every row when
        duplicates exist; otherwise the input frame itself (same object).
        A DataFrame is returned in both cases so callers can keep working
        with the result without type checks.
    """
    # Check number of rows before removing duplicates
    print(f"Number of rows : {len(df)}")

    # Compute the duplicate mask once and reuse it for counting and filtering
    dup_mask = df.duplicated()
    num_dups = int(dup_mask.sum())

    print(f"Number of duplicated rows : {num_dups}")

    if num_dups == 0:
        # Nothing to drop: report it, but still hand back a DataFrame
        print("No duplicated rows found !")
        return df

    # Keep the first occurrence of each row (what drop_duplicates() does by default)
    df_no_duplicates = df[~dup_mask]
    print(f"{num_dups} duplicated row(s) removed")
    return df_no_duplicates

In [3]:
# load the dataset
def load_dataset(filename, target):
    """Load a CSV file, drop duplicated rows and split features from target.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read. It must contain an ``id`` column, which is used
        as the index.
    target : str
        Name of the column to predict.

    Returns
    -------
    X : pandas.DataFrame
        Every column except ``target``.
    y : pandas.DataFrame
        Single-column frame holding ``target`` (kept two-dimensional).
    """
    # load the dataset
    data = pd.read_csv(filename, index_col='id')

    # Check duplicates (any duplicated rows are dropped)
    data_no_dups = check_duplicates(data)

    # Guard: if the helper hands back anything other than a DataFrame
    # (e.g. a status message when nothing was removed), keep the loaded frame
    # instead of failing on the .drop() call below.
    if not isinstance(data_no_dups, pd.DataFrame):
        data_no_dups = data

    # split into input and output variables
    X = data_no_dups.drop(columns=[target])
    y = data_no_dups[[target]]

    # Display shapes
    display(f"Shape of X : {X.shape}")
    display(f"Shape of y : {y.shape}")

    return X, y

In [4]:
# Define the root mean squared logarithmic error
def root_mean_squared_log_error(y_true, y_pred):
    """Return the root mean squared logarithmic error (RMSLE).

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    float
        Square root of scikit-learn's ``mean_squared_log_error``.
    """
    # The `squared` keyword is deliberately not passed: its default already
    # yields the (non-rooted) MSLE, and the keyword was deprecated and later
    # removed from scikit-learn, where passing it raises a TypeError.
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Define a RMSLE scorer
# greater_is_better=False makes the scorer return the negated RMSLE, so that
# "higher is better" holds for model-selection utilities.
rmsle_scorer = make_scorer(root_mean_squared_log_error, greater_is_better=False)

In [5]:
def preprocess_input_data(X):
    """Build the (unfitted) column transformer for the input features.

    Non-numeric columns of ``X`` are one-hot encoded (one column dropped for
    binary categories, dense output) and numeric columns are min-max scaled.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; only its column dtypes are inspected here.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Transformer wired to the column names found in ``X``.
    """
    # Split the column names by dtype
    non_numeric_cols = X.select_dtypes(exclude='number').columns.tolist()
    numeric_cols = X.select_dtypes(include='number').columns.tolist()

    # One transformer per column group
    one_hot = OneHotEncoder(drop='if_binary', sparse_output=False)
    scaler = MinMaxScaler()

    return make_column_transformer(
        (one_hot, non_numeric_cols),
        (scaler, numeric_cols),
    )

In [6]:
# def preprocess_input_data(X):
#     # Prepare features columns names
#     categorical_columns = X.select_dtypes(exclude='number').columns.tolist()
#     numerical_columns = X.select_dtypes(include='number').columns.tolist()

#     # Prepare Column Transformer (input features)
#     preproc = make_column_transformer((OneHotEncoder(drop='if_binary', sparse_output=False),categorical_columns),(RobustScaler(), numerical_columns))

#     return preproc

In [7]:
# Prepare target
def preprocess_output_data(y):
    """Return ``log(1 + y)`` computed element-wise with ``numpy.log1p``.

    Parameters
    ----------
    y : array-like or scalar
        Target values to transform.

    Returns
    -------
    Same container kind that ``numpy.log1p`` yields for ``y``.
    """
    return np.log1p(y)

In [8]:
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` with the notebook's RMSLE scorer.

    Uses 10-fold cross-validation repeated 3 times with a fixed seed, running
    the folds on all available cores.

    Parameters
    ----------
    model : estimator
        Estimator or pipeline to evaluate.
    X, y
        Features and target passed straight to ``cross_val_score``.

    Returns
    -------
    Scores as returned by ``cross_val_score``. Because ``rmsle_scorer`` is
    built with ``greater_is_better=False``, the values are negated errors.
    """
    repeated_folds = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model,
        X,
        y,
        scoring=rmsle_scorer,
        cv=repeated_folds,
        n_jobs=-1,
    )

In [9]:
def report_model_performance(scores):
    """Summarise cross-validation scores as (mean error, standard deviation).

    Parameters
    ----------
    scores : array-like
        Negated error values, one per cross-validation fold.

    Returns
    -------
    tuple
        ``(mean_rmsle, std_rmsle)``: the mean with its sign flipped back to a
        positive error, and the standard deviation of the scores.
    """
    # Use the `scores` argument, not a notebook-level variable, so the
    # function does not depend on hidden kernel state.
    mean_rmsle = -np.mean(scores)
    std_rmsle = np.std(scores)
    return mean_rmsle, std_rmsle

In [10]:
# Get model (preprocessing + actual model)
def get_models(X_train):
    """Return an unfitted pipeline: input preprocessing followed by a linear regression.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features, forwarded to ``preprocess_input_data`` to build
        the preprocessing step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``preprocess_input_data(X_train)`` chained with ``LinearRegression()``.
    """
    return make_pipeline(preprocess_input_data(X_train), LinearRegression())

In [None]:
# SGD Regressor
# Lasso
# Elasticnet
# SVR(kernel='rbf')
# Ridge regressor
# SVR(kernel='linear')
# Decision tree regressor

In [15]:
# Location of the training data and name of the column to predict
calorie_train_data_path = "../data/raw_data/train.csv"
target_name = "Calories"

# Read the CSV, drop duplicated rows and separate features from target
X, y = load_dataset(calorie_train_data_path, target_name)

# Hold out 33% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

# log1p-transform the training target
y_train_log = preprocess_output_data(y_train)

# Build the unfitted preprocessing + regression pipeline and render it
pipeline = get_models(X_train)
display(pipeline)

# Cross-validated evaluation (currently disabled).
# NOTE(review): y_train_log is already log1p-transformed, while rmsle_scorer
# applies mean_squared_log_error on top of it, so the log would be taken
# twice. Confirm whether a plain RMSE on the log target was intended.
# n_scores = evaluate_model(pipeline, X_train, y_train_log)

# Report model performance (disabled together with the evaluation above)
# mean_rmsle, std_rmsle = report_model_performance(n_scores)
# print(f"RMSLE: {mean_rmsle:.7f} ({std_rmsle:.3f})")

Number of rows : 750000
Number of duplicated rows : 2841
2841 duplicated row(s) removed


'Shape of X : (747159, 7)'

'Shape of y : (747159, 1)'

In [13]:
# # Fit pipeline
# pipeline.fit(X_train, y_train_log)

# # Predict and inverse log
# y_pred_log = pipeline.predict(X_test)
# y_pred = np.expm1(y_pred_log)
# y_pred = pd.DataFrame(y_pred, index=y_test.index)

In [14]:
# y_test