# Final predictions

## Libraries

In [1]:
# Libraries
import warnings
warnings.filterwarnings(action = 'ignore')

## Basic libraries
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import math

## Plotting
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.metrics import mean_squared_log_error, make_scorer, mean_squared_error

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline

from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import QuantileTransformer 
from sklearn.compose import make_column_transformer

## Functions

In [2]:
def check_duplicates(df):
    # Check number of rows before removing duplicates
    print(f"Number of rows : {len(df)}")

    # Compute the number of duplicated rows
    num_dups = df.duplicated().sum()
    
    print(f"Number of duplicated rows : {num_dups}")

    if df.duplicated().any():
        # Remove duplicates
        df_no_duplicates = df.drop_duplicates()
        print(f"{num_dups} duplicated row(s) removed")
        return df_no_duplicates
    else:
        return "No duplicated rows found !"  

In [3]:
# load the dataset
def load_dataset(filename, target):
    # load the dataset
    data = pd.read_csv(filename, index_col='id') 

    #Check duplicates (Any duplicated rows are dropped)
    data_no_dups = check_duplicates(data)
    
    # split into input and output variables
    X = data_no_dups.drop(columns=[target])
    y = data_no_dups[[target]]

    # Display shapes
    display(f"Shape of X : {X.shape}")
    display(f"Shape of y : {y.shape}")
    
    return X, y

In [4]:
# Prepare target
def preprocess_output_data(y):
    # Apply log1p transform to y
    y_log = np.log1p(y)
    return y_log

## Predictions

### Using Train_data

In [5]:
# Paths
calorie_train_data_path = "../data/raw_data/train.csv"
target_name = "Calories"

# Load dataset
X,y = load_dataset(calorie_train_data_path,target_name)

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1) 

# Preprocess output data
y_train_log = preprocess_output_data(y_train)

# Prepare features columns names
categorical_columns = X_train.select_dtypes(exclude='number').columns.tolist()
numerical_columns = X_train.select_dtypes(include='number').columns.tolist()

# Prepare Column Transformer (input features)
preproc = make_column_transformer((OneHotEncoder(drop='if_binary', sparse_output=False),categorical_columns),
                                  (QuantileTransformer(n_quantiles=10, output_distribution='uniform'), numerical_columns))

# define and configure the model
pipeline = make_pipeline(preproc, DecisionTreeRegressor(random_state=1))

# Fit pipeline
pipeline.fit(X_train, y_train_log)

# Predict and inverse log
y_pred_log = pipeline.predict(X_test)
y_pred = np.expm1(y_pred_log)
# y_pred = np.trunc(y_pred * 1000) / 1000
y_pred = pd.DataFrame(y_pred, index=y_test.index, columns=['Predictions'])
display(y_pred)
display(y_test)

Number of rows : 750000
Number of duplicated rows : 2841
2841 duplicated row(s) removed


'Shape of X : (747159, 7)'

'Shape of y : (747159, 1)'

Unnamed: 0_level_0,Predictions
id,Unnamed: 1_level_1
102083,64.0
135855,146.0
81092,86.0
112227,85.0
147816,178.0
...,...
299109,20.0
91423,53.0
104015,65.0
86823,21.0


Unnamed: 0_level_0,Calories
id,Unnamed: 1_level_1
102083,65.0
135855,147.0
81092,84.0
112227,75.0
147816,177.0
...,...
299109,18.0
91423,50.0
104015,66.0
86823,21.0


### Using Test_data

I will train the pipeline on the entire train set. And Then I will predict the test set.

In [6]:
# Paths
calorie_train_data_path = "../data/raw_data/train.csv"
calorie_test_data_path = "../data/raw_data/test.csv"
target_name = "Calories"

# Load TRAIN dataset (Features AND target)
X_train,y_train = load_dataset(calorie_train_data_path,target_name)

# load TEST dataset (ONLY  FEATURES)
X_test = pd.read_csv(calorie_test_data_path, index_col='id') 

# Preprocess output data
y_train_log = preprocess_output_data(y_train)

# Prepare features columns names
categorical_columns = X_train.select_dtypes(exclude='number').columns.tolist()
numerical_columns = X_train.select_dtypes(include='number').columns.tolist()

# Prepare Column Transformer (input features)
preproc = make_column_transformer((OneHotEncoder(drop='if_binary', sparse_output=False),categorical_columns),
                                  (QuantileTransformer(n_quantiles=10, output_distribution='uniform'), numerical_columns))

# define and configure the model
pipeline = make_pipeline(preproc, DecisionTreeRegressor(random_state=1))
DecisionTreeRegressor()
# Fit pipeline
pipeline.fit(X_train, y_train_log)

# Predict and inverse log
y_pred_log = pipeline.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_pred = np.round(y_pred,4)
# y_pred = np.trunc(y_pred * 1000) / 1000
y_pred = pd.DataFrame(y_pred, index=X_test.index, columns=['Calories'])
display(y_pred)

Number of rows : 750000
Number of duplicated rows : 2841
2841 duplicated row(s) removed


'Shape of X : (747159, 7)'

'Shape of y : (747159, 1)'

Unnamed: 0_level_0,Calories
id,Unnamed: 1_level_1
750000,27.0
750001,103.0
750002,88.0
750003,133.0
750004,78.0
...,...
999995,26.0
999996,8.0
999997,75.0
999998,166.0


In [7]:
y_pred.shape

(250000, 1)

# Save predictions

In [8]:
# Generate filename
filename = "submission_calories.csv"
now = datetime.now()
dated_filename = now.strftime("%Y%m%d_%H-%M-%S_"+filename)
saving_path = "../data/processed_data/" + dated_filename
y_pred.to_csv(saving_path, index=True, sep=',', encoding='utf-8')