In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing # One-hot-Encoder y LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
#import opendatasets as od # Download of kaggle od.download(url)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
sns.set_style('darkgrid')
plt.rcParams['font.size'] = 8
plt.rcParams['figure.facecolor'] = '#00000000'

In [None]:
# get data
ross_df = pd.read_csv("./dataset/train.csv", low_memory=False)
ross_df.head()

In [None]:
store_df = pd.read_csv("./dataset/store.csv")
store_df.head()

In [None]:
# merge data
merged_df = ross_df.merge(store_df, how="left", on="Store")
merged_df.head()

In [None]:
merged_df.shape

In [None]:
test_df = pd.read_csv("./dataset/test.csv")
test_df.head()

In [None]:
merged_test_df = test_df.merge(store_df, how="left", on="Store")
merged_test_df.head()

In [None]:
merged_test_df.shape

In [None]:
# Cleaning Data
merged_df.info()

In [None]:
round(merged_df.describe().T, 2) # .T transposes the summary so each column's statistics form one row

In [None]:
merged_df.duplicated().sum()

In [None]:
merged_df["Date"] = pd.to_datetime(merged_df.Date)
merged_test_df["Date"] = pd.to_datetime(merged_test_df.Date)

merged_df.Date.min(), merged_df.Date.max()

In [None]:
merged_df.Date.head()

In [None]:
# EDA
sns.histplot(merged_df, x="Sales");

In [None]:
# 172817 rows have Open == 0
merged_df.Open.value_counts()

In [None]:
# select only Open == 1
merged_df = merged_df[merged_df.Open==1].copy()
sns.histplot(merged_df, x="Sales");

In [None]:
# Plot a random subset to keep the scatter readable and fast.
# random_state pins the subset so the figure is identical on every
# Restart & Run All; min() guards against frames smaller than the sample size.
plt.figure(figsize=(18,8))
temp_df = merged_df.sample(min(40000, len(merged_df)), random_state=42)
sns.scatterplot(x=temp_df.Sales, y=temp_df.Customers, hue=temp_df.Date.dt.year, alpha=0.8)
plt.title("Sales Vs Customers");

In [None]:
# Plot a random subset of rows, coloured by year.
# random_state pins the subset so the figure is reproducible across runs;
# min() guards against frames smaller than the requested sample size.
plt.figure(figsize=(18,8))
temp_df = merged_df.sample(min(10000, len(merged_df)), random_state=42)
sns.scatterplot(x=temp_df.Store, y=temp_df.Sales, hue=temp_df.Date.dt.year, alpha=0.8)
plt.title("Stores Vs Sales")
plt.show()

In [None]:
sns.barplot(merged_df, x="DayOfWeek", y="Sales", hue="DayOfWeek");

In [None]:
sns.barplot(merged_df, x="Promo", y="Sales", hue="Promo");

In [None]:
merged_df.select_dtypes(include="number").corr()

In [None]:
# Feature Engineering
merged_df['Day'] = merged_df.Date.dt.day
merged_df['Month'] = merged_df.Date.dt.month
merged_df['Year'] = merged_df.Date.dt.year

merged_test_df['Day'] = merged_test_df.Date.dt.day
merged_test_df['Month'] = merged_test_df.Date.dt.month
merged_test_df['Year'] = merged_test_df.Date.dt.year

sns.barplot(data=merged_df, x='Year', y='Sales', hue="Year");

In [None]:
sns.barplot(data=merged_df, x='Month', y='Sales', hue="Month");

In [None]:
# training/test/validation split
len(merged_df)

In [None]:
# Chronological hold-out: order the rows by date, keep the earliest 75%
# for training and the remaining, most recent 25% for validation.
sorted_df = merged_df.sort_values(by='Date')
train_size = int(.75 * len(merged_df))

train_df = sorted_df.iloc[:train_size]
val_df = sorted_df.iloc[train_size:]

len(train_df), len(val_df)

In [None]:
train_df.columns

In [None]:
# Input and Target
input_cols = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'StoreType', 'Assortment', 'Day', 'Month', 'Year']
target_col = "Sales"

merged_df[input_cols].nunique()

In [None]:
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()

val_inputs = val_df[input_cols].copy()
val_targets = val_df[target_col].copy()

test_inputs = merged_test_df[input_cols].copy()

numeric_cols = ['Store', 'Day', 'Month', 'Year']
categorical_cols = ['DayOfWeek', 'Promo', 'StateHoliday', 'StoreType', 'Assortment']

In [None]:
# Imputation, Scaling and Encode
imputer = SimpleImputer(strategy="mean").fit(train_inputs[numeric_cols])

train_inputs[numeric_cols] = imputer.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = imputer.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = imputer.transform(test_inputs[numeric_cols])

# MinMaxScaler
scaler = MinMaxScaler().fit(train_inputs[numeric_cols])

train_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = scaler.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

# OneHotEncoder
encoder = preprocessing.OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(train_inputs[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
val_inputs[encoded_cols] = encoder.transform(val_inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

# numeric data
X_train = train_inputs[numeric_cols + encoded_cols]
X_val = val_inputs[numeric_cols + encoded_cols]
X_test = test_inputs[numeric_cols + encoded_cols]

In [None]:
# Create quick & easy baseline models to benchmark future models Fixed/Random Guess
# Let's define a model that always returns the mean value of Sales as the prediction.
def return_mean(inputs, targets=None):
    """Baseline model: predict the same mean value for every row.

    Parameters
    ----------
    inputs : sized
        Only ``len(inputs)`` is used, to size the output.
    targets : array-like, optional
        Values whose mean is used as the constant prediction. Defaults to
        the notebook-level ``train_targets`` so the baseline is computed
        from training data only and never sees the validation rows.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(inputs)`` filled with the mean.
    """
    if targets is None:
        # Resolved at call time; using the training split avoids leaking
        # validation sales into the benchmark.
        targets = train_targets
    return np.full(len(inputs), float(np.mean(targets)))

train_preds = return_mean(X_train)
train_preds

In [None]:
root_mean_squared_error(train_preds, train_targets)

In [None]:
root_mean_squared_error(return_mean(X_val), val_targets)

In [None]:
# Baseline: guess a random value between the lowest and highest sale
def guess_random(inputs, lo=None, hi=None, rng=None):
    """Baseline model: predict a uniformly random value in ``[lo, hi)`` per row.

    Parameters
    ----------
    inputs : sized
        Only ``len(inputs)`` is used, to size the output.
    lo, hi : float, optional
        Bounds of the random guesses. Default to the minimum and maximum of
        the notebook-level ``train_targets`` so the baseline is derived
        from training data only.
    rng : int or numpy.random.Generator, optional
        Seed or generator for reproducible guesses. When omitted, the
        global NumPy random state is used.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(inputs)``.
    """
    if lo is None:
        lo = train_targets.min()
    if hi is None:
        hi = train_targets.max()
    n = len(inputs)
    if rng is None:
        draws = np.random.random(n)
    else:
        # default_rng accepts either a seed or an existing Generator.
        draws = np.random.default_rng(rng).random(n)
    return draws * (hi - lo) + lo

train_preds = guess_random(X_train)
train_preds

In [None]:
root_mean_squared_error(train_preds, train_targets)

In [None]:
root_mean_squared_error(guess_random(X_val), val_targets)

In [None]:
# ML model
linreg = LinearRegression().fit(X_train, train_targets)
train_preds = linreg.predict(X_train)
train_preds

In [None]:
root_mean_squared_error(train_targets,train_preds)

In [None]:
root_mean_squared_error(linreg.predict(X_val), val_targets)

## Linear Models

In [None]:
# Pick a strategy, train a model & tune hyperparameters
def try_model(model):
    """Fit ``model`` on the training split and score it on both splits.

    The model is fitted in place on ``X_train`` / ``train_targets``.

    Returns
    -------
    tuple
        ``(train_rmse, val_rmse)``: root mean squared error on the
        training split and on the validation split.
    """
    model.fit(X_train, train_targets)

    def rmse_on(features, actual):
        # Score the fitted model's predictions against the true values.
        return root_mean_squared_error(actual, model.predict(features))

    return rmse_on(X_train, train_targets), rmse_on(X_val, val_targets)

In [None]:
# LinearRegression
try_model(LinearRegression())

In [None]:
# Ridge
try_model(Ridge())

In [None]:
# Lasso
try_model(Lasso())

In [None]:
# ElasticNet
try_model(ElasticNet())

In [None]:
# SGDRegressor
try_model(SGDRegressor())

In [None]:
# tree
tree = DecisionTreeRegressor(random_state=42)
try_model(tree)

In [None]:
plt.figure(figsize=(40,20))
plot_tree(tree, max_depth=3, filled=True, feature_names=numeric_cols+encoded_cols);

In [None]:
# Random Forest
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
try_model(rf)

In [None]:
rf.feature_importances_

In [None]:
importance_df = pd.DataFrame({'feature': numeric_cols+encoded_cols,
                              'importance': rf.feature_importances_}).sort_values('importance', ascending=False)
importance_df.head(10)

In [None]:
sns.barplot(data=importance_df.head(10), x='importance', y='feature');

In [None]:
# Looking at individual predictions
def predict_input(model, single_input):
    """Predict sales for one record given as a mapping of raw column values.

    A record with ``Open == 0`` returns ``0.`` without consulting the model.
    Otherwise the record goes through the same steps as the training data:
    date parts are extracted, then the fitted ``imputer``, ``scaler`` and
    ``encoder`` are applied, and ``model.predict`` is called on the result.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    single_input : mapping
        Must contain ``'Open'``, ``'Date'`` and the raw feature columns.

    Returns
    -------
    The model's prediction for the single row, or ``0.`` if the store is closed.
    """
    # Guard clause: closed stores have no sales.
    if single_input['Open'] == 0:
        return 0.

    row = pd.DataFrame([single_input])

    # Derive the calendar features from the parsed date.
    row['Date'] = pd.to_datetime(row['Date'])
    date_parts = row['Date'].dt
    row['Day'] = date_parts.day
    row['Month'] = date_parts.month
    row['Year'] = date_parts.year

    # Apply the already-fitted transformers in the same order as in training.
    row[numeric_cols] = imputer.transform(row[numeric_cols])
    row[numeric_cols] = scaler.transform(row[numeric_cols])
    row[encoded_cols] = encoder.transform(row[categorical_cols])

    feature_names = numeric_cols + encoded_cols
    return model.predict(row[feature_names])[0]

# one input
sample_input = {'Id': 1, 'Store': 1, 'DayOfWeek': 4, 'Date': '2015-09-17 00:00:00',
 'Open': 1.0, 'Promo': 1, 'StateHoliday': '0', 'SchoolHoliday': 0, 'StoreType': 'c',
 'Assortment': 'a', 'CompetitionDistance': 1270.0, 'CompetitionOpenSinceMonth': 9.0,
 'CompetitionOpenSinceYear': 2008.0, 'Promo2': 0, 'Promo2SinceWeek': np.nan,
 'Promo2SinceYear': np.nan, 'PromoInterval': np.nan}

sample_input

In [None]:
predict_input(rf, sample_input)

In [None]:
# Predict sales for every test row.
test_preds = rf.predict(X_test)
# The model was trained on open stores only, so force the prediction to 0
# for rows where the store is closed (Open == 0), matching predict_input().
# X_test was built from merged_test_df without reordering, so rows align.
# Rows with a missing Open value compare unequal to 0 and keep the model output.
test_preds = np.where(merged_test_df.Open.to_numpy() == 0, 0., test_preds)
test_preds