In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
# For Preprocessing part
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

# For Modelling part
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor

# For Evaluation and Tuning part
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score 
from sklearn.model_selection import cross_val_score
import optuna


In [3]:
data = pd.read_csv('car_last.csv')
df = data.copy()

In [4]:
data

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.90,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.242170,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...,...
6430,45,28-09-2012,713173.95,0,64.88,3.997,192.013558,8.684
6431,45,05-10-2012,733455.07,0,64.89,3.985,192.170412,8.667
6432,45,12-10-2012,734464.36,0,54.47,4.000,192.327265,8.667
6433,45,19-10-2012,718125.53,0,56.47,3.969,192.330854,8.667


In [8]:
data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)

In [10]:
data['Date'].max()

Timestamp('2012-10-26 00:00:00')

In [4]:
data.describe(include = 'all')

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,6435.0,6435,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0
unique,,143,,,,,,
top,,05-02-2010,,,,,,
freq,,45,,,,,,
mean,23.0,,1046965.0,0.06993,60.663782,3.358607,171.578394,7.999151
std,12.988182,,564366.6,0.255049,18.444933,0.45902,39.356712,1.875885
min,1.0,,209986.2,0.0,-2.06,2.472,126.064,3.879
25%,12.0,,553350.1,0.0,47.46,2.933,131.735,6.891
50%,23.0,,960746.0,0.0,62.67,3.445,182.616521,7.874
75%,34.0,,1420159.0,0.0,74.94,3.735,212.743293,8.622


In [5]:
data.isnull().sum()

Store           0
Date            0
Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
dtype: int64

In [6]:
class DateModification(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, data, y=None):
        return self

    def transform(self, data, y=None):
        data['day'] = pd.to_datetime(data['Date'], errors='coerce').dt.day  # Handle potential date errors
        data['month'] = pd.to_datetime(data['Date'], errors='coerce').dt.month
        data['year'] = pd.to_datetime(data['Date'], errors='coerce').dt.year
        return data.drop(columns=['Date'], inplace=False)  # Don't modify in-place


In [7]:
class OutlierCapping(BaseEstimator, TransformerMixin):
    def __init__(self, factor):
        self.factor = factor

    def fit(self, data, y = None):
        self.q1 = data.quantile(0.25, numeric_only = True)
        self.q3 = data.quantile(0.75, numeric_only = True)

        self.IQR = self.q3 - self.q1 
        self.Lower = self.q1 - self.factor * self.IQR
        self.Upper = self.q3 + self.factor * self.IQR

        return self
        
    def transform(self, data, y = None):
        for column in data.columns:
            data[column] = np.where(data[column] > self.Upper[column], self.Upper[column], data[column])
            data[column] = np.where(data[column] < self.Lower[column], self.Lower[column], data[column])
        return data

In [8]:
class TargetCorrelation(BaseEstimator, TransformerMixin):
    def __init__(self, threshold, target):
        self.threshold = threshold
        self.target = target
        
    def fit(self, data, y = None):
        data['Weekly_Sales'] = y
        corr_matrix = data.corr(method='spearman', numeric_only = True)
        target_correlations = corr_matrix[self.target].abs() > self.threshold
        
        variables_explaining_target = target_correlations[target_correlations].index.tolist()
    
        variables_explaining_target.remove(self.target)
        
        self.result_explaining_target = pd.DataFrame(
        {'Variable': variables_explaining_target,
        'Correlation with Target': corr_matrix.loc[variables_explaining_target, self.target]})
        
        self.result_explaining_target.reset_index(drop=True, inplace=True)
        
        return self

    def transform(self, data, y = None):
        return data[self.result_explaining_target.Variable]

In [9]:
pipeline = Pipeline([
    ('Date_Modification', DateModification())
    ('outlier_capper', OutlierCapping(factor=1.5)),
    ('Target_Correlation', TargetCorrelation(threshold = 0.01, target = 'Weekly_Sales')),
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

In [10]:
input = data.drop(columns = ['Weekly_Sales'])
output = data['Weekly_Sales']

X_train, X_test, y_train, y_test = train_test_split(input, output, test_size=0.2, random_state=42)

In [11]:
pipeline.fit(X_train, y_train)

In [12]:
y_pred = pipeline.predict(X_test)
r2_score = metrics.r2_score(y_test, y_pred)
print(f"R2_score: {r2_score}")

R2_score: 0.14580148149111927


# Other Models

In [13]:
class DateModification(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, data, y=None):
        return self

    def transform(self, data, y=None):
        data['day'] = pd.to_datetime(data['Date'], errors='coerce').dt.day  # Handle potential date errors
        data['month'] = pd.to_datetime(data['Date'], errors='coerce').dt.month
        data['year'] = pd.to_datetime(data['Date'], errors='coerce').dt.year
        return data.drop(columns=['Date'], inplace=False)  # Don't modify in-place


In [14]:
input = df.drop(columns = ['Weekly_Sales'])
output = df['Weekly_Sales']

X_train, X_test, y_train, y_test = train_test_split(input, output, test_size=0.2, random_state=42)

In [15]:
models = {
    "xgb": XGBRegressor(),
    "cat": CatBoostRegressor(),
    "lgbm": LGBMRegressor(),
    "rf": RandomForestRegressor()
}

In [16]:
r2_score = []

for model_name, model in models.items():
    pipeline_models = Pipeline([
        ('Date_modification', DateModification()),
        ("scaler", StandardScaler()),
        (model_name, model)
    ])

    pipeline_models.fit(X_train, y_train)
    y_pred = pipeline_models.predict(X_test)
    y_pred_2 = pipeline_models.predict(X_train)
    r2_score.append([model_name, metrics.r2_score(y_train, y_pred_2), metrics.r2_score(y_test, y_pred)])


  data['day'] = pd.to_datetime(data['Date'], errors='coerce').dt.day  # Handle potential date errors
  data['month'] = pd.to_datetime(data['Date'], errors='coerce').dt.month
  data['year'] = pd.to_datetime(data['Date'], errors='coerce').dt.year
  data['day'] = pd.to_datetime(data['Date'], errors='coerce').dt.day  # Handle potential date errors
  data['month'] = pd.to_datetime(data['Date'], errors='coerce').dt.month
  data['year'] = pd.to_datetime(data['Date'], errors='coerce').dt.year
  data['day'] = pd.to_datetime(data['Date'], errors='coerce').dt.day  # Handle potential date errors
  data['month'] = pd.to_datetime(data['Date'], errors='coerce').dt.month
  data['year'] = pd.to_datetime(data['Date'], errors='coerce').dt.year


Learning rate set to 0.053042
0:	learn: 549942.3740904	total: 182ms	remaining: 3m 1s
1:	learn: 537645.4758019	total: 188ms	remaining: 1m 33s
2:	learn: 526598.2866273	total: 195ms	remaining: 1m 4s
3:	learn: 514396.4936246	total: 202ms	remaining: 50.3s
4:	learn: 504070.9395663	total: 211ms	remaining: 42s
5:	learn: 496121.5785061	total: 218ms	remaining: 36.1s
6:	learn: 486766.9180850	total: 225ms	remaining: 31.9s
7:	learn: 479375.8764765	total: 231ms	remaining: 28.6s
8:	learn: 469017.6688638	total: 237ms	remaining: 26.1s
9:	learn: 458427.7354644	total: 243ms	remaining: 24.1s
10:	learn: 449712.9979825	total: 249ms	remaining: 22.4s
11:	learn: 442996.9832451	total: 254ms	remaining: 21s
12:	learn: 435347.2862735	total: 260ms	remaining: 19.7s
13:	learn: 429285.6392617	total: 267ms	remaining: 18.8s
14:	learn: 423191.2213241	total: 273ms	remaining: 18s
15:	learn: 418451.6233034	total: 279ms	remaining: 17.2s
16:	learn: 413239.2143811	total: 285ms	remaining: 16.5s
17:	learn: 407882.2731646	total: 

  data['day'] = pd.to_datetime(data['Date'], errors='coerce').dt.day  # Handle potential date errors
  data['month'] = pd.to_datetime(data['Date'], errors='coerce').dt.month
  data['year'] = pd.to_datetime(data['Date'], errors='coerce').dt.year
  data['day'] = pd.to_datetime(data['Date'], errors='coerce').dt.day  # Handle potential date errors
  data['month'] = pd.to_datetime(data['Date'], errors='coerce').dt.month
  data['year'] = pd.to_datetime(data['Date'], errors='coerce').dt.year


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000939 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 5148, number of used features: 9
[LightGBM] [Info] Start training from score 1044996.414472


  data['day'] = pd.to_datetime(data['Date'], errors='coerce').dt.day  # Handle potential date errors
  data['month'] = pd.to_datetime(data['Date'], errors='coerce').dt.month
  data['year'] = pd.to_datetime(data['Date'], errors='coerce').dt.year
  data['day'] = pd.to_datetime(data['Date'], errors='coerce').dt.day  # Handle potential date errors
  data['month'] = pd.to_datetime(data['Date'], errors='coerce').dt.month
  data['year'] = pd.to_datetime(data['Date'], errors='coerce').dt.year
  data['day'] = pd.to_datetime(data['Date'], errors='coerce').dt.day  # Handle potential date errors
  data['month'] = pd.to_datetime(data['Date'], errors='coerce').dt.month
  data['year'] = pd.to_datetime(data['Date'], errors='coerce').dt.year


In [17]:
print('Model Name    Train Score    Test Score')
for model_name, train_score, test_score in r2_score:
    print(f'{model_name}\t\t{train_score.round(2)}\t\t{test_score.round(2)}')

Model Name    Train Score    Test Score
xgb		1.0		0.76
cat		0.99		0.91
lgbm		0.99		0.92
rf		0.99		0.92


In [18]:
pipeline_model = Pipeline([
        ('Date_modification', DateModification()),
        ("scaler", StandardScaler()),
        ('LGBM', LGBMRegressor())
    ])

In [19]:
pipeline_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000487 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 5148, number of used features: 9
[LightGBM] [Info] Start training from score 1044996.414472


  data['day'] = pd.to_datetime(data['Date'], errors='coerce').dt.day  # Handle potential date errors
  data['month'] = pd.to_datetime(data['Date'], errors='coerce').dt.month
  data['year'] = pd.to_datetime(data['Date'], errors='coerce').dt.year
