In [1]:
import logging
import matplotlib.pyplot as plt
import sys
import os
import pandas as pd
from datetime import datetime
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split



In [2]:
sys.path.append(os.path.abspath('../scripts'))

In [3]:
from eda_analysis_function import load_and_merge_data, handle_missing_values,setup_logger
from model_prediction import datetime_features_added,scale_features,build_random_forest_pipeline_with_encoding, feature_importance

In [4]:
train_path = '../data/train.csv'
store_path = '../data/store.csv'

In [5]:
# Load and merge datasets
merged_df = load_and_merge_data(train_path, store_path)
if merged_df is not None:
    logging.info("Data loaded successfully in notebook.")
else:
    logging.error("Data loading failed.")

  train_df = pd.read_csv(train_path)


In [6]:
# Clean Data
cleaned_df = handle_missing_values(merged_df)

if cleaned_df is not None:
    logging.info("Data cleaned successfully in notebook.")
else:
    logging.error("Data cleaning failed.")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['CompetitionDistance'].fillna(df['CompetitionDistance'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['CompetitionOpenSinceMonth'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the inter

In [7]:
print(cleaned_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 18 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Store                      1017209 non-null  int64  
 1   DayOfWeek                  1017209 non-null  int64  
 2   Date                       1017209 non-null  object 
 3   Sales                      1017209 non-null  int64  
 4   Customers                  1017209 non-null  int64  
 5   Open                       1017209 non-null  int64  
 6   Promo                      1017209 non-null  int64  
 7   StateHoliday               1017209 non-null  object 
 8   SchoolHoliday              1017209 non-null  int64  
 9   StoreType                  1017209 non-null  object 
 10  Assortment                 1017209 non-null  object 
 11  CompetitionDistance        1017209 non-null  float64
 12  CompetitionOpenSinceMonth  1017209 non-null  float64
 13  CompetitionO

In [8]:
cleaned_df['Date'] = pd.to_datetime(cleaned_df['Date'])
cleaned_df = datetime_features_added(cleaned_df, 'Date')
print(cleaned_df)

         Store  DayOfWeek       Date  Sales  Customers  Open  Promo  \
0            1          4 2015-07-31   5263        555     1      1   
1            2          4 2015-07-31   6064        625     1      1   
2            3          4 2015-07-31   8314        821     1      1   
3            4          4 2015-07-31  13995       1498     1      1   
4            5          4 2015-07-31   4822        559     1      1   
...        ...        ...        ...    ...        ...   ...    ...   
1017204   1111          1 2013-01-01      0          0     0      0   
1017205   1112          1 2013-01-01      0          0     0      0   
1017206   1113          1 2013-01-01      0          0     0      0   
1017207   1114          1 2013-01-01      0          0     0      0   
1017208   1115          1 2013-01-01      0          0     0      0   

        StateHoliday  SchoolHoliday StoreType  ... CompetitionOpenSinceMonth  \
0                  0              1         c  ...                 

In [9]:
# Scale the features
df = scale_features(cleaned_df)


In [10]:
# Split the data into features (X) and target (y)
X = df.drop(columns=['Sales', 'Date'])  # Assuming 'Sales' is the target column
y = df['Sales']

In [11]:
non_numeric_columns = X.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_columns)

Non-numeric columns: Index(['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval'], dtype='object')


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
model, X_test, y_test = build_random_forest_pipeline_with_encoding(X, y)

# Log training completion
logging.info("Random Forest model pipeline training completed.")


In [15]:
# Assuming you have a trained model, and X_test and y_test are defined
y_pred = model.predict(X_test)

# Calculate MAE and MSE
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Print the results
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")

Mean Absolute Error (MAE): 0.08600859239189444
Mean Squared Error (MSE): 0.01927753079220474


In [23]:
    # Get feature names
feature_names = X.columns
    # Calculate feature importance
importance_df = feature_importance(model, feature_names)
print(importance_df)

ValueError: All arrays must be of the same length

In [19]:
# Get feature names
feature_names = X.columns
print("Feature Names:", feature_names.tolist())


Feature Names: ['Store', 'DayOfWeek', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'IsWeekend', 'IsBeginningOfMonth', 'IsMidMonth', 'IsEndOfMonth']


In [21]:
print("Number of Features:", len(feature_names))



Number of Features: 20


In [24]:
# Calculate feature importance
importance_df = feature_importance(model, feature_names)
print(importance_df)

ValueError: All arrays must be of the same length