## Preprocessing

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load data
train_df = pd.read_csv('../assets/datas/train.csv')
store_df = pd.read_csv('../assets/datas/store.csv')

# Merge data
data = pd.merge(train_df, store_df, on='Store')

# Convert Date column to datetime
data['Date'] = pd.to_datetime(data['Date'])

# Extract new features from Date
data['Weekday'] = data['Date'].dt.weekday
data['Weekend'] = data['Weekday'].apply(lambda x: 1 if x >= 5 else 0)
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data['Year'] = data['Date'].dt.year
data['WeekOfYear'] = data['Date'].dt.isocalendar().week

# Example: Days to next holiday (assuming holidays are known)
holidays = pd.to_datetime(['2023-12-25', '2023-01-01'])  # Add more holidays as needed
data['DaysToHoliday'] = data['Date'].apply(lambda x: (holidays - x).days.min())

# Handle NaN values
data.fillna(0, inplace=True)

# Convert categorical columns to numeric
data = pd.get_dummies(data, columns=['StoreType', 'Assortment', 'StateHoliday'])

# Drop columns that won't be used
data.drop(['Date', 'Customers'], axis=1, inplace=True)

# Ensure all columns are numeric
non_numeric_columns = data.select_dtypes(include=['object']).columns
if not non_numeric_columns.empty:
    print(f"Non-numeric columns found: {non_numeric_columns.tolist()}")

# Convert any remaining non-numeric columns to numeric
# Example: Handling 'PromoInterval'
if 'PromoInterval' in data.columns:
    data['PromoInterval'] = data['PromoInterval'].apply(
        lambda x: 0 if x == 0 else len(x.split(',')) if isinstance(x, str) else 0
    )

# Split data into features and target
X = data.drop('Sales', axis=1)
y = data['Sales']

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


  train_df = pd.read_csv('../assets/datas/train.csv')


Non-numeric columns found: ['PromoInterval']


## Building Models with Sklearn Pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')