# Sales Forecasting (Regression)

**Overview:**
- Dataset: retail sales dataset from Kaggle (or any time-series CSV with `date` and `sales` columns)
- Steps: download → load → EDA → feature engineering → train regression model → evaluate


In [None]:
# Optional: fetch the dataset with the Kaggle CLI (needs a configured kaggle.json):
#   kaggle datasets download -d <dataset> -p ./data --unzip

import os

# Make sure the local data directory is present; nothing happens if it already exists.
os.makedirs(name='data', exist_ok=True)
print('Place sales dataset CSV in ./data (e.g., sales.csv)')


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the sales data, falling back to a small synthetic daily series when no
# CSV has been provided. The CSV must contain a `date` and a `sales` column.
csv_path = 'data/sales.csv'
if not os.path.exists(csv_path):
    print('No sales.csv found — creating synthetic dataset for demo')
    # Seeded generator so the demo data (and the reported metrics) are
    # reproducible from run to run.
    rng = np.random.default_rng(42)
    dates = pd.date_range(start='2018-01-01', periods=500, freq='D')
    # Slow sine wave around 1000 plus Gaussian noise, truncated to integers.
    sales = (
        np.sin(np.arange(len(dates)) / 50) * 200
        + 1000
        + rng.standard_normal(len(dates)) * 50
    ).astype(int)
    df = pd.DataFrame({'date': dates, 'sales': sales})
else:
    df = pd.read_csv(csv_path, parse_dates=['date'])

# Quick feature engineering
if 'date' in df.columns:
    # Lag features are only meaningful on chronologically ordered rows; a
    # stable sort leaves already-ordered data untouched.
    df = df.sort_values('date', kind='stable').reset_index(drop=True)
    # Calendar helper column; it is not part of the model's feature set below.
    df['day'] = df['date'].dt.day

# lag_k holds the sales value observed k rows (periods) earlier.
n_lags = 7
feature_cols = [f'lag_{lag}' for lag in range(1, n_lags + 1)]
for lag, col in enumerate(feature_cols, start=1):
    df[col] = df['sales'].shift(lag)

# The first n_lags rows have incomplete lag history (NaN) and are dropped here,
# along with any other row containing a missing value.
df = df.dropna().reset_index(drop=True)
X = df[feature_cols]
y = df['sales']

# shuffle=False keeps the split chronological: train on the first 80% of rows,
# evaluate on the last 20%, so no future observations leak into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)

# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
# and removed in 1.6; taking the square root of the MSE works on every version.
rmse = np.sqrt(mean_squared_error(y_test, pred))
print('RMSE:', rmse)
print('R2:', r2_score(y_test, pred))


## Notes
- To replace the synthetic data, place your Kaggle CSV at `data/sales.csv`; it must contain `date` and `sales` columns.
- For better forecasting, explore ARIMA, Prophet, or LSTM-based models.
