📦 **Imports and Data Load**


In [None]:
import pandas as pd
from itertools import product
from sklearn.preprocessing import LabelEncoder

# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

# Read DTM/DTM.csv into the working frame `df`
df = pd.read_csv(filepath_or_buffer="DTM/DTM.csv")

🧹 **Initial Cleaning**


In [None]:
# Prune the frame in one chain:
#   1. keep only columns that hold at least 800 non-null values,
#   2. remove the 'Created' and 'Year' columns,
#   3. discard rows that have no 'Vendor'.
df = (
    df.dropna(axis="columns", thresh=800)
    .drop(columns=['Created', 'Year'])
    .dropna(subset=['Vendor'])
)

# Parse 'Shortage Date' into datetime values
df['Shortage Date'] = pd.to_datetime(df['Shortage Date'])

# Gaps in 'Downtime' / 'Missed Vehicle' become 0;
# gaps in 'LiMa Comment' / 'Code 1' become the placeholder 'No Data'.
for _cols, _filler in (
    (['Downtime', 'Missed Vehicle'], 0),
    (['LiMa Comment', 'Code 1'], 'No Data'),
):
    df[_cols] = df[_cols].fillna(_filler)

📆 **Add Calendar Features**

In [None]:
# Derive calendar attributes from the shortage date.
# NOTE: `df` is rebuilt from the aggregated vendor/date grid in the next cell,
# so these four columns do not survive; they are derived again later on.
_shortage_dt = df['Shortage Date'].dt
df['DayOfWeek'] = _shortage_dt.dayofweek              # Monday = 0 ... Sunday = 6
df['IsWeekend'] = df['DayOfWeek'].ge(5).astype(int)   # 1 for Saturday/Sunday
df['Month'] = _shortage_dt.month
df['Quarter'] = _shortage_dt.quarter

📊 **Aggregate Daily Data per Vendor**

In [None]:
# Sum downtime and missed vehicles for every (vendor, date) pair
daily_agg = df.groupby(['Vendor', 'Shortage Date'], as_index=False)[
    ['Downtime', 'Missed Vehicle']
].sum()

# Build the complete vendor x calendar-day grid between the first and last
# observed date, so that days without any record show up as explicit rows.
all_vendors = daily_agg['Vendor'].unique()
full_dates = pd.date_range(
    start=daily_agg['Shortage Date'].min(),
    end=daily_agg['Shortage Date'].max(),
    freq='D',
)
full_index = pd.DataFrame(
    product(all_vendors, full_dates),
    columns=['Vendor', 'Shortage Date'],
)

# Left-join the observed totals onto the grid; days with no record get 0
df = pd.merge(full_index, daily_agg, how='left', on=['Vendor', 'Shortage Date'])
df[['Downtime', 'Missed Vehicle']] = df[['Downtime', 'Missed Vehicle']].fillna(0)

🔁 **Add Lag Features** (rolling averages are added further below)

In [None]:
# Order rows chronologically within each vendor so that shifting by k rows
# means "k days earlier" on the gap-free daily grid.
df = df.sort_values(by=['Vendor', 'Shortage Date'], ignore_index=True)

# For each lag k, copy the vendor's own value from k rows earlier.
lags = [1, 2, 3, 7]
for lag in lags:
    for _stem, _source in (('Lag_Downtime', 'Downtime'),
                           ('Lag_Misses', 'Missed Vehicle')):
        df[f'{_stem}_{lag}'] = df.groupby('Vendor')[_source].shift(periods=lag)

# The first k rows of every vendor have no history; treat that as 0
lag_cols = [f'{_stem}_{l}' for _stem in ('Lag_Downtime', 'Lag_Misses') for l in lags]
df[lag_cols] = df[lag_cols].fillna(0)

📆 **Re-Add Calendar Features to Daily Data**

In [None]:
# Calendar attributes for the daily vendor/date grid
_grid_dt = df['Shortage Date'].dt
df['DayOfWeek'] = _grid_dt.dayofweek                        # Monday = 0 ... Sunday = 6
df['IsWeekend'] = df['DayOfWeek'].isin((5, 6)).astype(int)  # 1 for Saturday/Sunday
df['Month'] = _grid_dt.month
df['Quarter'] = _grid_dt.quarter

⚠️ **Frequency + Severity Ratings**

In [None]:
# Per-vendor summary over the whole daily panel:
#   Total_Incidents - number of days with Downtime > 0
#   Total_Downtime  - sum of Downtime
# NOTE(review): both totals are computed from the full history of 'Downtime',
# which is also a prediction target later on, so Risk_Score carries
# information from the evaluation period into the features. Consider
# computing it from training data only.
risk_df = df.groupby('Vendor').agg(
    Total_Incidents=('Downtime', lambda x: (x > 0).sum()),
    Total_Downtime=('Downtime', 'sum')
).reset_index()

# Normalize to a 0-1 scale by dividing by the largest vendor value.
# When no vendor has any downtime the maximum is 0; score everything 0
# instead of producing 0/0 = NaN, which would propagate into Risk_Score.
_max_incidents = risk_df['Total_Incidents'].max()
_max_downtime = risk_df['Total_Downtime'].max()
risk_df['Freq_Score'] = (
    (risk_df['Total_Incidents'] / _max_incidents) if _max_incidents > 0 else 0.0
)
risk_df['Severity_Score'] = (
    (risk_df['Total_Downtime'] / _max_downtime) if _max_downtime > 0 else 0.0
)
# Equal-weight blend of frequency and severity
risk_df['Risk_Score'] = 0.5 * risk_df['Freq_Score'] + 0.5 * risk_df['Severity_Score']

# Merge risk score to main df (one value per vendor, repeated on every row)
df = df.merge(risk_df[['Vendor', 'Risk_Score']], on='Vendor', how='left')

# Numeric year-month key, e.g. 202403
df['YearMonth'] = df['Shortage Date'].dt.year * 100 + df['Shortage Date'].dt.month

# ISO year-week label, e.g. '2024-W05'. ISO weeks always cover seven days,
# whereas strftime('%U') yields a "week 00" and splits the week that spans
# New Year into two partial buckets, which distorts weekly sums. The label is
# zero-padded so that string sorting is chronological.
_iso_cal = df['Shortage Date'].dt.isocalendar()
df['YearWeek'] = (
    _iso_cal['year'].astype(str) + '-W' + _iso_cal['week'].astype(str).str.zfill(2)
)

📈 **Rolling Averages**

In [None]:
# Total downtime per vendor and week
df_weekly = (
    df.groupby(['Vendor', 'YearWeek'], as_index=False)['Downtime']
    .sum()
    .rename(columns={'Downtime': 'Weekly_Downtime'})
)

# Weeks must be in order within each vendor before the rolling window runs
df_weekly = df_weekly.sort_values(by=['Vendor', 'YearWeek'])


def _prior_12w_mean(weekly_totals):
    """Mean of up to 12 preceding weeks; shift(1) excludes the current week."""
    return weekly_totals.shift(1).rolling(window=12, min_periods=1).mean()


df_weekly['Avg_Downtime_12w'] = (
    df_weekly.groupby('Vendor')['Weekly_Downtime'].transform(_prior_12w_mean)
)

# Attach the weekly average to every daily row of the same vendor/week;
# a vendor's first week has no preceding data and is filled with 0.
df = pd.merge(
    df,
    df_weekly[['Vendor', 'YearWeek', 'Avg_Downtime_12w']],
    how='left',
    on=['Vendor', 'YearWeek'],
)
df['Avg_Downtime_12w'] = df['Avg_Downtime_12w'].fillna(0)

# Integer code for each vendor name
label_encoder = LabelEncoder()
df['Vendor_Encoded'] = label_encoder.fit(df['Vendor']).transform(df['Vendor'])

# Numeric ISO year-week key (year * 100 + week number)
iso = df['Shortage Date'].dt.isocalendar()
df['YearWeek_Num'] = iso['year'].mul(100).add(iso['week'])

df = df.sort_values(by=['Vendor', 'Shortage Date'], ignore_index=True)
df.info()

In [None]:
# Print the frame summary (column names, non-null counts, dtypes) as a sanity
# check of the engineered features before modelling.
df.info()

**Model Time Baby...**

In [None]:
# Data handling
import numpy as np
import pandas as pd

# Data splitting and preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

# Metrics
from sklearn.metrics import mean_squared_error, r2_score

# Plotting (optional for EDA or results viz)
import matplotlib.pyplot as plt
import seaborn as sns

# For encoding categorical variables (optional if needed)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Parallel backend for faster GridSearchCV with feedback (optional but recommended)
from sklearn.utils import parallel_backend

# Suppress warnings
# NOTE: the 'ignore' action is installed without a category or module filter,
# so every warning raised after this point is hidden, not only noisy ones.
import warnings
warnings.filterwarnings('ignore')


**Initial Data Split**

In [None]:
# The raw string columns are no longer needed: the vendor is carried by
# 'Vendor_Encoded' and the week by 'YearWeek_Num'.
df = df.drop(columns=['Vendor', 'YearWeek'])

# Both daily measures are predicted jointly (two-column target)
target_cols = ['Downtime', 'Missed Vehicle']

# Features: everything except the targets and the raw timestamp.
# 'Shortage Date' is a datetime64 column that the estimators cannot take as a
# numeric feature; its information is already encoded in DayOfWeek, Month,
# Quarter, YearMonth and YearWeek_Num.
# The float cast turns the nullable UInt32 'YearWeek_Num' (from isocalendar)
# into a plain NumPy dtype that every estimator accepts.
X = df.drop(columns=target_cols + ['Shortage Date']).astype('float64')
y = df[target_cols]

# Chronological train/test split: the earliest ~80% of days train the models,
# the most recent ~20% evaluate them. A random shuffle would mix past and
# future days even though the features are lags and rolling means, which makes
# the test score look better than real forecasting performance.
# NOTE(review): 'Risk_Score' is still computed over the full history upstream.
_cutoff = df['Shortage Date'].quantile(0.8)
_is_train = df['Shortage Date'] <= _cutoff
X_train, X_test = X[_is_train], X[~_is_train]
y_train, y_test = y[_is_train], y[~_is_train]

📊 **Model Training and GridSearchCV**

In [None]:
# Print the frame summary again; at this point 'Vendor' and 'YearWeek' have
# been dropped, so this shows the columns available to the models.
df.info()

In [None]:
# Needed to train the voting ensemble on a multi-column target (see below)
from sklearn.multioutput import MultiOutputRegressor

# Define models and parameter grids
models_params = {
    'RandomForest': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [5, 10]
        }
    },
    'XGBoost': {
        'model': XGBRegressor(objective='reg:squarederror', random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [3, 6]
        }
    },
    'LinearRegression': {
        'model': LinearRegression(),
        'params': {
            'fit_intercept': [True, False]
        }
    }
}

# Train models with GridSearchCV and live feedback.
# Each entry of `results` records the model name, the winning parameters,
# the R² on the held-out test set and the refitted best estimator.
results = []

for name, mp in models_params.items():
    print(f"🔹 Training {name}...")
    # 3-fold cross-validated search, scored by R², using all CPU cores
    grid = GridSearchCV(mp['model'], mp['params'], cv=3, scoring='r2', n_jobs=-1, verbose=1)
    with parallel_backend('loky'):
        grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    score = r2_score(y_test, y_pred)
    print(f"✅ {name} best params: {grid.best_params_}")
    print(f"✅ {name} R² score: {score:.4f}\n")

    results.append({
        'Model': name,
        'Best Params': grid.best_params_,
        'Test R2': score,
        'Trained Model': best_model
    })

# Fetch the tuned estimators by name instead of by position in `results`,
# so that reordering or extending `models_params` cannot silently pair a
# label with the wrong model.
best_by_name = {entry['Model']: entry['Trained Model'] for entry in results}

# Ensemble (VotingRegressor) using best RF + XGB + Linear
print("🔹 Training VotingRegressor (ensemble)...")
voting = VotingRegressor(estimators=[
    ('rf', best_by_name['RandomForest']),
    ('xgb', best_by_name['XGBoost']),
    ('lr', best_by_name['LinearRegression'])
])
# VotingRegressor.fit only accepts a 1-D target. With several target columns,
# wrap it so that one voting ensemble is fitted per column.
if getattr(y_train, 'ndim', 1) > 1 and y_train.shape[1] > 1:
    voting = MultiOutputRegressor(voting)
voting.fit(X_train, y_train)
y_pred_ens = voting.predict(X_test)
score_ens = r2_score(y_test, y_pred_ens)
print(f"✅ VotingRegressor R² score: {score_ens:.4f}\n")

results.append({
    'Model': 'VotingEnsemble',
    'Best Params': 'N/A',
    'Test R2': score_ens,
    'Trained Model': voting
})

# 📈 Summary table (estimator objects left out), best test R² first
summary_df = pd.DataFrame(results).drop(columns=['Trained Model'])
summary_df = summary_df.sort_values(by='Test R2', ascending=False).reset_index(drop=True)

print("📊 Model Comparison Summary:")
print(summary_df)


**Plots for Analysis**

In [None]:
# The training cell keeps a single `grid` variable that is overwritten on each
# loop iteration, so per-model grid objects do not exist. The same information
# (best parameters and test-set R²) is stored in `results`; index it by name.
results_by_model = {entry['Model']: entry for entry in results}
rf_result = results_by_model['RandomForest']
xgb_result = results_by_model['XGBoost']
lr_result = results_by_model['LinearRegression']

# Best hyperparameters for each model
print("Best Random Forest Parameters:", rf_result['Best Params'])
print("Best XGBoost Parameters:", xgb_result['Best Params'])
print("Best Linear Regression Parameters:", lr_result['Best Params'])

# Test-set R² of each tuned model (computed in the training cell with
# r2_score on the best estimator's predictions for X_test)
rf_score = rf_result['Test R2']
xgb_score = xgb_result['Test R2']
lr_score = lr_result['Test R2']

print(f"Random Forest R²: {rf_score:.4f}")
print(f"XGBoost R²: {xgb_score:.4f}")
print(f"Linear Regression R²: {lr_score:.4f}")

# Plot model comparison
model_names = ['Random Forest', 'XGBoost', 'Linear Regression']
model_scores = [rf_score, xgb_score, lr_score]

plt.bar(model_names, model_scores)
plt.ylabel('R² Score')
plt.title('Model Comparison')
plt.show()


Some next steps:
- Refine the model
- Apply a bag-of-words representation to the LiMa comments and codes
- Add the potential cause of downtime as a feature
- Send an email out to the LiMas
- Incorporate more data