In [None]:
#temporary traffic dataset for Bengaluru
#Here’s the plan:

#Basic EDA
#Check for missing values and data types
#Summary statistics
#Distribution of traffic volume

#Traffic trends by:
#Hour of day
#Day of week
#Junction
#Weather

# Task 1: Data Cleaning & Pre-processing:

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Read the raw traffic records; the timestamp column is parsed while loading
df = pd.read_csv("bengaluru_temp_traffic_dataset.csv", parse_dates=['datetime'])

# Report the number of nulls in every column (informational only)
print("Missing values:\n", df.isnull().sum())

# Discard exact duplicate rows, then coerce the holiday flag and the
# vehicle count to integers in a single cast
df = df.drop_duplicates()
df = df.astype({'is_holiday': int, 'traffic_volume': int})

# Collapse to one row per (datetime, junction): volumes are summed,
# temperature is averaged, and the remaining descriptors take the first
# value seen in each group
df_hourly = (
    df.groupby(['datetime', 'junction'], as_index=False)
      .agg(
          traffic_volume=('traffic_volume', 'sum'),
          weather=('weather', 'first'),
          is_holiday=('is_holiday', 'first'),
          temperature_C=('temperature_C', 'mean'),
          day_of_week=('day_of_week', 'first'),
      )
)

# Min-max scale the two numeric columns into new *_norm columns;
# the fitted scaler is kept so the transform can be reused or inverted
numeric_cols = ['traffic_volume', 'temperature_C']
scaled_cols = ['traffic_volume_norm', 'temperature_C_norm']
scaler = MinMaxScaler()
scaler.fit(df_hourly[numeric_cols])
df_hourly[scaled_cols] = scaler.transform(df_hourly[numeric_cols])

# Show the first rows of the cleaned, aggregated frame
print(df_hourly.head())


Missing values:
 datetime          0
junction          0
traffic_volume    0
weather           0
is_holiday        0
temperature_C     0
day_of_week       0
dtype: int64
             datetime      junction  traffic_volume weather  is_holiday  \
0 2025-06-01 00:00:00  Marathahalli             196   Foggy           0   
1 2025-06-01 01:00:00      KR Puram             222   Foggy           0   
2 2025-06-01 02:00:00      Madiwala             200   Foggy           0   
3 2025-06-01 03:00:00      KR Puram             196   Rainy           0   
4 2025-06-01 04:00:00      KR Puram             200   Clear           0   

   temperature_C day_of_week  traffic_volume_norm  temperature_C_norm  
0           30.0      Sunday             0.467391            0.528846  
1           24.6      Sunday             0.750000            0.269231  
2           27.4      Sunday             0.510870            0.403846  
3           28.0      Sunday             0.467391            0.432692  
4           29.8   

In [None]:
#Task 2: Feature Engineering and Selection

In [None]:
# Time-Based Features
#From datetime:
#hour — Hour of the day (0–23)
#day — Day of the month (1–31)
#month — Month number (1–12)
#is_weekend — 1 if Saturday/Sunday, else 0

#Lag Features
#These capture previous traffic volume trends (temporal memory):
#lag_1h — Traffic volume 1 hour before
#lag_2h — Traffic volume 2 hours before
#lag_24h — Traffic volume at the same hour on the previous day (24 hours before)

#Special Events
#We simulate special events using is_holiday from the dataset. From it we can create:
#is_weekend = 1 if Saturday or Sunday
#event_day = is_holiday or any known public event (leave as is for now)

In [None]:
import pandas as pd
import numpy as np

# Feature engineering, correlation screening and Random Forest feature ranking.
# Starts from df_hourly (the cleaned frame with one row per datetime/junction)
# and extends it in place with calendar and lag columns.

# Calendar features derived from the timestamp
df_hourly['hour'] = df_hourly['datetime'].dt.hour
df_hourly['day'] = df_hourly['datetime'].dt.day
df_hourly['month'] = df_hourly['datetime'].dt.month
df_hourly['is_weekend'] = df_hourly['day_of_week'].isin(['Saturday', 'Sunday']).astype(int)

# Lags are positional shifts within each junction, so rows must be in
# chronological order per junction before shifting
df_hourly.sort_values(by=['junction', 'datetime'], inplace=True)

# Create lag features
# NOTE(review): shift(k) means "k rows earlier", which equals "k hours earlier"
# only if every junction has one row per hour with no gaps — confirm.
df_hourly['lag_1h'] = df_hourly.groupby('junction')['traffic_volume'].shift(1)
df_hourly['lag_2h'] = df_hourly.groupby('junction')['traffic_volume'].shift(2)
df_hourly['lag_24h'] = df_hourly.groupby('junction')['traffic_volume'].shift(24)

# Drop NA values created by lag features (the first rows of each junction)
df_hourly.dropna(inplace=True)

# traffic_volume_norm is the min-max scaled copy of the target itself, so it
# correlates perfectly with traffic_volume and would dominate any importance
# ranking. It is target leakage, not a predictor, and is excluded below.
target_col = 'traffic_volume'
target_derived_cols = ['traffic_volume_norm']

# Correlation matrix of the target against the candidate predictors.
# A column with zero variance (e.g. month, if the data covers a single month)
# yields NaN here.
correlation_matrix = df_hourly[[target_col, 'hour', 'day', 'month', 'is_weekend',
                                'temperature_C_norm',
                                'lag_1h', 'lag_2h', 'lag_24h']].corr()

print("Correlation Matrix:\n", correlation_matrix[target_col].sort_values(ascending=False))

# Feature importance using Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# One-hot encode the categorical variables 'junction' and 'weather'
df_model = pd.get_dummies(df_hourly, columns=['junction', 'weather'], drop_first=True)

# Define features and target: leave out the timestamp, the raw day name,
# the target and every column derived from the target
non_feature_cols = {'datetime', target_col, 'day_of_week', *target_derived_cols}
features = [col for col in df_model.columns if col not in non_feature_cols]
X = df_model[features]
y = df_model[target_col]

# Train-test split
# NOTE(review): this is a random split of time-ordered data with lag features;
# if X_test/y_test are later used to score forecasts, consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Feature importances, largest first
importances = pd.Series(rf.feature_importances_, index=X.columns)
importances = importances.sort_values(ascending=False)

print("\nTop 10 Features:\n", importances.head(10))


Correlation Matrix:
 traffic_volume         1.000000
traffic_volume_norm    1.000000
lag_1h                 0.054892
lag_24h                0.029625
hour                   0.018209
temperature_C_norm     0.003324
day                   -0.051894
is_weekend            -0.056373
lag_2h                -0.075910
month                       NaN
Name: traffic_volume, dtype: float64

Top 10 Features:
 traffic_volume_norm    0.998427
day                    0.000280
temperature_C_norm     0.000252
lag_24h                0.000202
hour                   0.000200
temperature_C          0.000185
lag_2h                 0.000105
lag_1h                 0.000066
weather_Cloudy         0.000063
is_weekend             0.000060
dtype: float64
