<a href="https://www.kaggle.com/code/swapanroy/predicting-accidents-using-randomforest-regressor?scriptVersionId=267329528" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

## 1. Load Data

In [2]:
# Kaggle input directory that holds train.csv and test.csv. Defined once so
# both reads share it and the notebook can be repointed by editing one line.
DATA_DIR = "/kaggle/input/playground-series-s5e10"

# train.csv carries the `accident_risk` label; test.csv holds the rows to score.
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

In [3]:
# Summarise the training frame: column names, non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517754 entries, 0 to 517753
Data columns (total 14 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   id                      517754 non-null  int64  
 1   road_type               517754 non-null  object 
 2   num_lanes               517754 non-null  int64  
 3   curvature               517754 non-null  float64
 4   speed_limit             517754 non-null  int64  
 5   lighting                517754 non-null  object 
 6   weather                 517754 non-null  object 
 7   road_signs_present      517754 non-null  bool   
 8   public_road             517754 non-null  bool   
 9   time_of_day             517754 non-null  object 
 10  holiday                 517754 non-null  bool   
 11  school_season           517754 non-null  bool   
 12  num_reported_accidents  517754 non-null  int64  
 13  accident_risk           517754 non-null  float64
dtypes: bool(4), float64(

In [4]:
# Preview the training frame (the notebook shows a truncated view with the
# first and last rows rather than all records).
train_df

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.30
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517749,517749,highway,4,0.10,70,daylight,foggy,True,True,afternoon,False,False,2,0.32
517750,517750,rural,4,0.47,35,daylight,rainy,True,True,morning,False,False,1,0.26
517751,517751,urban,4,0.62,25,daylight,foggy,False,False,afternoon,False,True,0,0.19
517752,517752,highway,3,0.63,25,night,clear,True,False,afternoon,True,True,3,0.51


In [5]:
# Summarise the test frame: same columns as training minus `accident_risk`.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172585 entries, 0 to 172584
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   id                      172585 non-null  int64  
 1   road_type               172585 non-null  object 
 2   num_lanes               172585 non-null  int64  
 3   curvature               172585 non-null  float64
 4   speed_limit             172585 non-null  int64  
 5   lighting                172585 non-null  object 
 6   weather                 172585 non-null  object 
 7   road_signs_present      172585 non-null  bool   
 8   public_road             172585 non-null  bool   
 9   time_of_day             172585 non-null  object 
 10  holiday                 172585 non-null  bool   
 11  school_season           172585 non-null  bool   
 12  num_reported_accidents  172585 non-null  int64  
dtypes: bool(4), float64(1), int64(4), object(4)
memory usage: 12.5+ MB



## 2. Separate Target and IDs

In [6]:
# Keep the label column and the test identifiers before they are removed from
# the frames below; `test_ids` is needed again when writing the submission.
target = train_df['accident_risk']
test_ids = test_df['id']

# Strip the identifier (and, for training, the label) so only feature columns
# remain.
# NOTE: this rebinds `train_df` / `test_df`, so the cell is not re-runnable on
# its own -- a second run raises KeyError until the data is loaded again.
train_df = train_df.drop(['id', 'accident_risk'], axis=1)
test_df = test_df.drop(['id'], axis=1)


## 3. Feature Engineering 

In [7]:
# Stack training and test rows into one frame so that both halves receive
# exactly the same encoded columns; remember where the training rows end.
n_train_rows = len(train_df)
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# Columns that need conversion before modelling, grouped by dtype.
bool_cols = combined_df.select_dtypes(include=['bool']).columns
object_cols = combined_df.select_dtypes(include=['object']).columns

# True/False flags become 1/0 in a single cast.
combined_df = combined_df.astype({flag_col: int for flag_col in bool_cols})

# One-hot encode the string-valued columns ('road_type', 'lighting',
# 'weather', 'time_of_day'); the first level of each is dropped.
combined_df = pd.get_dummies(combined_df, columns=object_cols, drop_first=True)

# Undo the stacking: leading rows are the training features, the remainder
# are the test features. The label saved earlier becomes the training target.
X_train = combined_df.iloc[:n_train_rows]
X_test = combined_df.iloc[n_train_rows:]
y_train = target

## 4. Model Training (RandomForestRegressor)

In [8]:
# Hyper-parameters are deliberately modest so that fitting stays efficient on
# a large training set.
rf_params = {
    "n_estimators": 100,      # number of trees in the forest
    "max_depth": 10,          # cap tree depth: faster training, less overfitting
    "min_samples_split": 20,  # samples an internal node needs before it may split
    "random_state": 42,       # fixed seed for reproducibility
    "n_jobs": -1,             # use all available CPU cores
}
# Further regularisation options that are currently not enabled:
# min_samples_leaf=8, max_features=0.5
rfr = RandomForestRegressor(**rf_params)

print("Starting model training (RandomForestRegressor)...")
rfr.fit(X_train, y_train)
print("Model training complete.")

Starting model training (RandomForestRegressor)...
Model training complete.


## 5. Prediction and Submission File Generation

In [9]:
# Score the test features and bound the result to the [0, 1] interval.
predictions = np.clip(rfr.predict(X_test), 0, 1)

# Pair every test id with its predicted risk, in that column order.
submission_df = pd.DataFrame(dict(id=test_ids, accident_risk=predictions))

# Write the frame to disk without the index column.
submission_df.to_csv('submission.csv', index=False)

# Confirm the write and show the first few rows as a quick sanity check.
print("\nSubmission file 'submission.csv' created.")
print(submission_df.head())


Submission file 'submission.csv' created.
       id  accident_risk
0  517754       0.296026
1  517755       0.119702
2  517756       0.179841
3  517757       0.322069
4  517758       0.409924
