In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np

# --- 1. Synthetic Data Generation ---
# In a real-world scenario, you would load your data here:
# df = pd.read_csv('your_accident_data.csv')

print("--- Generating Synthetic Accident Data ---")

# Draw every column from one explicitly seeded generator so the synthetic
# frame -- and therefore the split, the fitted model and every reported
# metric -- is identical on each Restart & Run All.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

data_size = 1000
data = {
    'Time_of_Day': rng.choice(['Morning', 'Afternoon', 'Evening', 'Night'], size=data_size),
    'Weather_Condition': rng.choice(['Clear', 'Rainy', 'Foggy', 'Snowy'], size=data_size),
    'Road_Surface': rng.choice(['Dry', 'Wet', 'Icy'], size=data_size),
    # Upper bound is exclusive: speed limits fall in 20..69.
    'Speed_Limit': rng.integers(20, 70, size=data_size),
    # TARGET variable -- 0: No Injury, 1: Minor Injury, 2: Serious Injury.
    # NOTE: labels are sampled independently of the feature columns above, so
    # this synthetic set carries no real feature/label relationship.
    'Accident_Severity': rng.choice([0, 1, 2], size=data_size, p=[0.6, 0.3, 0.1]),
}
df = pd.DataFrame(data)
print(df.head())
print("-" * 50)

# --- 2. Data Preprocessing and Feature Engineering ---

# The severity label is the target (y); every remaining column is a feature.
y = df['Accident_Severity']

# 2a. One-hot encode the three categorical features so the model receives
# purely numeric input. drop_first=True omits one indicator per feature; the
# omitted category is represented by all of that feature's indicators being 0.
X = pd.get_dummies(
    df.drop(columns='Accident_Severity'),
    columns=['Time_of_Day', 'Weather_Condition', 'Road_Surface'],
    drop_first=True,
)

# 2b. Hold out 30% of the rows for testing; stratify on y so both partitions
# keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

print("Shape of Training Data (Features):", X_train.shape)
print("Shape of Testing Data (Features):", X_test.shape)
print("-" * 50)

# --- 3. Model Training (Random Forest Classifier) ---
# Random Forest is a robust choice for classification tasks

print("--- Training the Random Forest Model ---")

# Hyper-parameters gathered in one place so they are easy to tune.
forest_params = {
    # number of decision trees in the forest
    'n_estimators': 100,
    # fixed seed for the forest's own randomness
    'random_state': 42,
    # helps the model handle class imbalance (common in accident data)
    'class_weight': 'balanced',
}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

print("Model Training Complete!")
print("-" * 50)

# --- 4. Model Prediction and Evaluation ---

# Make predictions on the held-out test set
y_pred = model.predict(X_test)

# Evaluate the model: overall accuracy plus a per-class breakdown.
accuracy = accuracy_score(y_test, y_pred)
# zero_division=0 scores an undefined metric (e.g. precision for a class that
# was never predicted) as 0 instead of emitting a warning.
report = classification_report(y_test, y_pred, zero_division=0)

# The header previously contained mojibake (the UTF-8 bytes of the emoji
# decoded as cp1252). "\U0001F4C8" is the "chart increasing" emoji, written as
# an escape so it survives any re-encoding of this file.
print("## Model Evaluation Results \U0001F4C8")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report (Precision, Recall, F1-Score per Severity Class):\n")
print(report)
print("-" * 50)

# --- 5. Example Prediction with New Data ---

print("--- Example Prediction on a New Scenario ---")

# Create a single new data point for prediction
new_data = pd.DataFrame({
    'Time_of_Day': ['Night'],
    'Weather_Condition': ['Snowy'],
    'Road_Surface': ['Icy'],
    'Speed_Limit': [45]
})

# One-hot encode WITHOUT drop_first. With a single row every categorical
# column holds exactly one category, so drop_first=True would discard the
# only indicator of each feature and the scenario would silently be encoded
# as all zeros -- i.e. as the training baseline categories rather than
# Night / Snowy / Icy.
new_data_encoded = pd.get_dummies(
    new_data,
    columns=['Time_of_Day', 'Weather_Condition', 'Road_Surface']
)

# Align to the exact training schema in one step:
#   * indicators the model knows but this row lacks are added and set to 0;
#   * indicators the model never saw are discarded -- this is what makes a
#     baseline category (dropped at training time) come out as all zeros;
#   * column order is forced to match the training feature order.
# NOTE: a category that never occurred in training is discarded the same way
# and is therefore treated as the baseline.
new_data_encoded = new_data_encoded.reindex(columns=X_train.columns, fill_value=0)

# Make the final prediction
prediction = model.predict(new_data_encoded)
severity_map = {0: 'No Injury', 1: 'Minor Injury', 2: 'Serious Injury'}

print("Scenario: Night, Snowy, Icy Road, Speed 45")
print(f"Predicted Accident Severity (Raw): {prediction[0]}")
print(f"Predicted Accident Severity: **{severity_map[prediction[0]]}**")

--- Generating Synthetic Accident Data ---
  Time_of_Day Weather_Condition Road_Surface  Speed_Limit  Accident_Severity
0     Morning             Clear          Icy           24                  1
1     Evening             Foggy          Wet           68                  2
2     Morning             Clear          Dry           44                  0
3     Morning             Foggy          Icy           38                  1
4       Night             Snowy          Icy           65                  1
--------------------------------------------------
Shape of Training Data (Features): (700, 9)
Shape of Testing Data (Features): (300, 9)
--------------------------------------------------
--- Training the Random Forest Model ---
Model Training Complete!
--------------------------------------------------
## Model Evaluation Results 📈
Accuracy: 0.4967

Classification Report (Precision, Recall, F1-Score per Severity Class):

              precision    recall  f1-score   support

          