In [16]:
import pandas as pd
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017")
db = client["traffic_Data"]

# Fetch traffic and accident data
traffic_data = pd.DataFrame(list(db.traffic.find({}, {"_id": 0})))
print(traffic_data.head())  # Check the first few rows


  Million Plus Cities   Cause category        Cause Subcategory  \
0                Agra  Traffic Control  Flashing Signal/Blinker   
1                Agra  Traffic Control  Flashing Signal/Blinker   
2                Agra  Traffic Control  Flashing Signal/Blinker   
3                Agra  Traffic Control  Flashing Signal/Blinker   
4                Agra  Traffic Control  Flashing Signal/Blinker   

         Outcome of Incident  Count  
0         Greviously Injured    0.0  
1               Minor Injury    0.0  
2             Persons Killed    0.0  
3              Total Injured    0.0  
4  Total number of Accidents    0.0  


In [21]:
import pandas as pd

# Load the dataset
file_path = "traffic.csv"
df = pd.read_csv(file_path)

# Show basic info
print(df.head())  # Show first few rows


  Million Plus Cities   Cause category        Cause Subcategory  \
0                Agra  Traffic Control  Flashing Signal/Blinker   
1                Agra  Traffic Control  Flashing Signal/Blinker   
2                Agra  Traffic Control  Flashing Signal/Blinker   
3                Agra  Traffic Control  Flashing Signal/Blinker   
4                Agra  Traffic Control  Flashing Signal/Blinker   

         Outcome of Incident  Count  
0         Greviously Injured    0.0  
1               Minor Injury    0.0  
2             Persons Killed    0.0  
3              Total Injured    0.0  
4  Total number of Accidents    0.0  


In [22]:
print(df.info())  # Show column data types


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9550 entries, 0 to 9549
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Million Plus Cities  9550 non-null   object 
 1   Cause category       9550 non-null   object 
 2   Cause Subcategory    9550 non-null   object 
 3   Outcome of Incident  9550 non-null   object 
 4   Count                9547 non-null   float64
dtypes: float64(1), object(4)
memory usage: 373.2+ KB
None


In [24]:
print(df.describe())

             Count
count  9547.000000
mean    103.627632
std     275.189303
min       0.000000
25%       0.000000
50%      12.000000
75%      70.500000
max    3148.000000


In [25]:
print(df.isnull().sum())

Million Plus Cities    0
Cause category         0
Cause Subcategory      0
Outcome of Incident    0
Count                  3
dtype: int64


In [31]:
# Fill missing values with mean for numerical data
# Fill missing values only for numeric columns
df.fillna(df.select_dtypes(include=['number']).mean(), inplace=True)


In [32]:
# Convert categorical variables to numerical using one-hot encoding
df = pd.get_dummies(df, drop_first=True)


In [34]:
# Define target variable (y) and features (X)
X = df.drop(columns=["Count"])  # Drop target column
y = df["Count"]  # Target variable (number of accidents)

print(X.shape, y.shape)  # Check dimensions


(9550, 92) (9550,)


In [35]:
from sklearn.model_selection import train_test_split

# Split dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training Set:", X_train.shape)
print("Testing Set:", X_test.shape)


Training Set: (7640, 92)
Testing Set: (1910, 92)


In [36]:
from sklearn.ensemble import RandomForestRegressor

# Initialize Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on test set
y_pred_rf = rf_model.predict(X_test)


In [37]:
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Compute metrics
r2_rf = r2_score(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

print(f"Random Forest R² Score: {r2_rf}")
print(f"Random Forest RMSE: {rmse_rf}")


Random Forest R² Score: 0.38970969599650995
Random Forest RMSE: 205.62333771394808


In [38]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Reshape for LSTM (samples, time steps, features)
X_train_lstm = np.reshape(X_train.values, (X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))

# Build LSTM Model
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(1, X_train.shape[1])),
    LSTM(50),
    Dense(1)  # Output layer
])

model.compile(optimizer="adam", loss="mse")
model.fit(X_train_lstm, y_train, epochs=50, batch_size=16, validation_data=(X_test_lstm, y_test))


Epoch 1/50


  super().__init__(**kwargs)


[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 91399.5000 - val_loss: 73861.2812
Epoch 2/50
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 87595.1641 - val_loss: 71693.0234
Epoch 3/50
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 85582.0781 - val_loss: 69629.1719
Epoch 4/50
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 75903.3047 - val_loss: 67891.6953
Epoch 5/50
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 77285.1953 - val_loss: 66300.6328
Epoch 6/50
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 70071.4688 - val_loss: 64734.3906
Epoch 7/50
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 64811.6719 - val_loss: 63279.1055
Epoch 8/50
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 71905.6172 - val_loss

<keras.src.callbacks.history.History at 0x20407585670>

In [40]:
import joblib

# Save the model in the current directory
joblib.dump(rf_model, "traffic_model.pkl")  


['traffic_model.pkl']

In [44]:
joblib.dump(rf_model, "C:/Users/mruna/Documents/traffic_model.pkl") 

['C:/Users/mruna/Documents/traffic_model.pkl']

In [45]:
rf_model = joblib.load("traffic_model.pkl")
print("✅ Model loaded successfully!")


✅ Model loaded successfully!


In [46]:
model.save("/mnt/data/lstm_model.h5")

print("✅ LSTM model saved successfully!")




✅ LSTM model saved successfully!


In [47]:
#Convert Categorical Columns Using One-Hot Encoding

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Apply One-Hot Encoding to categorical variables
X_encoded = pd.get_dummies(X)  # Convert categorical features to numerical

# Split into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

print(f"✅ Data Preprocessing Complete! Train Shape: {X_train.shape}, Test Shape: {X_test.shape}")


✅ Data Preprocessing Complete! Train Shape: (7640, 92), Test Shape: (1910, 92)


In [48]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Initialize the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred = rf_model.predict(X_test)

# Evaluate performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"✅ Model Training Complete! MAE: {mae:.2f}, MSE: {mse:.2f}, R²: {r2:.2f}")


✅ Model Training Complete! MAE: 78.19, MSE: 42280.96, R²: 0.39


In [49]:
import joblib

# Save the trained model
joblib.dump(rf_model, "traffic_model.pkl")

print("✅ Random Forest model saved successfully!")


✅ Random Forest model saved successfully!


In [50]:
#Check for Time-Series Data
print(df.head())  # Check first few rows
print(df.dtypes)  # Check data types


   Count  Million Plus Cities_Ahmedabad  \
0    0.0                          False   
1    0.0                          False   
2    0.0                          False   
3    0.0                          False   
4    0.0                          False   

   Million Plus Cities_Allahabad(Prayagraj)  Million Plus Cities_Amritsar  \
0                                     False                         False   
1                                     False                         False   
2                                     False                         False   
3                                     False                         False   
4                                     False                         False   

   Million Plus Cities_Asansol Durgapur  Million Plus Cities_Aurangabad  \
0                                 False                           False   
1                                 False                           False   
2                                 False              