In [1]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Number of samples
num_samples = 5000

# Generating synthetic data
data = {
    "mode": np.random.choice(["walk", "bus", "rikshaw", "cycle", "bike"], num_samples),
    "time": np.random.randint(1, 25, num_samples),
    "day": np.random.choice(["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"], num_samples),
    "human_density": np.random.uniform(0, 1, num_samples),  # 0 (low) to 1 (high)
    "light_condition": np.random.randint(10, 1000, num_samples),  # in lumens
    "loadshedding": np.random.choice([0, 1], num_samples),  # 0 = No, 1 = Yes
    "affected_people": np.random.choice(["alone", "1", "2", "3+"], num_samples),
    "nearest_thana_km": np.random.uniform(0.1, 10, num_samples),
    "nearest_market_km": np.random.uniform(0.1, 5, num_samples),
    "nearest_busy_road_km": np.random.uniform(0.1, 3, num_samples),
    "was_road_busy": np.random.choice([0, 1], num_samples),  # 0 = No, 1 = Yes

    # Personal Details (for Model 2)
    "dress": np.random.choice(["well_dressed", "normal", "attractive"], num_samples),
    "age": np.random.randint(15, 70, num_samples),
    "gender": np.random.choice(["male", "female"], num_samples),
    "financial_status": np.random.randint(1, 11, num_samples),  # 1 (poor) to 10 (rich)

    # Target Variable (Binary: Snatched or Not)
    "snatched": np.random.choice([0, 1], num_samples, p=[0.85, 0.15])  # Assuming 15% incidents occur
}

# Create DataFrame
df = pd.DataFrame(data)



In [2]:
# Display first few rows
df.head()


Unnamed: 0,mode,time,day,human_density,light_condition,loadshedding,affected_people,nearest_thana_km,nearest_market_km,nearest_busy_road_km,was_road_busy,dress,age,gender,financial_status,snatched
0,cycle,23,Sunday,0.743629,908,1,3+,3.315466,0.581408,2.777295,0,normal,37,female,6,0
1,bike,16,Thursday,0.985807,167,1,3+,8.478146,3.484154,2.021968,0,attractive,61,male,2,0
2,rikshaw,14,Saturday,0.66719,737,1,2,9.063393,4.13196,2.860186,0,well_dressed,16,male,4,0
3,bike,10,Friday,0.24484,905,0,alone,4.573637,4.267402,1.790158,1,attractive,56,male,7,1
4,bike,7,Saturday,0.716554,822,0,alone,0.189725,3.244058,1.439025,0,normal,18,male,1,0


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report

In [5]:

# Define features for Model 1 (location-based hazard assessment)
features_model1 = ["mode", "time", "day", "human_density", "light_condition", "loadshedding",
                   "affected_people", "nearest_thana_km", "nearest_market_km", "nearest_busy_road_km", "was_road_busy"]

# Define features for Model 2 (personalized risk assessment)
features_model2 = features_model1 + ["dress", "age", "gender", "financial_status"]

In [6]:

# Define target variable
target = "snatched"

# Preprocessing: Categorical encoding + Scaling numerical values
categorical_features = ["mode", "day", "affected_people"]  # Categorical for both models
categorical_features_model2 = categorical_features + ["dress", "gender"]  # Additional for Model 2

In [10]:
num_features = list(set(features_model1) - set(categorical_features))
num_features_model2 = list(set(features_model2) - set(categorical_features_model2))


In [8]:
def create_pipeline(categorical_features, num_features):
    preprocessor = ColumnTransformer([
        ("num", StandardScaler(), num_features),  # Scale numerical features
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features)  # Encode categorical features
    ])
    
    return Pipeline([
        ("preprocessor", preprocessor),  # Apply preprocessing
        ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))  # Train Random Forest model
    ])


In [9]:
def train_model(features, categorical_features, num_features):
    X = df[features]
    y = df[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = create_pipeline(categorical_features, num_features)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    return model

In [11]:
# Train Model 1 (without personal details)
print("Model 1 (Location Hazard Analysis):")
model1 = train_model(features_model1, categorical_features, num_features)

Model 1 (Location Hazard Analysis):
Accuracy: 0.84
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       840
           1       0.00      0.00      0.00       160

    accuracy                           0.84      1000
   macro avg       0.42      0.50      0.46      1000
weighted avg       0.71      0.84      0.77      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:

# Train Model 2 (with personal details)
print("\nModel 2 (Personalized Risk Assessment):")
model2 = train_model(features_model2, categorical_features_model2, num_features_model2)



Model 2 (Personalized Risk Assessment):
Accuracy: 0.84
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       840
           1       0.00      0.00      0.00       160

    accuracy                           0.84      1000
   macro avg       0.42      0.50      0.46      1000
weighted avg       0.71      0.84      0.77      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
sample_input1 = pd.DataFrame([{
    "mode": "walk",
    "time": 22,
    "day": "Friday",
    "human_density": 10,
    "light_condition": 5,
    "loadshedding": 1,
    "affected_people": "alone",
    "nearest_thana_km": 3.0,
    "nearest_market_km": 2.5,
    "nearest_busy_road_km": 1.0,
    "was_road_busy": 0
}])

# Make a prediction
pred1 = model1.predict(sample_input1)
prob1 = model1.predict_proba(sample_input1)

print("Model 1 Prediction:", "Snatching Likely" if pred1[0] == 1 else "Safe")
print("Probability (Safe, Snatching):", prob1)


Model 1 Prediction: Safe
Probability (Safe, Snatching): [[0.67 0.33]]


In [14]:
sample_input2 = pd.DataFrame([{
    "mode": "walk",
    "time": 22,
    "day": "Friday",
    "human_density": 10,
    "light_condition": 5,
    "loadshedding": 1,
    "affected_people": "alone",
    "nearest_thana_km": 3.0,
    "nearest_market_km": 2.5,
    "nearest_busy_road_km": 1.0,
    "was_road_busy": 0,
    "dress": "attractive",
    "age": 25,
    "gender": "Male",
    "financial_status": 8
}])

# Make a prediction
pred2 = model2.predict(sample_input2)
prob2 = model2.predict_proba(sample_input2)

print("Model 2 Prediction:", "Snatching Likely" if pred2[0] == 1 else "Safe")
print("Probability (Safe, Snatching):", prob2)


Model 2 Prediction: Safe
Probability (Safe, Snatching): [[0.74 0.26]]
