In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

In [11]:
df = pd.read_csv('road_safety_data.csv')

In [12]:
df['Crime Types'] = df['Crime Types'].apply(lambda x: ','.join(eval(x)) if isinstance(x, str) else x)

In [13]:
target = 'SafeRoad'
features = [
    'Magnitude', 'Crime Types', 'time_of_day', 'shops_nearby', 'area_type', 'has_Vehicle',
    'crime_rate', 'number_crime_last_Three_months', 'number_people_accompanying', 'weather_condition',
    'proximity_police_station', 'proximity_hospital', 'lightingOnRoad', 'traffic_density',
    'reported_crimes', 'proximity_public_transport'
]

label_encoder = LabelEncoder()
df[target] = label_encoder.fit_transform(df[target])  # Converts 'yes', 'no', 'likely' to integers (e.g. 0, 1, 2)

X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
categorical_features = ['Crime Types', 'time_of_day', 'area_type', 'crime_rate', 'weather_condition', 
                        'lightingOnRoad', 'traffic_density']
numerical_features = ['Magnitude', 'shops_nearby', 'number_crime_last_Three_months', 
                      'number_people_accompanying', 'proximity_police_station', 'proximity_hospital', 
                      'reported_crimes', 'proximity_public_transport']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),  # Scaling numerical features
        ('cat', OneHotEncoder(), categorical_features)  # One-hot encoding categorical features
    ])


In [15]:
xgb_clf = XGBClassifier(objective='multi:softmax', num_class=3, eval_metric='mlogloss')
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', xgb_clf)])

In [29]:
param_grid = {
    'classifier__n_estimators': [100, 200, 300],  # Number of boosting rounds
    'classifier__max_depth': [0.5,1,2,3, 5],           # Maximum depth of trees
    'classifier__learning_rate': [0.01, 0.1, 0.2,0.3,0.5], # Step size shrinkage
    'classifier__subsample': [0.6, 0.8, 1.0],     # Subsample ratio of training instances
    'classifier__colsample_bytree': [0.6, 0.8, 1.0,1.2,1.4],
    'classifier__gamma': [0, 0.1, 0.5, 1], 
    'classifier__scale_pos_weight': [1, 3, 5],
    'classifier__min_child_weight': [1, 3, 5]  # Subsample ratio of columns
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 40500 candidates, totalling 202500 fits


105300 fits failed out of a total of 202500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24300 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Adarsh Vishwakarma\.conda\envs\TextS\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Adarsh Vishwakarma\.conda\envs\TextS\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Adarsh Vishwakarma\.conda\envs\TextS\Lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"

In [30]:
print(f"Best - params: {grid_search.best_params_}")

Best - params: {'classifier__colsample_bytree': 0.6, 'classifier__gamma': 0.5, 'classifier__learning_rate': 0.2, 'classifier__max_depth': 3, 'classifier__min_child_weight': 5, 'classifier__n_estimators': 300, 'classifier__scale_pos_weight': 1, 'classifier__subsample': 0.8}


In [31]:
y_pred = grid_search.predict(X_test)

In [32]:
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Classification Report:
              precision    recall  f1-score   support

      likely       0.40      0.53      0.46        51
          no       0.37      0.25      0.29        57
         yes       0.40      0.42      0.41        52

    accuracy                           0.39       160
   macro avg       0.39      0.40      0.39       160
weighted avg       0.39      0.39      0.38       160



In [33]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.39


In [34]:
cv_scores = cross_val_score(grid_search.best_estimator_, X_train, y_train, cv=5)
print(f"Cross-validation accuracy: {cv_scores.mean():.2f} (+/- {cv_scores.std():.2f})")

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.

Parameters: { "scale_pos_weight" } are not used.



Cross-validation accuracy: 0.38 (+/- 0.02)


Parameters: { "scale_pos_weight" } are not used.

