In [23]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/analysis-of-us-accidents/US_Accidents_May19_Migrated Data.csv


In [24]:
df = pd.read_csv("/kaggle/input/analysis-of-us-accidents/US_Accidents_May19_Migrated Data.csv")

In [25]:
columns_to_drop = [
    'ID', 
    'Number', 
    'Airport_Code',
    'End_Lat',
    'End_Time',
    'End_Lng', 
    'Description', 
    'Source', 
    'Timezone',
    'Country',
    "County", 
    'Records', 
    'Number of Records', 
    'Astronomical_Twilight',
    'Nautical_Twilight',
    'Civil_Twilight',
    'Zipcode',  
    'Turning_Loop',
    'count of Bump', 
    'count Traffic Signal', 
    'count of county', 
    'Count of Crossing',
    'Count of accidents',
    'Wind_Chill(F)', 
    'Weather_Timestamp', 
    "TMC", 
    "Street", 
    'Distance(mi)',
    'Precipitation(in)', 
]

In [26]:
df = df.drop(columns=columns_to_drop)

In [27]:
import joblib
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler,PowerTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report



class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoders = {}

    def fit(self, X, y=None):
        for col in X.columns:
            le = LabelEncoder()
            le.fit(X[col].astype(str))
            self.encoders[col] = dict(zip(le.classes_, le.transform(le.classes_)))
        return self

    def transform(self, X):
        X_transformed = X.copy()
        for col in X.columns:
            mapping = self.encoders[col]
            X_transformed[col] = X_transformed[col].astype(str).map(mapping).fillna(0).astype(int)
            print("After CustomLabeling:", X_transformed.shape)

        return X_transformed


class GeographicEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X["Lat_sin"] = np.sin(np.radians(X["Start_Lat"]))
        X["Lat_cos"] = np.cos(np.radians(X["Start_Lat"]))
        X["Lng_sin"] = np.sin(np.radians(X["Start_Lng"]))
        X["Lng_cos"] = np.cos(np.radians(X["Start_Lng"]))
        print("After GeoEncoding:", X.shape)

        return X.drop(columns=["Start_Lat", "Start_Lng"])



class SeverityEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X = X[X['Severity'].isin([2, 3, 4])]
        X['Severity'] = X['Severity'].map({2: 0, 3: 1, 4: 2})
        print("After Severity:", X.shape)

        return X


class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        X['Start_Time'] = pd.to_datetime(X['Start_Time'], errors='coerce')

        X['Hour'] = X['Start_Time'].dt.hour
        X['DayOfWeek'] = X['Start_Time'].dt.dayofweek
        X['IsWeekend'] = X['DayOfWeek'] >= 5

        X['IsNight'] = (X['Sunrise_Sunset'] == 'Night').astype(int)
        X['IsRainy'] = X['Weather_Condition'].str.contains('Rain|Storm', na=False).astype(int)
        X['IsFoggy'] = X['Weather_Condition'].str.contains('Fog|Haze', na=False).astype(int)

        X['HasObstacle'] = X[['Amenity', 'Bump', 'Traffic_Calming', 'Crossing', 
                              'Junction', 'Stop', 'Traffic_Signal']].sum(axis=1) > 0
        X['HasObstacle'] = X['HasObstacle'].astype(int)

        X.drop(columns=[
            'Start_Time', 'DayOfWeek',
            'Sunrise_Sunset', 'Weather_Condition',
            'Amenity', 'Bump', 'Traffic_Calming', 'Crossing',
            'Junction', 'Stop', 'Traffic_Signal'
        ], inplace=True, errors='ignore')
        print("After Feature enginiring:", X.shape)
        
        return X

def create_preprocessing_pipeline():
    categorical_columns = ['Calculation1', 'City', 'Side', 'State', 'Wind_Direction']
    numeric_standard = ['Temperature(F)']
    numeric_minmax = ['Humidity(%)']
    numeric_power = ['Wind_Speed(mph)', 'Pressure(in)', 'Visibility(mi)']
    
    preprocessor = ColumnTransformer([
        ('categorical', CustomLabelEncoder(), categorical_columns),
        ('numeric_standard', StandardScaler(), numeric_standard),
        ('numeric_minmax', MinMaxScaler(), numeric_minmax),
        ('numeric_power', PowerTransformer(), numeric_power),
    ], remainder='passthrough')

    pipeline = Pipeline([
        ('feature_engineering', FeatureEngineer()),
        ('geographic_encoder', GeographicEncoder()),
        ('preprocessor', preprocessor),
    ])

    return pipeline


def create_full_model_pipeline():
    preprocessing_pipeline = create_preprocessing_pipeline()
    full_pipeline = Pipeline([
        ('preprocessing', preprocessing_pipeline),
        ('classifier', xgb.XGBClassifier(
            device='cuda',
            tree_method='hist',
            objective='multi:softprob',
            random_state=42,
            eval_metric='mlogloss',
            verbosity=0
        ))
    ])

    return full_pipeline

def load_and_predict(model_path, new_data):
    """Load the saved pipeline and make predictions"""
    pipeline = joblib.load(model_path)
    predictions = pipeline.predict(new_data)
    probabilities = pipeline.predict_proba(new_data)

    return predictions, probabilities


In [28]:

missing_values = df.isna().sum()
missing_values_to_drop = missing_values[missing_values < 9000].index
df = df.dropna(subset=missing_values_to_drop)

weather_cols = ["Temperature(F)", "Visibility(mi)", "Pressure(in)", "Wind_Speed(mph)", "Humidity(%)"]
df.loc[:, weather_cols]= df[weather_cols].fillna(df[weather_cols].median())

categorical_columns = ["Weather_Condition", "Wind_Direction"]
for col in categorical_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

df = df.drop_duplicates()

boolean_columns = df.select_dtypes(include=[bool]).columns
df[boolean_columns] = df[boolean_columns].astype(int)
before = set(df.columns)
severity_encoder = SeverityEncoder()
after = set(df.columns)
df = severity_encoder.transform(df)

X = df.drop('Severity', axis=1)
y = df['Severity']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=42, stratify=y
)
print("Dropped:", before - after)

After Severity: (2232102, 28)
Dropped: set()


In [29]:
preprocessing_pipeline = create_preprocessing_pipeline()

In [30]:
preprocessing_pipeline.fit(X_train)

After Feature enginiring: (2120496, 23)
After GeoEncoding: (2120496, 27)
After CustomLabeling: (2120496, 5)
After CustomLabeling: (2120496, 5)
After CustomLabeling: (2120496, 5)
After CustomLabeling: (2120496, 5)
After CustomLabeling: (2120496, 5)


In [31]:
X_train_transformed = preprocessing_pipeline.transform(X_train)

After Feature enginiring: (2120496, 23)
After GeoEncoding: (2120496, 27)
After CustomLabeling: (2120496, 5)
After CustomLabeling: (2120496, 5)
After CustomLabeling: (2120496, 5)
After CustomLabeling: (2120496, 5)
After CustomLabeling: (2120496, 5)


In [32]:
X_test_transformed = preprocessing_pipeline.transform(X_test)
print(f"Training data shape after preprocessing: {X_train_transformed.shape}")

After Feature enginiring: (111606, 23)
After GeoEncoding: (111606, 27)
After CustomLabeling: (111606, 5)
After CustomLabeling: (111606, 5)
After CustomLabeling: (111606, 5)
After CustomLabeling: (111606, 5)
After CustomLabeling: (111606, 5)
Training data shape after preprocessing: (2120496, 25)


In [33]:
weights = compute_class_weight('balanced', classes=[0,1,2], y=y_train)

In [34]:
param_grid = {
    'n_estimators': [400, 500, 1000],
    'max_depth': [8,12],
    'learning_rate': [0.15, 0.09, 0.01], 
    'subsample': [0.8, 1.0],     
}

xgb_classifier = xgb.XGBClassifier(
    tree_method='hist',
    device='cuda',
    objective='multi:softprob',
    random_state=41,
    eval_metric='mlogloss',
    verbosity=1,
    max_bin=256,
    learning_rate=0.05,
    max_depth=10,
    n_estimators=500,
    subsample=0.8
)

grid_search = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid,
    cv=2,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=2,
)
import warnings


In [35]:
y.value_counts()

Severity
0    1448030
1     713311
2      70761
Name: count, dtype: int64

In [36]:
warnings.filterwarnings("ignore")
xgb_classifier.set_params(scale_pos_weight=weights[2] / weights[0])
xgb_classifier.fit(X_train_transformed, y_train)

In [37]:
y_pred = xgb_classifier.predict(X_test_transformed)
test_accuracy = accuracy_score(y_test, y_pred)


feature_importance = pd.DataFrame({
    'feature': [f'feature_{i}' for i in range(len(xgb_classifier.feature_importances_))],
    'importance': xgb_classifier.feature_importances_
}).sort_values('importance', ascending=False)


# Create final pipeline with best model
final_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', xgb_classifier)
])

joblib.dump(final_pipeline, "xgb.pkl")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.87      0.86     72402
           1       0.72      0.75      0.74     35666
           2       0.65      0.23      0.34      3538

    accuracy                           0.81    111606
   macro avg       0.74      0.62      0.65    111606
weighted avg       0.81      0.81      0.81    111606


Top 10 Most Important Features:
       feature  importance
0    feature_0    0.260670
2    feature_2    0.186067
22  feature_22    0.061347
20  feature_20    0.059389
16  feature_16    0.058126
3    feature_3    0.052910
24  feature_24    0.044617
23  feature_23    0.041349
12  feature_12    0.034408
21  feature_21    0.032217


In [38]:
test_accuracy

0.8109868645054925