In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    paths_in_folder = [os.path.join(folder, name) for name in names]
    for path in paths_in_folder:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/analysis-of-us-accidents/US_Accidents_May19_Migrated Data.csv


In [11]:
# Read the accidents CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    "/kaggle/input/analysis-of-us-accidents/US_Accidents_May19_Migrated Data.csv"
)

In [12]:
# Columns removed from the raw frame before modelling. The order is kept
# exactly as originally written; only the layout is regrouped for readability.
columns_to_drop = [
    # identifiers / free text / provenance
    "ID", "Number", "Airport_Code",
    # end-of-event fields
    "End_Lat", "End_Time", "End_Lng",
    "Description", "Source",
    # coarse location metadata
    "Timezone", "Country", "County",
    # record counters
    "Records", "Number of Records",
    # twilight variants
    "Astronomical_Twilight", "Nautical_Twilight", "Civil_Twilight",
    "Zipcode", "Turning_Loop",
    # pre-aggregated count columns
    "count of Bump", "count Traffic Signal", "count of county",
    "Count of Crossing", "Count of accidents",
    # remaining weather / road fields that are not used
    "Wind_Chill(F)", "Weather_Timestamp",
    "TMC", "Street",
    "Distance(mi)", "Precipitation(in)",
]

In [13]:
# `df` is overwritten here, so re-running this cell would otherwise raise a
# KeyError (the columns are already gone). errors="ignore" keeps the cell
# idempotent: the post-condition "these columns are absent" holds either way.
df = df.drop(columns=columns_to_drop, errors="ignore")

In [14]:
# Display the column labels currently present in `df`.
df.columns

Index(['Amenity', 'Bump', 'Calculation1', 'City', 'Crossing', 'Give_Way',
       'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Severity', 'Side',
       'Start_Time', 'State', 'Station', 'Stop', 'Sunrise_Sunset',
       'Temperature(F)', 'Traffic_Calming', 'Traffic_Signal', 'Visibility(mi)',
       'Weather_Condition', 'Wind_Direction', 'Humidity(%)', 'Pressure(in)',
       'Start_Lat', 'Start_Lng', 'Wind_Speed(mph)'],
      dtype='object')

In [15]:
import joblib
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler,PowerTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report



class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    """Integer-encode every column of a DataFrame with a per-column lookup table.

    ``fit`` casts each column to ``str``, fits a ``LabelEncoder`` on it and
    stores the resulting ``{category: code}`` dictionary in ``self.encoders``
    keyed by column name. ``transform`` applies those dictionaries.

    Categories that were not seen during ``fit`` are encoded as
    ``UNKNOWN_CODE`` (-1). They used to be encoded as 0, which is also the
    code of the first fitted category, so unseen values were silently
    aliased to a real category.
    """

    # Code assigned to categories absent from the fitted mapping. Valid codes
    # start at 0, so -1 can never collide with a fitted category.
    UNKNOWN_CODE = -1

    def __init__(self):
        self.encoders = {}

    def fit(self, X, y=None):
        """Learn one ``{category: code}`` mapping per column of ``X``.

        Parameters
        ----------
        X : DataFrame whose columns are all to be encoded.
        y : ignored; accepted for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Start from an empty dict so a refit never keeps mappings that
        # belong to columns of a previous fit.
        self.encoders = {}
        for col in X.columns:
            le = LabelEncoder()
            le.fit(X[col].astype(str))
            self.encoders[col] = dict(zip(le.classes_, le.transform(le.classes_)))
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every column replaced by integer codes.

        Raises ``KeyError`` if ``X`` contains a column that was not present
        during ``fit``.
        """
        X_transformed = X.copy()
        for col in X.columns:
            mapping = self.encoders[col]
            codes = X_transformed[col].astype(str).map(mapping)
            # map() yields NaN for categories missing from the mapping.
            X_transformed[col] = codes.fillna(self.UNKNOWN_CODE).astype(int)

        return X_transformed

        
class GeographicEncoder(BaseEstimator, TransformerMixin):
    """Replace ``Start_Lat`` / ``Start_Lng`` with their sine and cosine.

    The coordinates are converted from degrees to radians first. The output
    gains ``Lat_sin``, ``Lat_cos``, ``Lng_sin`` and ``Lng_cos`` (appended in
    that order) and loses the two raw coordinate columns.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned."""
        return self

    def transform(self, X):
        """Return a new frame with the trigonometric coordinate features."""
        lat_rad = np.radians(X["Start_Lat"])
        lng_rad = np.radians(X["Start_Lng"])

        encoded = X.assign(
            Lat_sin=np.sin(lat_rad),
            Lat_cos=np.cos(lat_rad),
            Lng_sin=np.sin(lng_rad),
            Lng_cos=np.cos(lng_rad),
        )
        return encoded.drop(columns=["Start_Lat", "Start_Lng"])



class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Derive time, weather and road-feature flags, then drop their sources.

    Added columns (in this order): ``Hour``, ``IsWeekend``, ``IsNight``,
    ``IsRainy``, ``IsFoggy``, ``HasObstacle``. All flags are integer 0/1.

    Removed columns: ``Start_Time``, ``Sunrise_Sunset``, ``Weather_Condition``
    and the seven road-feature columns summed into ``HasObstacle``.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered features.

        ``Start_Time`` values that do not match ``%m/%d/%Y %I:%M:%S %p`` are
        coerced to NaT, which gives a missing ``Hour`` and ``IsWeekend == 0``.
        """
        X = X.copy()
        start_time = pd.to_datetime(
            X['Start_Time'], format='%m/%d/%Y %I:%M:%S %p', errors='coerce'
        )

        X['Hour'] = start_time.dt.hour
        # Cast to int like every other flag below. Left as bool, this was the
        # only boolean column among numeric passthrough columns, and pandas
        # falls back to object dtype when bool and numeric columns are
        # converted to one array.
        X['IsWeekend'] = (start_time.dt.dayofweek >= 5).astype(int)

        X['IsNight'] = (X['Sunrise_Sunset'] == 'Night').astype(int)
        X['IsRainy'] = X['Weather_Condition'].str.contains('Rain|Storm', na=False).astype(int)
        X['IsFoggy'] = X['Weather_Condition'].str.contains('Fog|Haze', na=False).astype(int)

        obstacle_columns = ['Amenity', 'Bump', 'Traffic_Calming', 'Crossing',
                            'Junction', 'Stop', 'Traffic_Signal']
        # A row has an obstacle when at least one of these columns is set.
        X['HasObstacle'] = (X[obstacle_columns].sum(axis=1) > 0).astype(int)

        # 'DayOfWeek' stays in the drop list so a pre-existing column of that
        # name is still removed, exactly as before.
        return X.drop(
            columns=['Start_Time', 'DayOfWeek', 'Sunrise_Sunset', 'Weather_Condition']
            + obstacle_columns,
            errors='ignore',
        )

class DataCleaner(BaseEstimator, TransformerMixin):
    """Drop sparse-missing rows, impute weather/categorical gaps, de-duplicate.

    ``fit`` now learns the cleaning statistics, and ``transform`` reuses them:

    * ``drop_subset_`` - columns with fewer than ``max_missing_for_row_drop``
      missing values; rows missing any of them are dropped.
    * ``medians_`` - per-column medians of the weather columns, computed
      after that row drop.
    * ``modes_`` - most frequent value of each categorical column, computed
      after that row drop.

    Previously all three were recomputed from whatever batch reached
    ``transform``. A small batch therefore dropped every row containing any
    NaN and imputed with its own statistics, and an all-NaN categorical
    column crashed on ``mode()[0]``. On the data used for ``fit`` the output
    is unchanged.

    NOTE(review): ``transform`` still removes rows (NaN rows and duplicates)
    and resets the index, so its output can be shorter than its input.
    Callers that hold labels separately must clean features and labels
    together.

    Parameters
    ----------
    max_missing_for_row_drop : int, default 9000
        Columns with fewer missing values than this trigger a row drop
        instead of imputation.
    """

    WEATHER_COLUMNS = ["Temperature(F)", "Visibility(mi)", "Pressure(in)",
                       "Wind_Speed(mph)", "Humidity(%)"]
    CATEGORICAL_COLUMNS = ["Weather_Condition", "Wind_Direction"]

    def __init__(self, max_missing_for_row_drop=9000):
        self.max_missing_for_row_drop = max_missing_for_row_drop

    def _learn_statistics(self, X):
        """Return ``(drop_subset, medians, modes)`` computed from ``X``."""
        missing_values = X.isna().sum()
        drop_subset = list(
            missing_values[missing_values < self.max_missing_for_row_drop].index
        )
        kept = X.dropna(subset=drop_subset)

        medians = kept[self.WEATHER_COLUMNS].median()

        modes = {}
        for col in self.CATEGORICAL_COLUMNS:
            most_common = kept[col].mode()
            # mode() is empty for an all-NaN column; leave that column as is.
            if not most_common.empty:
                modes[col] = most_common.iloc[0]
        return drop_subset, medians, modes

    def fit(self, X, y=None):
        """Learn the row-drop column set and the imputation values from ``X``."""
        self.drop_subset_, self.medians_, self.modes_ = self._learn_statistics(X)
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X`` with a fresh 0..n-1 index."""
        X = X.copy()

        if hasattr(self, "medians_"):
            # Ignore learned columns that are absent from this frame.
            drop_subset = [col for col in self.drop_subset_ if col in X.columns]
            medians, modes = self.medians_, self.modes_
        else:
            # Not fitted: fall back to the previous per-batch behaviour.
            drop_subset, medians, modes = self._learn_statistics(X)

        X = X.dropna(subset=drop_subset)

        X[self.WEATHER_COLUMNS] = X[self.WEATHER_COLUMNS].fillna(medians)

        for col, fill_value in modes.items():
            X[col] = X[col].fillna(fill_value)

        X = X.drop_duplicates()

        boolean_columns = X.select_dtypes(include=[bool]).columns
        X[boolean_columns] = X[boolean_columns].astype(int)

        return X.reset_index(drop=True)
        
def create_preprocessing_pipeline():
    """Build the feature pipeline: engineering -> geo encoding -> column scaling.

    The final ``ColumnTransformer`` label-encodes the categorical columns,
    standard-scales temperature, min-max-scales humidity, power-transforms
    wind speed / pressure / visibility, and passes every other column through.
    """
    column_steps = [
        ('categorical', CustomLabelEncoder(),
         ['Calculation1', 'City', 'Side', 'State', 'Wind_Direction']),
        ('numeric_standard', StandardScaler(), ['Temperature(F)']),
        ('numeric_minmax', MinMaxScaler(), ['Humidity(%)']),
        ('numeric_power', PowerTransformer(),
         ['Wind_Speed(mph)', 'Pressure(in)', 'Visibility(mi)']),
    ]
    column_transformer = ColumnTransformer(column_steps, remainder='passthrough')

    return Pipeline([
        ('feature_engineering', FeatureEngineer()),
        ('geographic_encoder', GeographicEncoder()),
        ('preprocessor', column_transformer),
    ])


def create_full_model_pipeline(device='cuda'):
    """Build the cleaning -> features -> XGBoost classifier pipeline.

    Parameters
    ----------
    device : str, default 'cuda'
        Passed to ``XGBClassifier(device=...)``. The default keeps the
        original GPU behaviour; pass ``'cpu'`` to build the same pipeline on
        a machine without a CUDA device.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``'cleaning'``, ``'features'`` and
        ``'classifier'``.
    """
    xgb_classifier = xgb.XGBClassifier(
        tree_method='hist',
        device=device,
        objective='multi:softprob',
        random_state=2,
        eval_metric='mlogloss',
        verbosity=1,
        max_bin=256,
        learning_rate=0.03,
        max_depth=12,
        n_estimators=1000,
        subsample=0.8
    )

    full_pipeline = Pipeline([
        ('cleaning', DataCleaner()),
        ('features', create_preprocessing_pipeline()),
        ('classifier', xgb_classifier)
    ])

    return full_pipeline

In [16]:
# Keep severities 2-4 only and remap them to the contiguous labels 0-2.
df_filtered = df[df["Severity"].isin([2, 3, 4])].copy()
df_filtered["Severity"] = df_filtered["Severity"].map({2: 0, 3: 1, 4: 2})

X_raw = df_filtered.drop(columns=["Severity"]).reset_index(drop=True)
y_raw = df_filtered["Severity"].reset_index(drop=True)

pipeline = create_full_model_pipeline()

# The cleaning step drops rows (NaN rows and duplicates) and resets the index.
# Taking `y_raw.iloc[:len(X_transformed)]` afterwards paired the surviving
# feature rows with the *first n* labels instead of their own labels.
# Clean the labelled frame as a whole so each row carries its label along.
cleaner = pipeline.named_steps["cleaning"]
cleaner.fit(X_raw, y_raw)
labeled_clean = cleaner.transform(df_filtered.reset_index(drop=True))

# The cleaner de-duplicated on features + label. De-duplicate again on the
# features alone (keeping the first occurrence) so the surviving rows are the
# same ones the feature-only cleaning would keep.
feature_columns = list(X_raw.columns)
labeled_clean = (
    labeled_clean
    .drop_duplicates(subset=feature_columns)
    .reset_index(drop=True)
)

X_clean = labeled_clean[feature_columns]
y_transformed = labeled_clean["Severity"]

# None of the feature steps add or remove rows, so alignment is preserved.
X_transformed = pipeline.named_steps["features"].fit_transform(X_clean, y_transformed)
assert len(X_transformed) == len(y_transformed), "features and labels are misaligned"

# Balanced class weights, computed on the rows that are actually trained on.
classes = np.unique(y_transformed)
weights = compute_class_weight('balanced', classes=classes, y=y_transformed)
weight_map = dict(zip(classes, weights))
sample_weight = y_transformed.map(weight_map)

pipeline.named_steps["classifier"].fit(X_transformed, y_transformed, sample_weight=sample_weight)

In [17]:
import cloudpickle

# The pipeline was fitted on every column of X_raw. The previous hand-written
# list omitted columns it cannot run without: the road-feature flags read by
# FeatureEngineer (Amenity, Bump, Traffic_Calming, Crossing, Junction, Stop,
# Traffic_Signal) and the columns passed through to the classifier (Give_Way,
# No_Exit, Railway, Roundabout, Station). Derive the list from the training
# frame so it cannot drift from what was actually fitted.
INPUT_FEATURES = list(X_raw.columns)
pipeline.required_features = INPUT_FEATURES

# Serialise the fitted pipeline together with its custom transformer classes.
with open("xgb_full_pipeline1.pkl", "wb") as f:
    cloudpickle.dump(pipeline, f)