In [2]:
from google.colab import files
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
files.upload()  # Upload the 'kaggle.json' file

KeyboardInterrupt: 

In [2]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

mv: cannot stat 'kaggle.json': No such file or directory


In [None]:
!kaggle datasets download -d sobhanmons/analysis-of-us-accidents
!unzip analysis-of-us-accidents.zip -d /content/data

In [3]:
df = pd.read_csv("data/US_Accidents_May19_Migrated Data.csv")

In [10]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    LabelEncoder,
    StandardScaler,
    MinMaxScaler,
    PowerTransformer
)
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import accuracy_score, classification_report
import joblib


class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoders = {}

    def fit(self, X, y=None):
        for col in X.columns:
            encoder = LabelEncoder()
            encoder.fit(X[col])
            self.encoders[col] = encoder
        return self

    def transform(self, X):
        X_transformed = X.copy()
        for col in X.columns:
            encoder = self.encoders[col]
            X_transformed[col] = X_transformed[col].apply(
                lambda x: encoder.transform([x])[0] if x in encoder.classes_ else 0
            )
        return X_transformed


class GeographicEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X_transformed):

        X_transformed["Lat_sin"] = np.sin(np.radians(X_transformed["Start_Lat"]))
        X_transformed["Lat_cos"] = np.cos(np.radians(X_transformed["Start_Lat"]))
        X_transformed["Lng_sin"] = np.sin(np.radians(X_transformed["Start_Lng"]))
        X_transformed["Lng_cos"] = np.cos(np.radians(X_transformed["Start_Lng"]))

        X_transformed = X_transformed.drop(columns=["Start_Lat", "Start_Lng"])

        return X_transformed


class DateTimeEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X_transformed):
        X_transformed["Start_Time"] = pd.to_datetime(X_transformed["Start_Time"]).astype(int) // 10**9
        return X_transformed


class SeverityEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X_transformed):
        X_transformed = X_transformed[X_transformed['Severity'].isin([2, 3, 4])]
        severity_map = {2: 0, 3: 1, 4: 2}
        X_transformed['Severity'] = X_transformed['Severity'].map(severity_map)

        return X_transformed


def create_preprocessing_pipeline():
    categorical_columns = ['Calculation1', 'City', 'Side', 'State', 'Sunrise_Sunset', 'Weather_Condition', 'Wind_Direction']
    numeric_standard = ['Temperature(F)']
    numeric_minmax = ['Humidity(%)']
    numeric_power = ['Wind_Speed(mph)', 'Pressure(in)', 'Visibility(mi)']

    preprocessor = ColumnTransformer([
        ('categorical', CustomLabelEncoder(), categorical_columns),
        ('numeric_standard', StandardScaler(), numeric_standard),
        ('numeric_minmax', MinMaxScaler(), numeric_minmax),
        ('numeric_power', PowerTransformer(), numeric_power),
    ], remainder='passthrough')

    pipeline = Pipeline([
        ('datetime_encoder', DateTimeEncoder()),
        ('geographic_encoder', GeographicEncoder()),
        ('preprocessor', preprocessor),
    ])

    return pipeline


def create_full_model_pipeline():
    preprocessing_pipeline = create_preprocessing_pipeline()
    full_pipeline = Pipeline([
        ('preprocessing', preprocessing_pipeline),
        ('classifier', xgb.XGBClassifier(
            device='cuda',
            tree_method='hist',
            objective='multi:softprob',
            random_state=42,
            eval_metric='mlogloss',
            verbosity=0
        ))
    ])

    return full_pipeline

def load_and_predict(model_path, new_data):
    """Load the saved pipeline and make predictions"""
    pipeline = joblib.load(model_path)
    predictions = pipeline.predict(new_data)
    probabilities = pipeline.predict_proba(new_data)

    return predictions, probabilities


In [5]:
missing_values = df.isna().sum()
missing_values_to_drop = missing_values[missing_values < 9000].index
df = df.dropna(subset=missing_values_to_drop)

weather_cols = ["Temperature(F)", "Visibility(mi)", "Pressure(in)", "Wind_Speed(mph)", "Humidity(%)"]
df.loc[:, weather_cols]= df[weather_cols].fillna(df[weather_cols].median())

categorical_columns = ["Weather_Condition", "Wind_Direction"]
for col in categorical_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

df = df.drop_duplicates()

boolean_columns = df.select_dtypes(include=[bool]).columns
df[boolean_columns] = df[boolean_columns].astype(int)

severity_encoder = SeverityEncoder()
df = severity_encoder.transform(df)

X = df.drop('Severity', axis=1)
y = df['Severity']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mode()[0])


In [11]:
preprocessing_pipeline = create_preprocessing_pipeline()

In [None]:
preprocessing_pipeline.fit(X_train)

In [None]:
X_train_transformed = preprocessing_pipeline.transform(X_train)

In [None]:
X_test_transformed = preprocessing_pipeline.transform(X_test)
print(f"Training data shape after preprocessing: {X_train_transformed.shape}")

In [None]:
sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)

In [None]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 7],
    'learning_rate': [0.1, 0.2],
    'subsample': [0.8, 1.0],
}

xgb_classifier = xgb.XGBClassifier(
    device='cuda',
    tree_method='hist',
    objective='multi:softprob',
    random_state=42,
    eval_metric='mlogloss',
    verbosity=1,
    max_bin=256,
)

grid_search = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid,
    cv=2,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=2,
    pre_dispatch='2*n_jobs',
    return_train_score=False
)


In [None]:
grid_search.fit(X_train_transformed, y_train, sample_weight=sample_weight)


In [None]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_transformed)
test_accuracy = accuracy_score(y_test, y_pred)


feature_importance = pd.DataFrame({
    'feature': [f'feature_{i}' for i in range(len(best_model.feature_importances_))],
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)


# Create final pipeline with best model
final_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', best_model)
])

joblib.dump(final_pipeline, "xgb.pkl")

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))