# Import libraries and models

In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

# Load the data

In [2]:
# Read the competition's train and test splits in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e10/train.csv',
        '/kaggle/input/playground-series-s5e10/test.csv',
    )
)

# Clean the data and build the preprocessor

In [3]:
# Boolean flag columns; they are cast to integers below so they are handled
# by the numeric branch of the preprocessor.
bool_cols = ['road_signs_present', 'public_road', 'holiday', 'school_season']

# Encode the boolean columns as integers in both frames.
train[bool_cols] = train[bool_cols].astype(int)
test[bool_cols] = test[bool_cols].astype(int)

# Split features from the target; `id` is excluded from the feature matrix.
X = train.drop(['id', 'accident_risk'], axis=1)
y = train['accident_risk']
X_test = test.drop(['id'], axis=1)

# Partition the feature columns by dtype *family* rather than by comparing
# against literal dtype names. The ColumnTransformer below drops any column
# that is not listed in one of its branches, so a feature whose dtype was not
# on a hard-coded list (float32, int16, 'category', a dedicated string dtype,
# ...) would silently vanish from the model. include/exclude on 'number'
# guarantees every feature lands in exactly one branch.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(exclude='number').columns.tolist()

# Numeric branch: fill missing values with the column median.
num_transformer = SimpleImputer(strategy='median')

# Categorical branch: fill missing values with the literal 'missing', then
# one-hot encode. handle_unknown='ignore' keeps transform from failing on
# categories that were not seen during fit.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

# Train and predict

In [4]:
# XGBoost regressor with a fixed random_state.
model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    random_state=0,
    n_jobs=-1,
)

# Chain preprocessing and the regressor so they are always fitted together,
# including inside each cross-validation fold.
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# The scorer returns negated MAE, so flip the sign to report plain MAE.
scores = -cross_val_score(
    my_pipeline,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=5,
)
print('Average MAE from CV:', scores.mean())

# Refit on the full training set, then predict the test set.
test_preds = my_pipeline.fit(X, y).predict(X_test)

Average MAE from CV: 0.04360385688312777


# Submission

In [5]:
# Pair each test id with its prediction and write the submission file.
submission = pd.DataFrame({'id': test['id']})
submission['accident_risk'] = test_preds
submission.to_csv('submission.csv', index=False)
print('Submission file created!')

Submission file created!
