In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
# Import xgboost
import xgboost as xgb

In [3]:
train = pd.read_csv('/kaggle/input/playground-series-s5e10/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e10/test.csv')
sample_submission = pd.read_csv('/kaggle/input/playground-series-s5e10/sample_submission.csv')  

In [4]:
print("______________Train data______________")
print(train.head())
print("______________Test data______________")
print(test.head())
print(sample_submission.head())

______________Train data______________
   id road_type  num_lanes  curvature  speed_limit  lighting weather  \
0   0     urban          2       0.06           35  daylight   rainy   
1   1     urban          4       0.99           35  daylight   clear   
2   2     rural          4       0.63           70       dim   clear   
3   3   highway          4       0.07           35       dim   rainy   
4   4     rural          1       0.58           60  daylight   foggy   

   road_signs_present  public_road time_of_day  holiday  school_season  \
0               False         True   afternoon    False           True   
1                True        False     evening     True           True   
2               False         True     morning     True          False   
3                True         True     morning    False          False   
4               False        False     evening     True          False   

   num_reported_accidents  accident_risk  
0                       1           0.13

In [5]:
# Display information about the datasets
print("Train data shape")
print(train.shape)
print("Test data shape")
print(test.shape)
print("sample_submission data shape")
print(sample_submission.shape)

Train data shape
(517754, 14)
Test data shape
(172585, 13)
sample_submission data shape
(172585, 2)


In [6]:
# basic statistics
print(train['accident_risk'].describe())

count    517754.000000
mean          0.352377
std           0.166417
min           0.000000
25%           0.230000
50%           0.340000
75%           0.460000
max           1.000000
Name: accident_risk, dtype: float64


In [7]:
# check for missing data
train.isnull().sum()

id                        0
road_type                 0
num_lanes                 0
curvature                 0
speed_limit               0
lighting                  0
weather                   0
road_signs_present        0
public_road               0
time_of_day               0
holiday                   0
school_season             0
num_reported_accidents    0
accident_risk             0
dtype: int64

In [8]:
#road_type,lighting ,weather,time_of_day need to be encoded with "Label Encoding"

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
road_type_encoded = encoder.fit_transform(train["road_type"])
lighting_encoded = encoder.fit_transform(train["lighting"])
weather_encoded = encoder.fit_transform(train["weather"])
time_of_day_encoded = encoder.fit_transform(train["time_of_day"])

In [9]:
#road_signs_present,public_road,holiday,school_season need to be encoded with "One-Hot Encoding"
# True = 1, False = 0
road_signs_present_encoded = [int(x) for x in train["road_signs_present"]]  
public_road_encoded = [int(x) for x in train["public_road"]]  
holiday_encoded = [int(x) for x in train["holiday"]]  
school_season_encoded = [int(x) for x in train["school_season"]]  

In [10]:
print(train.head())

   id road_type  num_lanes  curvature  speed_limit  lighting weather  \
0   0     urban          2       0.06           35  daylight   rainy   
1   1     urban          4       0.99           35  daylight   clear   
2   2     rural          4       0.63           70       dim   clear   
3   3   highway          4       0.07           35       dim   rainy   
4   4     rural          1       0.58           60  daylight   foggy   

   road_signs_present  public_road time_of_day  holiday  school_season  \
0               False         True   afternoon    False           True   
1                True        False     evening     True           True   
2               False         True     morning     True          False   
3                True         True     morning    False          False   
4               False        False     evening     True          False   

   num_reported_accidents  accident_risk  
0                       1           0.13  
1                       0           

In [11]:
from sklearn.preprocessing import PolynomialFeatures
def preprocess_dataframe(df: pd.DataFrame, drop_cols: list = None) -> pd.DataFrame:
    df = df.copy()

    # Drop specified columns 
    if drop_cols:
        df = df.drop(columns=drop_cols, errors='ignore')

    # Encode boolean columns
    bool_cols = df.select_dtypes(include='bool').columns
    df[bool_cols] = df[bool_cols].astype(int)

    # Label encode categorical columns
    label_encoders = {}
    for col in df.select_dtypes(include=['object', 'category']).columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
        
    # Polynomial features
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    poly_features = poly.fit_transform(df[['num_lanes', 'curvature']])
    poly_cols = [f'poly_{i}' for i in range(poly_features.shape[1])]
    df[poly_cols] = poly_features

    print(train.head())
    
    return df

In [12]:
preprocess_train = preprocess_dataframe(train,drop_cols=['accident_risk','id'])

   id road_type  num_lanes  curvature  speed_limit  lighting weather  \
0   0     urban          2       0.06           35  daylight   rainy   
1   1     urban          4       0.99           35  daylight   clear   
2   2     rural          4       0.63           70       dim   clear   
3   3   highway          4       0.07           35       dim   rainy   
4   4     rural          1       0.58           60  daylight   foggy   

   road_signs_present  public_road time_of_day  holiday  school_season  \
0               False         True   afternoon    False           True   
1                True        False     evening     True           True   
2               False         True     morning     True          False   
3                True         True     morning    False          False   
4               False        False     evening     True          False   

   num_reported_accidents  accident_risk  
0                       1           0.13  
1                       0           

In [13]:
preprocess_train

Unnamed: 0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,poly_0,poly_1,poly_2
0,2,2,0.06,35,0,2,0,1,0,0,1,1,2.0,0.06,0.12
1,2,4,0.99,35,0,0,1,0,1,1,1,0,4.0,0.99,3.96
2,1,4,0.63,70,1,0,0,1,2,1,0,2,4.0,0.63,2.52
3,0,4,0.07,35,1,2,1,1,2,0,0,1,4.0,0.07,0.28
4,1,1,0.58,60,0,1,0,0,1,1,0,1,1.0,0.58,0.58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517749,0,4,0.10,70,0,1,1,1,0,0,0,2,4.0,0.10,0.40
517750,1,4,0.47,35,0,2,1,1,2,0,0,1,4.0,0.47,1.88
517751,2,4,0.62,25,0,1,0,0,0,0,1,0,4.0,0.62,2.48
517752,0,3,0.63,25,2,0,1,0,0,1,1,3,3.0,0.63,1.89


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

def get_score(n_estimators):
    my_pipeline = Pipeline(steps=[
        ('model', XGBRegressor(n_estimators=n_estimators, learning_rate=0.05) )
    ])
    scores = -1 * cross_val_score(my_pipeline, X_train_final, y_train_fianl,
                              cv=5,
                              scoring='neg_mean_absolute_error')
    cv_mae = scores.mean()

    # Fit on full training set
    my_pipeline.fit(X_train_final, y_train_fianl)
    
    # Validation score
    valid_preds = my_pipeline.predict(X_valid)
    valid_mae = mean_absolute_error(y_valid, valid_preds)
    
    #print(f"CV MAE: {cv_mae:.4f}")
    #print(f"Validation MAE: {valid_mae:.4f}")

    return cv_mae, valid_mae

In [15]:
train_data = preprocess_dataframe(train,drop_cols=['id'])
test_data_id = test['id']
test_data = preprocess_dataframe(test,drop_cols=['id'])

   id road_type  num_lanes  curvature  speed_limit  lighting weather  \
0   0     urban          2       0.06           35  daylight   rainy   
1   1     urban          4       0.99           35  daylight   clear   
2   2     rural          4       0.63           70       dim   clear   
3   3   highway          4       0.07           35       dim   rainy   
4   4     rural          1       0.58           60  daylight   foggy   

   road_signs_present  public_road time_of_day  holiday  school_season  \
0               False         True   afternoon    False           True   
1                True        False     evening     True           True   
2               False         True     morning     True          False   
3                True         True     morning    False          False   
4               False        False     evening     True          False   

   num_reported_accidents  accident_risk  
0                       1           0.13  
1                       0           

In [16]:
train_x = train_data.drop(['accident_risk'] , axis=1)
train_y = train_data['accident_risk']

In [17]:
X_train_final, X_valid, y_train_fianl, y_valid = train_test_split(train_x, train_y, train_size=0.8, test_size=0.2,random_state=0)

In [18]:
# Define the model
my_model_1 = XGBRegressor(n_estimators=1000, learning_rate=0.025) 

# Fit the model
my_model_1.fit(X_train_final, y_train_fianl)

# Get predictions
predictions_1 = my_model_1.predict(X_valid)

# Calculate MAE
mae_1 =  mean_absolute_error(predictions_1, y_valid)

# Uncomment to print MAE
print("Mean Absolute Error:" , mae_1)

Mean Absolute Error: 0.043599451007814795


In [19]:
# Define the model
my_model_2 = XGBRegressor(n_estimators=200, learning_rate=0.025) 

# Fit the model
my_model_2.fit(X_train_final, y_train_fianl)

# Get predictions
predictions_2 = my_model_2.predict(X_valid)

# Calculate MAE
mae_2 =  mean_absolute_error(predictions_2, y_valid)

# Uncomment to print MAE
print("Mean Absolute Error:" , mae_2)

Mean Absolute Error: 0.04379072902280048


In [20]:
# Define the model
my_model_3 = XGBRegressor(n_estimators=300, learning_rate=0.025) 

# Fit the model
my_model_3.fit(X_train_final, y_train_fianl)

# Get predictions
predictions_3 = my_model_3.predict(X_valid)

# Calculate MAE
mae_3 =  mean_absolute_error(predictions_3, y_valid)

# Uncomment to print MAE
print("Mean Absolute Error:" , mae_3)

Mean Absolute Error: 0.043693216580775


In [21]:
final_preds = np.round(my_model_3.predict(test_data),3)

In [22]:
print(final_preds)

[0.294 0.126 0.188 ... 0.252 0.129 0.49 ]


In [23]:
submission = pd.DataFrame({'id': test_data_id, 'accident_risk': final_preds})
submission.to_csv('submission.csv', index=False)
display(submission.head())

Unnamed: 0,id,accident_risk
0,517754,0.294
1,517755,0.126
2,517756,0.188
3,517757,0.326
4,517758,0.41
