In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Read the three splits in one pass; each CSV is expected in the working directory.
train, val, test = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'val.csv', 'test.csv')
)

# Report the (rows, columns) of every split as a quick sanity check.
for caption, frame in (
    ("Train shape:", train),
    ("Validation shape:", val),
    ("Test shape:", test),
):
    print(caption, frame.shape)


In [None]:
# Name of the column the model learns to predict.
target = 'Lap_Time_Seconds'

# Histogram (30 bins) of the training target with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(train[target], kde=True, bins=30, ax=ax)
ax.set_title("Target Distribution (Lap Time Seconds)")
ax.set_xlabel("Lap Time")
ax.set_ylabel("Frequency")
plt.show()

# Summary statistics for the numeric columns of the training split.
print(train.describe())

# The ten columns with the most missing values, largest count first.
missing_top10 = train.isnull().sum().sort_values(ascending=False).head(10)
print("\nMissing values:\n", missing_top10)


In [None]:
def advanced_feature_engineering(df):
    """Return a copy of ``df`` extended with derived features.

    Every derived column is optional: it is added only when all of the
    columns it is computed from are present, so the function can be applied
    to frames whose schemas differ (for example a frame without the year
    columns simply gets no ``career_progress``).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched; all new columns are written to a
        copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by whichever of these
        could be computed, in this order: ``career_progress``,
        ``podium_per_start``, ``points_per_km``, ``tire_stress``,
        ``speed_squared``, ``speed_temp_ratio``, ``hour_of_day``,
        ``rider_team_combo``, ``rider_circuit_experience``,
        ``speed_per_corner``, ``temp_difference``, ``optimal_temp_range``.
    """
    eps = 1e-5  # added to every ratio denominator so an exact zero cannot divide

    # Work on a copy so the caller's frame is never mutated (and so passing a
    # slice of another frame cannot raise chained-assignment warnings).
    out = df.copy()
    available = set(df.columns)

    def has(*names):
        """True when every name in ``names`` is a column of the input."""
        return available.issuperset(names)

    # Merged frames may carry the year as 'Year_x'; prefer it over plain 'Year'.
    year_col = next((name for name in ('Year_x', 'Year') if has(name)), None)
    if year_col is not None and has('Min_year', 'Years_active'):
        out['career_progress'] = (out[year_col] - out['Min_year']) / (out['Years_active'] + eps)

    if has('Podiums', 'Starts'):
        out['podium_per_start'] = out['Podiums'] / (out['Starts'] + eps)

    if has('Points', 'Circuit_Length_km', 'Laps'):
        out['points_per_km'] = out['Points'] / (out['Circuit_Length_km'] * out['Laps'] + eps)

    if has('Tire_Degradation_Factor_per_Lap', 'Track_Temperature_Celsius'):
        out['tire_stress'] = (
            out['Tire_Degradation_Factor_per_Lap'] * out['Track_Temperature_Celsius'] / 100
        )

    if has('Avg_Speed_kmh'):
        out['speed_squared'] = out['Avg_Speed_kmh'] ** 2

    if has('Avg_Speed_kmh', 'Track_Temperature_Celsius'):
        out['speed_temp_ratio'] = out['Avg_Speed_kmh'] / (out['Track_Temperature_Celsius'] + eps)

    if has('Session'):
        # Fixed session -> hour lookup; sessions missing from the map get 12.
        session_map = {'Practice 1': 10, 'Practice 2': 14, 'Qualifying': 16, 'Race': 15}
        out['hour_of_day'] = out['Session'].map(session_map).fillna(12)

    if has('Rider_ID', 'Team'):
        out['rider_team_combo'] = out['Rider_ID'].astype(str) + '_' + out['Team'].astype(str)

    if has('Rider_ID', 'Circuit_name'):
        # Despite the name this is a string key "<rider>_<circuit>", not a count.
        out['rider_circuit_experience'] = (
            out['Rider_ID'].astype(str) + '_' + out['Circuit_name'].astype(str)
        )

    if has('Avg_Speed_kmh', 'Corners_per_Lap'):
        out['speed_per_corner'] = out['Avg_Speed_kmh'] / (out['Corners_per_Lap'] + eps)

    if has('Ambient_Temperature_Celsius', 'Track_Temperature_Celsius'):
        out['temp_difference'] = (
            out['Ambient_Temperature_Celsius'] - out['Track_Temperature_Celsius']
        )

    if has('Ambient_Temperature_Celsius'):
        # 1 when the ambient temperature lies in [20, 30] (both ends included), else 0.
        out['optimal_temp_range'] = out['Ambient_Temperature_Celsius'].between(20, 30).astype(int)

    return out


In [None]:
# Columns removed before modelling (the target among them).
drop_cols = ['Unique ID', 'Rider_name', 'Team_name', 'Bike_name', target]

# Yeo-Johnson transform of the target: fitted on the training split only,
# then applied unchanged to the validation split.
transformer = PowerTransformer(method='yeo-johnson')
train_target_2d = train[[target]]
val_target_2d = val[[target]]
y_train = transformer.fit_transform(train_target_2d).ravel()
y_val = transformer.transform(val_target_2d).ravel()


def _to_feature_frame(raw):
    """Drop the non-feature columns, then append the engineered features."""
    # errors='ignore' lets this run on frames that lack some of drop_cols.
    return advanced_feature_engineering(raw.drop(columns=drop_cols, errors='ignore'))


X_train, X_val, X_test = (_to_feature_frame(raw) for raw in (train, val, test))


In [None]:
def optimize_memory(df):
    """Shrink the dtypes of ``df`` in place and return the same frame.

    Object and category columns are cast to ``category``; every numeric
    column is passed through ``pd.to_numeric(..., downcast='float')``.
    The frame handed in is modified and is also the return value.
    """
    categorical_like = df.select_dtypes(include=['object', 'category']).columns
    for name in categorical_like:
        df[name] = df[name].astype('category')

    numeric_names = df.select_dtypes(include=np.number).columns
    df[numeric_names] = df[numeric_names].apply(
        lambda column: pd.to_numeric(column, downcast='float')
    )
    return df

# Compact the dtypes of all three feature frames, in train / val / test order.
X_train, X_val, X_test = (
    optimize_memory(frame) for frame in (X_train, X_val, X_test)
)


In [None]:
# LightGBM hyper-parameters for the regression model.
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=255,
    learning_rate=0.005,
    feature_fraction=0.8,
    bagging_freq=3,
    bagging_fraction=0.85,
    min_data_in_leaf=50,
    lambda_l1=0.1,
    lambda_l2=0.1,
    max_depth=-1,
    n_jobs=-1,
    verbose=-1,
)

# The validation Dataset references the training Dataset.
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Early stopping with stopping_rounds=500; evaluation logging with period 200.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=500),
    lgb.log_evaluation(200),
]

model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=10000,
    valid_sets=[train_data, val_data],
    valid_names=['train', 'valid'],
    callbacks=training_callbacks,
)


In [None]:
# Predictions come out on the Yeo-Johnson scale the model was trained on;
# map them back to seconds before scoring.
val_preds = model.predict(X_val)
val_preds_inv = transformer.inverse_transform(val_preds.reshape(-1, 1)).ravel()

# Score against the untouched validation target. Re-deriving it with
# inverse_transform(y_val) would push the ground truth through a needless
# transform / inverse-transform round trip and its floating-point error.
y_val_inv = val[target].to_numpy()

rmse = np.sqrt(mean_squared_error(y_val_inv, val_preds_inv))
mae = mean_absolute_error(y_val_inv, val_preds_inv)

print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation MAE: {mae:.4f}")

# Bar chart of the 20 most important features according to the trained model.
lgb.plot_importance(model, max_num_features=20, figsize=(10, 6))
plt.title("Top Feature Importances")
plt.show()


In [None]:
# Predict on the transformed scale, then map back to the original target scale.
test_preds_transformed = model.predict(X_test)
test_preds = transformer.inverse_transform(
    test_preds_transformed.reshape(-1, 1)
).ravel()

# Fill the sample submission's target column with the predictions and save it.
submission = pd.read_csv('sample_submission.csv').assign(Lap_Time_Seconds=test_preds)
submission.to_csv('submission.csv', index=False)

print("submission.csv saved successfully!")
