In [None]:
import pandas as pd
import numpy as np
from google.colab import files
# Open the Colab upload dialog; the selected CSV files are written to the
# current working directory, from which they are read below.
uploaded = files.upload()

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Keep the test ids aside and strip the `id` column from both frames.
test_ids = test.pop('id')
del train['id']

Saving sample_submission.csv to sample_submission.csv
Saving test.csv to test.csv
Saving train.csv to train.csv


In [None]:
#final program
!pip install optuna lightgbm xgboost catboost scikit-learn pandas numpy -q

import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Load the competition data from the Colab working directory.
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")

# Remember the test ids for the submission file, then remove the `id`
# column from both frames.
test_ids = test['id']
train = train.drop(columns='id')
test = test.drop(columns='id')


# Coerce the listed columns to numeric (unparseable entries become NaN) and
# fill the gaps with the *training* mean, so that train and test are imputed
# with the same value.
for col in ['humidity', 'wind_speed', 'pressure']:
    train[col] = pd.to_numeric(train[col], errors='coerce')
    test[col] = pd.to_numeric(test[col], errors='coerce')
    mean_val = train[col].mean()
    # Assign the result back instead of `train[col].fillna(..., inplace=True)`:
    # in-place fillna on a column selection is chained assignment, which warns
    # on pandas 2.x and leaves the frame unchanged under Copy-on-Write.
    train[col] = train[col].fillna(mean_val)
    test[col] = test[col].fillna(mean_val)


def add_features(df):
    """Return a copy of ``df`` with engineered feature columns added.

    Reads the columns ``voltage``, ``current``, ``module_temperature``,
    ``temperature`` and ``irradiance`` and adds:

    * ``power``                 = voltage * current
    * ``temp_diff``             = module_temperature - temperature
    * ``irradiance_temp_ratio`` = irradiance / (temperature + 1)
    * ``efficiency_estimate``   = power / (irradiance + 1)
    * ``log_irradiance``        = log1p(irradiance)
    * ``log_power``             = log1p(power)

    The input frame is not modified. Infinite values in the engineered
    columns (zero denominator, ``log1p(-1)``) are replaced with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five input columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the six engineered ones.

    Raises
    ------
    KeyError
        If one of the required input columns is missing.
    """
    # Work on a copy so that the caller's frame is left untouched.
    df = df.copy()

    df['power'] = df['voltage'] * df['current']
    df['temp_diff'] = df['module_temperature'] - df['temperature']
    df['irradiance_temp_ratio'] = df['irradiance'] / (df['temperature'] + 1)
    df['efficiency_estimate'] = df['power'] / (df['irradiance'] + 1)
    df['log_irradiance'] = np.log1p(df['irradiance'])
    df['log_power'] = np.log1p(df['power'])

    # A denominator of exactly zero or log1p(-1) yields +/-inf, which
    # scikit-learn's input validation rejects; treat those as missing instead.
    engineered = [
        'power', 'temp_diff', 'irradiance_temp_ratio',
        'efficiency_estimate', 'log_irradiance', 'log_power',
    ]
    df[engineered] = df[engineered].replace([np.inf, -np.inf], np.nan)
    return df

# Add the engineered features to both data sets.
train, test = add_features(train), add_features(test)

# Columns that are one-hot encoded; every other column except the target
# `efficiency` is standardised.
categorical_cols = ['string_id', 'error_code', 'installation_type']
non_numeric_cols = ['efficiency'] + categorical_cols
numerical_cols = list(train.drop(columns=non_numeric_cols).columns)

numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Separate the target from the features and hold out 20% for validation.
y_full = train['efficiency']
X_full = train.drop(columns=['efficiency'])
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42
)

# Fit the preprocessing on the training split only, then apply the fitted
# transformer to the training, validation and test features.
preprocessor.fit(X_train)
X_train_trans, X_val_trans, X_test_trans = (
    preprocessor.transform(frame) for frame in (X_train, X_val, test)
)

def custom_score(y_true, y_pred):
    """Return ``100 * (1 - RMSE)`` for the given targets and predictions.

    Higher is better; a perfect prediction scores 100.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return 100 * (1 - rmse)

def lgb_objective(trial):
    """Optuna objective for LightGBM.

    Samples a hyperparameter set from ``trial``, fits an ``LGBMRegressor`` on
    ``X_train_trans`` / ``y_train`` and returns the RMSE on the validation
    split (lower is better).
    """
    # NOTE(review): per the LightGBM docs `subsample` only takes effect when
    # `subsample_freq` > 0, which is not set here -- confirm whether bagging
    # was intended.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 300, 700),
        learning_rate=trial.suggest_float('learning_rate', 0.02, 0.08),
        num_leaves=trial.suggest_int('num_leaves', 20, 50),
        max_depth=trial.suggest_int('max_depth', 4, 10),
        min_child_samples=trial.suggest_int('min_child_samples', 15, 50),
        subsample=trial.suggest_float('subsample', 0.7, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.7, 1.0),
        random_state=42,
    )
    regressor = LGBMRegressor(**search_space)
    regressor.fit(X_train_trans, y_train)
    val_mse = mean_squared_error(y_val, regressor.predict(X_val_trans))
    return np.sqrt(val_mse)

# Search LightGBM hyperparameters. The sampler is seeded so that the 15
# trials (and therefore the selected parameters) are reproducible.
lgb_study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
lgb_study.optimize(lgb_objective, n_trials=15)

# `best_params` only contains the values suggested through the trial; the
# fixed `random_state` used inside the objective has to be passed again so
# that the refit model matches the one that was evaluated.
best_lgb = LGBMRegressor(**lgb_study.best_params, random_state=42)
best_lgb.fit(X_train_trans, y_train)


def xgb_objective(trial):
    """Optuna objective for XGBoost.

    Samples a hyperparameter set from ``trial``, fits an ``XGBRegressor`` on
    ``X_train_trans`` / ``y_train`` and returns the RMSE on the validation
    split (lower is better).
    """
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 300, 700),
        max_depth=trial.suggest_int('max_depth', 4, 8),
        learning_rate=trial.suggest_float('learning_rate', 0.02, 0.08),
        subsample=trial.suggest_float('subsample', 0.7, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.7, 1.0),
        random_state=42,
    )
    regressor = XGBRegressor(**search_space)
    regressor.fit(X_train_trans, y_train, verbose=False)
    val_mse = mean_squared_error(y_val, regressor.predict(X_val_trans))
    return np.sqrt(val_mse)

# Search XGBoost hyperparameters. The sampler is seeded so that the 15
# trials (and therefore the selected parameters) are reproducible.
xgb_study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
xgb_study.optimize(xgb_objective, n_trials=15)

# `best_params` only contains the values suggested through the trial; the
# fixed `random_state` used inside the objective has to be passed again so
# that the refit model matches the one that was evaluated.
best_xgb = XGBRegressor(**xgb_study.best_params, random_state=42)
best_xgb.fit(X_train_trans, y_train)

def cat_objective(trial):
    """Optuna objective for CatBoost.

    Samples a hyperparameter set from ``trial``, fits a ``CatBoostRegressor``
    on ``X_train_trans`` / ``y_train`` and returns the RMSE on the validation
    split (lower is better).
    """
    search_space = dict(
        iterations=trial.suggest_int('iterations', 300, 700),
        depth=trial.suggest_int('depth', 4, 8),
        learning_rate=trial.suggest_float('learning_rate', 0.02, 0.08),
        random_seed=42,
        verbose=0,
    )
    regressor = CatBoostRegressor(**search_space)
    regressor.fit(X_train_trans, y_train)
    val_mse = mean_squared_error(y_val, regressor.predict(X_val_trans))
    return np.sqrt(val_mse)

# Search CatBoost hyperparameters. The sampler is seeded so that the 15
# trials (and therefore the selected parameters) are reproducible.
cat_study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
cat_study.optimize(cat_objective, n_trials=15)

# `best_params` only contains the values suggested through the trial; the
# fixed `random_seed` and `verbose` settings used inside the objective have
# to be passed again so that the refit model matches the evaluated one and
# does not print per-iteration training logs.
best_cat = CatBoostRegressor(**cat_study.best_params, random_seed=42, verbose=0)
best_cat.fit(X_train_trans, y_train)
# Stack the three tuned regressors under an XGBoost meta-learner, fit the
# ensemble on the training split and report the score on the validation split.
base_learners = [
    ('lgbm', best_lgb),
    ('xgb', best_xgb),
    ('cat', best_cat),
]
meta_learner = XGBRegressor(
    n_estimators=150, max_depth=4, learning_rate=0.05, random_state=42
)
stack = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_learner,
    n_jobs=-1,
)
stack.fit(X_train_trans, y_train)
val_preds = stack.predict(X_val_trans)
print("Validation Score:", custom_score(y_val, val_preds))

# Prediction: score the test set with the stacked model and write the
# submission file (one row per test id).
submission_path = "submission.csv"
final_preds = stack.predict(X_test_trans)
submission = pd.DataFrame({'id': test_ids, 'efficiency': final_preds})
submission.to_csv(submission_path, index=False)
print("submission.csv saved!")

# Trigger a browser download of the file from the Colab runtime.
from google.colab import files
files.download(submission_path)