In [4]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
import pickle

# Read the raw training table
df = pd.read_csv('train.csv')

# Columns handled as categorical features; every other column except 'id'
# is coerced to a numeric dtype below.
categorical_cols = ['string_id', 'error_code', 'installation_type']
non_numeric_cols = {'id', *categorical_cols}

# Show the value distribution of every object-typed column so that
# non-numeric tokens stand out before cleaning
print("üîç Checking for bad values:")
object_cols = [name for name in df.columns if df[name].dtype == 'object']
for name in object_cols:
    print(f"\nColumn: {name}")
    print(df[name].value_counts())

# Turn the literal 'badval' marker into a missing value
df.replace('badval', np.nan, inplace=True)

# Coerce the remaining columns to numbers; anything unparseable becomes NaN
for name in df.columns:
    if name in non_numeric_cols:
        continue
    df[name] = pd.to_numeric(df[name], errors='coerce')

# Discard every row that still has at least one missing value
df.dropna(inplace=True)

# One-hot encode the categorical columns (cast to str first)
for name in categorical_cols:
    df[name] = df[name].astype(str)
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Target is 'efficiency'; features are everything except it and 'id'
y = df_encoded['efficiency']
X = df_encoded.drop(columns=['id', 'efficiency'])

# Hold out 20% of the rows for validation (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize features using statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Scoring function
def get_score(y_true, y_pred):
    """Return ``100 * (1 - RMSE)`` for the given targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    float
        Higher is better; a perfect prediction (RMSE of 0) yields 100.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return 100 * (1 - rmse)

# Candidate regressors, keyed by the label used in the printed report
models = dict(
    LinearRegression=LinearRegression(),
    RandomForest=RandomForestRegressor(n_estimators=100, random_state=42),
    GradientBoosting=GradientBoostingRegressor(n_estimators=100, random_state=42),
    XGBoost=XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
)

# Running record of the strongest model seen so far
best_model, best_score = None, -np.inf

for name, model in models.items():
    # Fit on the scaled training split, then score on the held-out split
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_val_scaled)
    score = get_score(y_val, preds)
    print(f"{name} Score: {score:.2f}")

    # Strict comparison: on a tie the earlier model keeps the lead
    if score > best_score:
        best_model, best_score = model, score

print("\n‚úÖ Best Model:", type(best_model).__name__)
print("üéØ Best Score:", round(best_score, 2))

# Serialize the winning estimator to disk
with open("best_solar_model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)


üîç Checking for bad values:

Column: humidity
humidity
unknown               12
error                 10
badval                 7
5.943344787571525      1
65.80376774001192      1
                      ..
53.27151264182748      1
39.44616847963911      1
23.871028628631695     1
33.254590849551434     1
46.07720368087014      1
Name: count, Length: 4494, dtype: int64

Column: wind_speed
wind_speed
unknown               9
error                 8
badval                5
3.5726456615192026    1
2.3842914378686486    1
                     ..
8.061014922753328     1
3.9543370599359577    1
10.701360275854071    1
4.481020398384388     1
7.137585549949118     1
Name: count, Length: 4501, dtype: int64

Column: pressure
pressure
unknown               13
error                 10
badval                 9
1001.5575671939629     1
1012.3993993872118     1
                      ..
1019.2609543313911     1
1000.5589830071985     1
1011.1038770353858     1
1005.4653107849828     1
1021.20394214795

In [5]:
# Persist the feature scaler.
#
# `scaler` is the StandardScaler fitted on X_train in the training cell, i.e.
# the object whose output the models were trained on. Pickling that object
# directly (rather than fitting a second StandardScaler in this cell) means
# the saved scaler and the saved model come from the same fit by construction,
# even if X_train is modified between cells.
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

In [6]:
# Capture the feature column names, in order, of the encoded training matrix.
# NOTE(review): presumably used at inference time to align new data to the
# training layout — confirm against the prediction code.
required_columns = X_train.columns.tolist()

# Save the column list; `pickle` is already imported with the other
# libraries at the top of the notebook, so it is not re-imported here.
with open("columns.pkl", "wb") as f:
    pickle.dump(required_columns, f)
