## Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np

# Reading the training data and cleaning it

In [None]:
# Load the training split of the solar-panel dataset and preview its first rows.
train_csv_path = r"/kaggle/input/solar-panel-degradation/dataset/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Summarise column dtypes and non-null counts to see which columns need cleaning.
df.info()

In [None]:
# These columns are coerced to numbers; values that cannot be parsed become NaN
# (errors='coerce') so they can be imputed in a later step.
for numeric_col in ("humidity", "wind_speed", "pressure"):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')


In [None]:
from sklearn.preprocessing import LabelEncoder

# Impute missing installation types with the most frequent category.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is deprecated chained assignment and leaves `df`
# unchanged when pandas copy-on-write is enabled.
mode_value = df['installation_type'].mode()[0]
df['installation_type'] = df['installation_type'].fillna(mode_value)

# Replace the category labels with integer codes so the column is numeric.
le = LabelEncoder()
df['installation_type'] = le.fit_transform(df['installation_type'])

In [None]:
# Numeric columns whose missing values are replaced by the column mean.
columns_to_fill = ['irradiance','wind_speed',"pressure","humidity", 'soiling_ratio', 'voltage', 'current',"maintenance_count","temperature","cloud_coverage","module_temperature","panel_age"]

# Assign the filled Series back instead of using inplace=True on a column
# selection, which is deprecated and does not update `df` under copy-on-write.
for col in columns_to_fill:
    df[col] = df[col].fillna(df[col].mean())

In [None]:
# Replace out-of-range readings with the column median.
# The medians are computed up front as scalars: the original passed the bound
# method `.median` (no parentheses) for irradiance and humidity instead of
# the median value itself.
irradiance_median = df['irradiance'].median()
humidity_median = df['humidity'].median()
soiling_median = df['soiling_ratio'].median()

# Keep irradiance values inside [0, 1500]; everything else becomes the median.
df['irradiance'] = df['irradiance'].where(
    (df['irradiance'] >= 0) & (df['irradiance'] <= 1500), irradiance_median
)
# NOTE(review): this keeps only humidity readings >= 70 and replaces the rest —
# confirm that this lower bound is the intended rule.
df['humidity'] = df['humidity'].where(df['humidity'] >= 70, humidity_median)
# Keep soiling ratios up to 70; larger values become the median.
df['soiling_ratio'] = df['soiling_ratio'].where(df['soiling_ratio'] <= 70, soiling_median)

# Choosing relevant features using mutual information (MI), correlation, and VIF

In [None]:
# Identifier-like columns are removed before computing pairwise correlations.
identifier_columns = ["string_id", "error_code", "id"]
df = df.drop(columns=identifier_columns)

correlation_matrix = df.corr()
print("correlation_matrix", correlation_matrix)

In [None]:
import pandas as pd
from sklearn.feature_selection import mutual_info_regression  # for regression
from sklearn.preprocessing import LabelEncoder

# Separate features and target
X = df.drop('efficiency', axis=1)
y = df['efficiency']

# Compute mutual information between every feature and the target.
# The estimator is randomised, so a fixed random_state is passed to make the
# scores (and the resulting feature ranking) reproducible between runs.
mi_scores = mutual_info_regression(X, y, random_state=0)

# Tabulate the scores, highest first, for readability
mi_df = pd.DataFrame({'Feature': X.columns, 'MI Score': mi_scores})
mi_df = mi_df.sort_values(by='MI Score', ascending=False)

print(mi_df)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import pandas as pd

# Variance inflation factor for each imputed numeric column. A constant
# column is added first, so the table also contains a row for it.
X = add_constant(df[columns_to_fill])
vif = pd.DataFrame()
vif["feature"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Display the table; the original computed it but never showed it.
vif


In [None]:
# Features removed before modelling, following the selection analysis above.
features_to_drop = [
    "installation_type",
    "temperature",
    "pressure",
    "maintenance_count",
    "module_temperature",
    "wind_speed",
    "cloud_coverage",
]
df = df.drop(columns=features_to_drop)

# Training different ML models
*The best-performing approach was the ensemble technique (voting regressor).*

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

# Encode categorical features (if any remain)
df = pd.get_dummies(df)

# Separate features and target
Z = df.drop("efficiency", axis=1)
r = df["efficiency"]

# Split BEFORE scaling so the scaler statistics come from the training rows
# only; fitting the scaler on the full data leaks held-out information.
Z_train, Z_test, y_train, y_test = train_test_split(Z, r, test_size=0.3, random_state=0)

scaler = StandardScaler()
X_train = scaler.fit_transform(Z_train)
X_test = scaler.transform(Z_test)
# Kept so that any cell expecting the fully scaled feature matrix still works.
Z_scaled = scaler.transform(Z)

# Train the XGBoost regressor with default hyper-parameters
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

# Evaluate on the held-out split. RMSE is computed as sqrt(MSE) because the
# `squared=False` argument of mean_squared_error is deprecated and removed in
# recent scikit-learn releases.
y_pred = model.predict(X_test)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with a fixed seed for reproducible results.
model = RandomForestRegressor(random_state=4,n_estimators=1200)
model.fit(X_train, y_train)

# Evaluate on the held-out split. sqrt(MSE) is used instead of the
# deprecated/removed `squared=False` argument of mean_squared_error.
y_pred = model.predict(X_test)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
from sklearn.svm import SVR  # Support Vector Regressor

# RBF-kernel support vector regressor.
model1 = SVR(kernel='rbf',gamma='auto',C=0.15)
model1.fit(X_train, y_train)

# sqrt(MSE) replaces the deprecated/removed `squared=False` argument.
y_pred = model1.predict(X_test)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


In [None]:
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting regressor: 1200 depth-3 trees with a 0.01 learning rate.
gbr_params = dict(
    n_estimators=1200,
    learning_rate=0.01,
    max_depth=3,
    random_state=50,
    loss='squared_error',
)
est = GradientBoostingRegressor(**gbr_params).fit(X_train, y_train)
y_pred = est.predict(X_test)

# Score is reported as 100 * (1 - RMSE) on the held-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
Score = 100 * (1 - rmse)
print("score:", Score)


In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score

# CatBoost regressor with logging silenced (verbose=0) and a fixed seed.
model = CatBoostRegressor(verbose=0, iterations=1000, learning_rate=0.01, depth=7, random_state=200)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# sqrt(MSE) replaces the deprecated/removed `squared=False` argument.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R² Score:", r2_score(y_test, y_pred))


In [None]:
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

# Base learners combined by the voting ensemble.
reg1 = GradientBoostingRegressor(random_state=10,n_estimators=1200, learning_rate=0.01, max_depth=3, loss='squared_error')
reg2 = RandomForestRegressor(random_state=1,n_estimators=1200)
reg3 = CatBoostRegressor(verbose=0, iterations=1000, learning_rate=0.01, depth=7, random_state=2)
reg4 = SVR(kernel='rbf',gamma='auto',C=0.15)

# VotingRegressor averages the predictions of its fitted base estimators.
ereg = VotingRegressor(estimators=[('gb', reg1), ('rf', reg2),('cb',reg3),('sv',reg4)])
ereg = ereg.fit(X_train, y_train)
y_pred = ereg.predict(X_test)

# Compute RMSE once as sqrt(MSE); the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)
Score = 100*(1-rmse)
print("score:", Score)

In [None]:
class ELMRegressor:
    """Extreme Learning Machine regressor with a single random hidden layer.

    The input-to-hidden weights and biases are drawn at random and never
    trained; only the hidden-to-output weights are solved, in closed form,
    with the Moore-Penrose pseudoinverse.

    Parameters
    ----------
    n_hidden : int
        Number of hidden units.
    activation : {'sigmoid', 'tanh', 'relu'}
        Non-linearity applied to the hidden layer.
    random_state : int or None
        Seed for the random hidden layer. ``None`` draws from NumPy's global
        random state, as the original implementation always did.
    """

    def __init__(self, n_hidden=100, activation='sigmoid',random_state=None):
        self.n_hidden = n_hidden
        self.activation = activation
        # Previously accepted but silently discarded; now honoured in fit().
        self.random_state = random_state

    def _activate(self, X):
        """Apply the configured activation element-wise.

        Raises
        ------
        ValueError
            If ``self.activation`` is not one of the supported names.
        """
        if self.activation == 'sigmoid':
            return 1 / (1 + np.exp(-X))
        elif self.activation == 'tanh':
            return np.tanh(X)
        elif self.activation == 'relu':
            return np.maximum(0, X)
        else:
            raise ValueError("Unsupported activation")

    def fit(self, X, y):
        """Draw the random hidden layer and solve for the output weights.

        Returns ``self`` so calls can be chained (the original returned None).
        """
        n_samples, n_features = X.shape

        # Use a dedicated generator when a seed was given so that repeated
        # fits are reproducible; otherwise fall back to the global state.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        # Random weights and biases
        self.input_weights = rng.randn(n_features, self.n_hidden)
        self.biases = rng.randn(self.n_hidden)

        # Hidden layer output
        H = self._activate(np.dot(X, self.input_weights) + self.biases)

        # Moore-Penrose pseudoinverse to solve output weights
        self.output_weights = np.dot(np.linalg.pinv(H), y)
        return self

    def predict(self, X):
        """Return predictions for ``X`` using the fitted hidden and output weights."""
        H = self._activate(np.dot(X, self.input_weights) + self.biases)
        return np.dot(H, self.output_weights)


In [None]:
# Fit the ELM; the seed is passed so the random hidden layer can be reproduced.
elm = ELMRegressor(n_hidden=500, activation='sigmoid', random_state=0)
elm.fit(X_train, y_train)

# Predict
y_pred_scaled = elm.predict(X_test)

# Evaluate the ELM's own predictions. The original scored `y_pred`, which
# still held the predictions of the previously evaluated model.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_scaled)))
print("R² Score:", r2_score(y_test, y_pred_scaled))


# Reading test data for final prediction

In [None]:
# Load the test split that the final predictions are made for.
test_csv_path = "/kaggle/input/solar-panel-degradation/dataset/test.csv"
df_test = pd.read_csv(test_csv_path)

In [None]:
# Same numeric coercion as for the training frame: unparseable values become NaN.
for numeric_col in ("humidity", "wind_speed", "pressure"):
    df_test[numeric_col] = pd.to_numeric(df_test[numeric_col], errors='coerce')

In [None]:
# Remove identifier columns and the features that are not used by the models.
unused_test_columns = [
    "string_id",
    'cloud_coverage',
    "module_temperature",
    "maintenance_count",
    "wind_speed",
    "error_code",
    "id",
    "temperature",
    "pressure",
    "installation_type",
]
df_test = df_test.drop(columns=unused_test_columns)

In [None]:
# Remaining feature columns whose missing values are replaced by the column median.
columns_to_fill = ['irradiance',"humidity", 'soiling_ratio', 'voltage', 'current',"panel_age"]

# Assign the filled Series back instead of using inplace=True on a column
# selection, which is deprecated and does not update `df_test` under copy-on-write.
for col in columns_to_fill:
    df_test[col] = df_test[col].fillna(df_test[col].median())

In [None]:
# Replace out-of-range test readings with the column median.
# Keep irradiance values inside [0, 1500]; everything else becomes the median.
df_test['irradiance'] = df_test['irradiance'].where((df_test['irradiance'] >= 0) & (df_test['irradiance'] <= 1500), df_test['irradiance'].median())
# NOTE(review): this keeps only humidity readings >= 70 and replaces the rest —
# confirm that this lower bound is the intended rule.
df_test['humidity'] = df_test['humidity'].where((df_test['humidity'] >= 70) , df_test['humidity'].median())
# The original assigned this result to the training frame `df`, which left the
# test column untouched and overwrote training data; it now targets `df_test`.
df_test['soiling_ratio'] = df_test['soiling_ratio'].where((df_test['soiling_ratio'] <= 70) , df_test['soiling_ratio'].median())

In [None]:
# Re-read the raw test file: `id` was dropped from df_test earlier, and the
# submission needs it back.
df_test_1=pd.read_csv("/kaggle/input/solar-panel-degradation/dataset/test.csv")

In [None]:
# Scale the test features with the scaler that was fitted during training.
# Calling fit_transform here would refit the scaler on the test data, so the
# model would see features on a different scale from the one it was trained on.
# Selecting Z.columns keeps the feature set and order identical to training
# and makes the cell safe to re-run after extra columns are added to df_test.
X_scaled1 = scaler.transform(df_test[Z.columns])
y_test_pred1 =ereg.predict(X_scaled1)

In [None]:
# Attach the predicted efficiency and the original ids (taken from the
# re-read raw test file) as new columns.
df_test = df_test.assign(efficiency=y_test_pred1, id=df_test_1["id"])

In [None]:
# Keep only the two columns that go into the submission file.
submission_columns = ["id", "efficiency"]
z = df_test.loc[:, submission_columns]

In [None]:
# Write the submission file without the DataFrame index column.
z.to_csv("submission1.csv", index=False)

**Note:** All the models performed comparably, but the ensemble built with the voting regressor showed the best results.