<a href="https://colab.research.google.com/github/vijaykmr18/CodeAlpha/blob/main/SolarEnergy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [None]:

# Load the dataset into a DataFrame.
# NOTE: this cell does not mount Google Drive; the CSV is read straight from the
# path below, so make sure the file is present there (or edit the path).
file_path = '/content/solarpowergeneration.csv'  # Update the path to your file
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Print a heading followed by the first rows returned by DataFrame.head().
print("Dataset Preview:", data.head(), sep="\n")

In [None]:

# Count the missing entries in every column and report them.
missing_per_column = data.isna().sum()
print("\nMissing Values:", missing_per_column, sep="\n")

# Keep only the rows that have a value in every column.
data = data.dropna(axis=0, how="any")

In [None]:
# Target variable (y): the generated power column.
y = data['generated_power_kw']
# Features (X): every other column of the dataset.
X = data.drop('generated_power_kw', axis=1)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create the (not yet fitted) Linear Regression model.
model = LinearRegression()

In [None]:
# Fit on the training split, then predict the held-out test split.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Score the predictions against the true test targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    "\nModel Evaluation:",
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R2): {r2}",
    sep="\n",
)

# Scatter of actual vs predicted values; the dashed red diagonal spans the
# target's full range and marks where prediction equals the actual value.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.7)
power_range = [y.min(), y.max()]
ax.plot(power_range, power_range, 'r--')  # Diagonal line
ax.set_xlabel('Actual Generated Power (kW)')
ax.set_ylabel('Predicted Generated Power (kW)')
ax.set_title('Actual vs Predicted Solar Power Output')
plt.show()





In [None]:
# Pair every feature column with its fitted linear-regression coefficient.
coefficient_table = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_,
})
# Sorted by signed value, so large negative coefficients end up at the bottom.
feature_importance = coefficient_table.sort_values(by='Coefficient', ascending=False)

print("\nFeature Importance (Coefficients):", feature_importance, sep="\n")

In [None]:
# Save the trained model (optional)
# Preferred location is Google Drive, but the visible cells of this notebook never
# mount Drive, in which case that folder does not exist and joblib.dump would raise
# FileNotFoundError. Fall back to the Colab session folder so a fresh
# "Restart & Run All" still completes.
model_path = Path('/content/drive/MyDrive/solar_power_linear_regression_model.pkl')
if not model_path.parent.is_dir():
    model_path = Path('/content') / model_path.name
joblib.dump(model, str(model_path))

In [None]:
# Feature subset used for the "essential" model.
essential_features = [
    'shortwave_radiation_backwards_sfc',
    'angle_of_incidence',
    'zenith',
    'azimuth',
]

# Target is unchanged; the feature matrix is restricted to the subset above.
y = data['generated_power_kw']
X_essential = data[essential_features]

# 80/20 train/test split with the same seed as the other models.
(X_train_essential, X_test_essential,
 y_train_essential, y_test_essential) = train_test_split(
    X_essential, y, test_size=0.2, random_state=42
)

# Fit a fresh Linear Regression and predict the test split.
model_essential = LinearRegression().fit(X_train_essential, y_train_essential)
y_pred_essential = model_essential.predict(X_test_essential)

# Score the predictions against the true test targets.
mse_essential = mean_squared_error(y_true=y_test_essential, y_pred=y_pred_essential)
r2_essential = r2_score(y_true=y_test_essential, y_pred=y_pred_essential)

print(
    "\nModel Evaluation with Essential Features:",
    f"Mean Squared Error (MSE): {mse_essential}",
    f"R-squared (R2): {r2_essential}",
    sep="\n",
)



In [None]:
# Actual vs predicted scatter for the essential-features model; the dashed red
# diagonal spans the target's full range and marks perfect predictions.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test_essential, y_pred_essential, alpha=0.7, color='green')
power_range = [y.min(), y.max()]
ax.plot(power_range, power_range, 'r--')  # Diagonal line
ax.set_xlabel('Actual Generated Power (kW)')
ax.set_ylabel('Predicted Generated Power (kW)')
ax.set_title('Actual vs Predicted Solar Power Output (Essential Features)')
plt.show()



In [None]:
# Pair each essential feature with its fitted coefficient, then order the rows
# by signed coefficient value (largest first).
coefficient_table = pd.DataFrame({
    'Feature': essential_features,
    'Coefficient': model_essential.coef_,
})
feature_importance_essential = coefficient_table.sort_values(
    by='Coefficient', ascending=False
)

print(
    "\nFeature Importance (Coefficients) for Essential Features:",
    feature_importance_essential,
    sep="\n",
)



In [None]:
# Side-by-side metrics: model label -> (MSE, R2), in display order.
scores = {
    'Full-Feature Model': (mse, r2),
    'Essential-Feature Model': (mse_essential, r2_essential),
}
comparison = pd.DataFrame({
    'Model': list(scores),
    'MSE': [mse_value for mse_value, _ in scores.values()],
    'R2': [r2_value for _, r2_value in scores.values()],
})

print("\nComparison of Models:", comparison, sep="\n")

In [None]:
# Feature subset used for the weather-related model.
weather_features = [
    'total_cloud_cover_sfc',
    'high_cloud_cover_high_cld_lay',
    'medium_cloud_cover_mid_cld_lay',
    'low_cloud_cover_low_cld_lay',
    'total_precipitation_sfc',
    'snowfall_amount_sfc',
    'temperature_2_m_above_gnd',
]

# Target is unchanged; the feature matrix is restricted to the subset above.
y = data['generated_power_kw']
X_weather = data[weather_features]

# 80/20 train/test split with the same seed as the other models.
(X_train_weather, X_test_weather,
 y_train_weather, y_test_weather) = train_test_split(
    X_weather, y, test_size=0.2, random_state=42
)

# Fit a fresh Linear Regression and predict the test split.
model_weather = LinearRegression().fit(X_train_weather, y_train_weather)
y_pred_weather = model_weather.predict(X_test_weather)

# Score the predictions against the true test targets.
mse_weather = mean_squared_error(y_true=y_test_weather, y_pred=y_pred_weather)
r2_weather = r2_score(y_true=y_test_weather, y_pred=y_pred_weather)

print(
    "\nModel Evaluation with Weather-Related Features:",
    f"Mean Squared Error (MSE): {mse_weather}",
    f"R-squared (R2): {r2_weather}",
    sep="\n",
)

# Actual vs predicted scatter; the dashed red diagonal marks perfect predictions.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test_weather, y_pred_weather, alpha=0.7, color='orange')
power_range = [y.min(), y.max()]
ax.plot(power_range, power_range, 'r--')  # Diagonal line
ax.set_xlabel('Actual Generated Power (kW)')
ax.set_ylabel('Predicted Generated Power (kW)')
ax.set_title('Actual vs Predicted Solar Power Output (Weather-Related Features)')
plt.show()



In [None]:
# Pair each weather feature with its fitted coefficient, then order the rows
# by signed coefficient value (largest first).
coefficient_table = pd.DataFrame({
    'Feature': weather_features,
    'Coefficient': model_weather.coef_,
})
feature_importance_weather = coefficient_table.sort_values(
    by='Coefficient', ascending=False
)

print(
    "\nFeature Importance (Coefficients) for Weather-Related Features:",
    feature_importance_weather,
    sep="\n",
)



In [None]:
# Side-by-side metrics for the three models so far: label -> (MSE, R2).
scores = {
    'Full-Feature Model': (mse, r2),
    'Essential-Feature Model': (mse_essential, r2_essential),
    'Weather-Related Feature Model': (mse_weather, r2_weather),
}
comparison_all = pd.DataFrame({
    'Model': list(scores),
    'MSE': [mse_value for mse_value, _ in scores.values()],
    'R2': [r2_value for _, r2_value in scores.values()],
})

print("\nComparison of All Models:", comparison_all, sep="\n")

In [None]:
# Feature subset used for the wind-related model.
wind_features = [
    'wind_speed_10_m_above_gnd',
    'wind_speed_80_m_above_gnd',
    'wind_direction_10_m_above_gnd',
    'wind_direction_80_m_above_gnd',
    'wind_gust_10_m_above_gnd',
]

# Target is unchanged; the feature matrix is restricted to the subset above.
y = data['generated_power_kw']
X_wind = data[wind_features]

# 80/20 train/test split with the same seed as the other models.
(X_train_wind, X_test_wind,
 y_train_wind, y_test_wind) = train_test_split(
    X_wind, y, test_size=0.2, random_state=42
)

# Fit a fresh Linear Regression and predict the test split.
model_wind = LinearRegression().fit(X_train_wind, y_train_wind)
y_pred_wind = model_wind.predict(X_test_wind)

# Score the predictions against the true test targets.
mse_wind = mean_squared_error(y_true=y_test_wind, y_pred=y_pred_wind)
r2_wind = r2_score(y_true=y_test_wind, y_pred=y_pred_wind)

print(
    "\nModel Evaluation with Wind-Related Features:",
    f"Mean Squared Error (MSE): {mse_wind}",
    f"R-squared (R2): {r2_wind}",
    sep="\n",
)

# Actual vs predicted scatter; the dashed red diagonal marks perfect predictions.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test_wind, y_pred_wind, alpha=0.7, color='purple')
power_range = [y.min(), y.max()]
ax.plot(power_range, power_range, 'r--')  # Diagonal line
ax.set_xlabel('Actual Generated Power (kW)')
ax.set_ylabel('Predicted Generated Power (kW)')
ax.set_title('Actual vs Predicted Solar Power Output (Wind-Related Features)')
plt.show()



In [None]:
# Pair each wind feature with its fitted coefficient, then order the rows
# by signed coefficient value (largest first).
coefficient_table = pd.DataFrame({
    'Feature': wind_features,
    'Coefficient': model_wind.coef_,
})
feature_importance_wind = coefficient_table.sort_values(
    by='Coefficient', ascending=False
)

print(
    "\nFeature Importance (Coefficients) for Wind-Related Features:",
    feature_importance_wind,
    sep="\n",
)



In [None]:
# Side-by-side metrics for the four models so far: label -> (MSE, R2).
scores = {
    'Full-Feature Model': (mse, r2),
    'Essential-Feature Model': (mse_essential, r2_essential),
    'Weather-Related Feature Model': (mse_weather, r2_weather),
    'Wind-Related Feature Model': (mse_wind, r2_wind),
}
comparison_all = pd.DataFrame({
    'Model': list(scores),
    'MSE': [mse_value for mse_value, _ in scores.values()],
    'R2': [r2_value for _, r2_value in scores.values()],
})

print("\nComparison of All Models:", comparison_all, sep="\n")

In [None]:
# Feature subset used for the atmospheric pressure & humidity model.
pressure_humidity_features = [
    'relative_humidity_2_m_above_gnd',
    'mean_sea_level_pressure_MSL',
]

# Target is unchanged; the feature matrix is restricted to the subset above.
y = data['generated_power_kw']
X_pressure_humidity = data[pressure_humidity_features]

# 80/20 train/test split with the same seed as the other models.
X_train_ph, X_test_ph, y_train_ph, y_test_ph = train_test_split(
    X_pressure_humidity, y, test_size=0.2, random_state=42
)

# Fit a fresh Linear Regression and predict the test split.
model_ph = LinearRegression().fit(X_train_ph, y_train_ph)
y_pred_ph = model_ph.predict(X_test_ph)

# Score the predictions against the true test targets.
mse_ph = mean_squared_error(y_true=y_test_ph, y_pred=y_pred_ph)
r2_ph = r2_score(y_true=y_test_ph, y_pred=y_pred_ph)

print(
    "\nModel Evaluation with Atmospheric Pressure & Humidity Features:",
    f"Mean Squared Error (MSE): {mse_ph}",
    f"R-squared (R2): {r2_ph}",
    sep="\n",
)

# Actual vs predicted scatter; the dashed red diagonal marks perfect predictions.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test_ph, y_pred_ph, alpha=0.7, color='brown')
power_range = [y.min(), y.max()]
ax.plot(power_range, power_range, 'r--')  # Diagonal line
ax.set_xlabel('Actual Generated Power (kW)')
ax.set_ylabel('Predicted Generated Power (kW)')
ax.set_title('Actual vs Predicted Solar Power Output (Atmospheric Pressure & Humidity Features)')
plt.show()



In [None]:
# Pair each pressure/humidity feature with its fitted coefficient, then order
# the rows by signed coefficient value (largest first).
coefficient_table = pd.DataFrame({
    'Feature': pressure_humidity_features,
    'Coefficient': model_ph.coef_,
})
feature_importance_ph = coefficient_table.sort_values(
    by='Coefficient', ascending=False
)

print(
    "\nFeature Importance (Coefficients) for Atmospheric Pressure & Humidity Features:",
    feature_importance_ph,
    sep="\n",
)



In [None]:
# Side-by-side metrics for all five models: label -> (MSE, R2), in display order.
scores = {
    'Full-Feature Model': (mse, r2),
    'Essential-Feature Model': (mse_essential, r2_essential),
    'Weather-Related Feature Model': (mse_weather, r2_weather),
    'Wind-Related Feature Model': (mse_wind, r2_wind),
    'Atmospheric Pressure & Humidity Feature Model': (mse_ph, r2_ph),
}
comparison_all = pd.DataFrame({
    'Model': list(scores),
    'MSE': [mse_value for mse_value, _ in scores.values()],
    'R2': [r2_value for _, r2_value in scores.values()],
})

print("\nComparison of All Models:", comparison_all, sep="\n")

In [None]:
# Linear regression on the recommended 12-column feature set.
# (The RFE / RandomForestRegressor imports that used to sit here were not used by
# this cell; they live in the import cell at the top of the notebook.)

# Define the recommended feature set
features = [
    'shortwave_radiation_backwards_sfc', 'angle_of_incidence', 'zenith', 'azimuth',
    'total_cloud_cover_sfc', 'low_cloud_cover_low_cld_lay', 'medium_cloud_cover_mid_cld_lay',
    'high_cloud_cover_high_cld_lay', 'temperature_2_m_above_gnd', 'total_precipitation_sfc',
    'wind_speed_10_m_above_gnd', 'wind_speed_80_m_above_gnd'
]
target = 'generated_power_kw'

# Select features and target variable.
# NOTE: this rebinds X, y and the train/test split names used by earlier cells.
X = data[features]
y = data[target]

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model with all recommended features
model_full = LinearRegression()
model_full.fit(X_train, y_train)

# Make predictions on the test data
y_pred_full = model_full.predict(X_test)

# Evaluate the model
mse_full = mean_squared_error(y_test, y_pred_full)
r2_full = r2_score(y_test, y_pred_full)

print("\nModel Evaluation with Full Recommended Feature Set:")
print(f"Mean Squared Error (MSE): {mse_full}")
print(f"R-squared (R2): {r2_full}")



In [None]:


# Fit a linear regression on the recommended feature set and compare the first
# few actual/predicted test values in a grouped bar chart.
# NOTE: this cell rebinds `model`, `y_pred`, `mse` and `r2`, which earlier cells
# used for the all-columns baseline; re-running those earlier comparison cells
# afterwards would report this model's metrics instead.

# Define features and target variable
features = [
    'shortwave_radiation_backwards_sfc', 'angle_of_incidence', 'zenith', 'azimuth',
    'total_cloud_cover_sfc', 'low_cloud_cover_low_cld_lay', 'medium_cloud_cover_mid_cld_lay',
    'high_cloud_cover_high_cld_lay', 'temperature_2_m_above_gnd', 'total_precipitation_sfc',
    'wind_speed_10_m_above_gnd', 'wind_speed_80_m_above_gnd'
]
target = 'generated_power_kw'

X = data[features]
y = data[target]

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

# Visualize Actual vs Predicted Solar Power Output with a Bar Chart
# Show at most the first 10 test samples. Capping at len(y_test) keeps the bar
# positions and bar heights the same length when the test split is smaller.
subset_size = min(10, len(y_test))
y_test_subset = y_test.iloc[:subset_size]  # explicit positional slice
y_pred_subset = y_pred[:subset_size]

# Create a bar chart
x_labels = [f'Sample {i+1}' for i in range(subset_size)]
x = np.arange(len(x_labels))  # X-axis positions
width = 0.35  # Width of the bars

plt.figure(figsize=(12, 6))
# Actual and predicted bars sit side by side, half a bar width either side of each tick.
plt.bar(x - width/2, y_test_subset, width, label='Actual Power (kW)', color='blue', alpha=0.7)
plt.bar(x + width/2, y_pred_subset, width, label='Predicted Power (kW)', color='orange', alpha=0.7)

# Add labels, title, and legend
plt.xlabel('Samples')
plt.ylabel('Solar Power Output (kW)')
plt.title('Actual vs Predicted Solar Power Output')
plt.xticks(x, x_labels)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Show the plot
plt.tight_layout()
plt.show()



In [None]:
# Tabulate the plotted subset: sample label, actual value, predicted value.
# The Series of actual values supplies the table's row index.
subset_columns = {}
subset_columns['Sample'] = x_labels
subset_columns['Actual Power (kW)'] = y_test_subset
subset_columns['Predicted Power (kW)'] = y_pred_subset
comparison_df = pd.DataFrame(subset_columns)

print("\nComparison of Actual vs Predicted Solar Power Output:", comparison_df, sep="\n")

In [None]:
# Pair each recommended feature with the coefficient fitted by model_full, then
# order the rows by signed coefficient value (largest first).
coefficient_table = pd.DataFrame({
    'Feature': features,
    'Coefficient': model_full.coef_,
})
feature_importance_full = coefficient_table.sort_values(
    by='Coefficient', ascending=False
)

print(
    "\nFeature Importance (Coefficients) for Full Recommended Feature Set:",
    feature_importance_full,
    sep="\n",
)


In [None]:

# Correlation Matrix Analysis
# Pairwise correlations among the recommended features and the target column,
# drawn as an annotated heatmap (values shown to two decimals).
correlation_matrix = data[[*features, target]].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Features with Target')
plt.show()



In [None]:
# Recursive Feature Elimination (RFE) for Feature Selection
# A seeded Random Forest Regressor is the estimator RFE uses to rank features;
# the selector is fitted on the training split and keeps 6 of them.
rf_model = RandomForestRegressor(random_state=42)
rfe = RFE(rf_model, n_features_to_select=6).fit(X_train, y_train)

# Names of the features RFE kept (boolean support mask applied to the name list).
selected_features = np.asarray(features)[rfe.get_support()]
print("\nSelected Features from RFE:", selected_features, sep="\n")



In [None]:
# Restrict the existing train/test feature frames to the RFE-selected columns.
X_test_rfe = X_test[selected_features]
X_train_rfe = X_train[selected_features]

# Fit a fresh Linear Regression on the reduced features and predict the test split.
model_rfe = LinearRegression().fit(X_train_rfe, y_train)
y_pred_rfe = model_rfe.predict(X_test_rfe)

# Score the RFE model against the true test targets.
mse_rfe = mean_squared_error(y_true=y_test, y_pred=y_pred_rfe)
r2_rfe = r2_score(y_true=y_test, y_pred=y_pred_rfe)

print(
    "\nModel Evaluation with RFE-Selected Features:",
    f"Mean Squared Error (MSE): {mse_rfe}",
    f"R-squared (R2): {r2_rfe}",
    sep="\n",
)


In [None]:
# Recommended feature set vs its RFE-selected subset: label -> (MSE, R2).
# This rebinds comparison_all, which earlier cells used for other model sets.
scores = {
    'Full Recommended Feature Set': (mse_full, r2_full),
    'RFE-Selected Features': (mse_rfe, r2_rfe),
}
comparison_all = pd.DataFrame({
    'Model': list(scores),
    'MSE': [mse_value for mse_value, _ in scores.values()],
    'R2': [r2_value for _, r2_value in scores.values()],
})

print("\nComparison of Models:", comparison_all, sep="\n")