# WattSage: Energy Oracle Pro

## Here we import the libraries and load our data set.


In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import seaborn as sns

# Read the CSV file into the `data` DataFrame.
DATA_PATH = '/Users/vanessa/Documents/CCPP_data.csv'
data = pd.read_csv(DATA_PATH)


### Here we split the data and train the Linear Regression and Random Forest Regression models.

In [None]:

# Separate the predictors from the target: 'PE' is the column being predicted,
# and every other column in `data` is used as a feature.
X = data.drop(columns=['PE'])
y = data['PE']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def _regression_errors(y_true, y_pred):
    """Return ``(mae, rmse)`` for the given true and predicted target values.

    RMSE is computed as ``np.sqrt(MSE)`` rather than by passing
    ``squared=False`` to ``mean_squared_error``: that keyword is deprecated
    since scikit-learn 1.4 and rejected by later releases, whereas the
    explicit square root works on every version. Using one helper also
    guarantees both models are scored in exactly the same way.
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return mae, rmse


# --- Linear Regression -------------------------------------------------------
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

# Predict on the held-out rows and score the predictions
y_pred_lr = model_lr.predict(X_test)
mae_lr, rmse_lr = _regression_errors(y_test, y_pred_lr)

print("Linear Regression:")
print(f"Mean Absolute Error: {mae_lr:.2f}")
print(f"Root Mean Squared Error: {rmse_lr:.2f}")

# --- Random Forest Regression ------------------------------------------------
# 100 trees; the fixed seed makes the fitted forest reproducible between runs.
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

# Predict on the same held-out rows so the two models are directly comparable
y_pred_rf = model_rf.predict(X_test)
mae_rf, rmse_rf = _regression_errors(y_test, y_pred_rf)

print("\nRandom Forest Regression:")
print(f"Mean Absolute Error: {mae_rf:.2f}")
print(f"Root Mean Squared Error: {rmse_rf:.2f}")



### Here we analyze the data graphically.

In [None]:
# Data exploration and visualization

# Pairplot: one regression panel per feature, each plotted against the target 'PE'.
# sns.pairplot builds its own multi-panel figure and returns the grid object.
# The title is set on the figure itself (suptitle) because plt.title would only
# label the current axes, i.e. the last panel, instead of the whole plot.
pair_grid = sns.pairplot(data, x_vars=['AT', 'V', 'AP', 'RH'], y_vars='PE', kind='reg')
# y slightly above 1 keeps the figure title from overlapping the panels
pair_grid.figure.suptitle('Pairplot of Features vs. Electrical Energy Output', y=1.02)
plt.show()

# Heatmap of the pairwise correlations between the four features and the target
correlation_matrix = data[['AT', 'V', 'AP', 'RH', 'PE']].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Histogram (20 bins, with a KDE curve overlaid) of the target variable
plt.figure(figsize=(8, 6))
sns.histplot(data['PE'], bins=20, kde=True)
plt.title('Distribution of Electrical Energy Output (PE)')
plt.xlabel('Electrical Energy Output (PE)')
plt.ylabel('Frequency')
plt.show()

# Scatter plot of a single feature ('AT') against the target; alpha < 1 makes
# overlapping points easier to see
plt.figure(figsize=(8, 6))
sns.scatterplot(x='AT', y='PE', data=data, alpha=0.7)
plt.title('Scatter Plot: Temperature vs. Electrical Energy Output')
plt.xlabel('Temperature')
plt.ylabel('Electrical Energy Output (PE)')
plt.show()
