In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# Read the raw data points from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='data_points.csv')

# Preview the first five rows
data.head(n=5)



from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd

# Load your dataset
#data = pd.read_csv('your_dataset.csv')

# Parse the 'Datetime' column once; the calendar features below are all
# derived from this parsed series.
timestamps = pd.to_datetime(data['Datetime'])

# In one chain: drop the raw 'Datetime' column, append year/month/day/hour
# features, then one-hot encode 'building_type'.
data = pd.get_dummies(
    data.drop(columns='Datetime').assign(
        year=timestamps.dt.year,
        month=timestamps.dt.month,
        day=timestamps.dt.day,
        hour=timestamps.dt.hour,
    ),
    columns=['building_type'],
)

data.head()

# Show the resulting column dtypes
print(data.dtypes)




# Columns that should be numeric but may contain unparseable values
numeric_cols = ['square_feet', 'year_built']

# Convert 'square_feet' and 'year_built' to numeric, coercing errors into NaN
for col in numeric_cols:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Check for NaN values in these columns
print("NaN counts before filling:")
print(data[numeric_cols].isna().sum())

# Fill NaNs with the median of their respective columns.
# The filled column is assigned back rather than calling
# `data[col].fillna(..., inplace=True)`: that form mutates an intermediate
# selection, which pandas warns about and which does not update `data`
# when Copy-on-Write is enabled.
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test split performed later in the notebook - confirm that is acceptable.
for col in numeric_cols:
    data[col] = data[col].fillna(data[col].median())

# Confirm that NaN values have been addressed
print("\nNaN counts after filling:")
print(data[numeric_cols].isna().sum())

# Re-check data types to ensure conversion was successful
print("\nData types after conversion:")
print(data.dtypes)






from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

# Separate the regression target ('kwh') from the remaining feature columns
y = data['kwh']
X = data.drop(columns=['kwh'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build a 100-tree random forest regressor and fit it on the training rows
# (fit() returns the fitted estimator itself)
clf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the target for the held-out rows
predictions = clf.predict(X_test)

# Score the predictions: squared error (and its root), absolute error, R^2
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'MAE: {mae:.2f}')
print(f'R^2: {r2:.2f}')




# Baseline model: always predict the mean of the training target
mean_kwh = y_train.mean()

# One copy of that mean per test row
baseline_predictions = [mean_kwh for _ in range(len(y_test))]

# Score the baseline with the same metrics used for the random forest
baseline_mse = mean_squared_error(y_test, baseline_predictions)
baseline_rmse = np.sqrt(baseline_mse)
baseline_mae = mean_absolute_error(y_test, baseline_predictions)
baseline_r2 = r2_score(y_test, baseline_predictions)

print(f'Baseline MAE: {baseline_mae:.2f}')
print(f'Baseline MSE: {baseline_mse:.2f}')
print(f'Baseline RMSE: {baseline_rmse:.2f}')
print(f'Baseline R²: {baseline_r2:.2f}')




from sklearn.model_selection import cross_val_score, cross_validate, KFold

# Setup cross-validation (5-fold, shuffled with a fixed seed)
cv = KFold(n_splits=5, random_state=42, shuffle=True)

# Evaluate R² and MAE in a single cross-validation pass. Calling
# cross_val_score once per metric would refit the forest on every fold
# twice; cross_validate fits each fold once and scores it with both metrics.
cv_results = cross_validate(
    clf, X, y, cv=cv, scoring=['r2', 'neg_mean_absolute_error']
)
cv_r2_scores = cv_results['test_r2']
cv_mae_scores = cv_results['test_neg_mean_absolute_error']

# Calculate the average scores and convert MAE scores to positive
average_cv_r2 = np.mean(cv_r2_scores)
average_cv_mae = -np.mean(cv_mae_scores)  # 'neg_mean_absolute_error' yields negated MAE values

print(f'Average CV R²: {average_cv_r2:.2f}')
print(f'Average CV MAE: {average_cv_mae:.2f}')




import matplotlib.pyplot as plt
import seaborn as sns

# Ensure that your Jupyter notebook can display plots directly
%matplotlib inline

# Actual vs. predicted target values on the test set
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_test, y=predictions, ax=ax)
ax.set_title('Actual vs. Predicted Values')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')

# Dashed diagonal across the range of actual values: points on this line
# are predictions that match the actual value exactly
lowest, highest = y_test.min(), y_test.max()
ax.plot([lowest, highest], [lowest, highest], 'k--', lw=4)
plt.show()







# Residual = actual minus predicted, per test row
residuals = y_test - predictions

# Residuals plotted against the predicted values
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=predictions, y=residuals, ax=ax)
ax.set_title('Residuals of Predictions')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')

# Dashed reference line at zero residual (a perfect prediction)
ax.axhline(y=0, color='k', linestyle='--')
plt.show()






from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Use the first fitted tree of the forest as the example to draw
tree = clf.estimators_[0]

# plot_tree is given a plain list of column names rather than the pandas Index
feature_names = list(X.columns)

# Draw the tree on a 20x10 inch figure, colouring the nodes
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(tree, feature_names=feature_names, filled=True, ax=ax)
plt.show()






import matplotlib.pyplot as plt
import pandas as pd

# Data from the provided table.
# Kept under its own name: binding this dict to `data` would overwrite the
# `data` DataFrame that the loading/modelling cells read, so those cells
# could no longer be re-run after this one.
model_metrics = {
    'Metric': ['MSE', 'MAE', 'RMSE', 'R2 Score'],
    'Multilinear Regression': [41.6558, 5.0239, 6.4541, 0.0049],
    'LSTM Recurrent Neural Network': [3.8912, 1.2759, 1.9726, None],  # None for N/A value
    'Random Forest': [4.88, 1.56, 2.21, 0.88]
}

# Convert to DataFrame, indexed by metric name for plotting
df = pd.DataFrame(model_metrics).set_index('Metric')

# Replace the N/A entry with 0 for plotting purposes.
# NOTE(review): the 0 is only a placeholder - the LSTM R2 value is missing,
# not zero, so the chart should not be read as an R2 of 0.
df = df.fillna(0)

# Plot area graph (one translucent, unstacked area per model)
ax = df.plot(kind='area', stacked=False, figsize=(12, 8), alpha=0.3)

# Set titles and labels
plt.title('Model Comparison')
plt.xlabel('Metrics')
plt.ylabel('Values')
plt.xticks(rotation=45)
plt.grid(True)

# Move the legend outside of the plot
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

# Show plot
plt.tight_layout()
plt.show()



  data = pd.read_csv('data_points.csv')


kwh                                            float64
air_temp                                       float64
dew_temp                                       float64
square_feet                                     object
year_built                                      object
year                                             int64
month                                            int64
day                                              int64
hour                                             int64
building_type_E+I806+G2:H+G2:G1450               uint8
building_type_Education                          uint8
building_type_Entertainment/public assembly      uint8
building_type_Food sales and service             uint8
building_type_Healthcare                         uint8
building_type_Lodging/residential                uint8
building_type_Manufacturing/industrial           uint8
building_type_Office                             uint8
building_type_Other                              uint8
building_t

In [None]:
from sklearn.tree import export_text

# Column names as a plain list, used to label the split conditions
feature_names = list(X.columns)

# First fitted tree of the random forest
tree = clf.estimators_[0]

# Render the tree's decision rules as indented text and print them
tree_text = export_text(decision_tree=tree, feature_names=feature_names)
print(tree_text)


In [None]:
pip install pydotplus


In [None]:
from sklearn.tree import export_graphviz

# Select one of the trees from the random forest model
tree = clf.estimators_[0]

# Convert feature names from Index to list, as the other tree-export cells
# in this notebook do, so every exporter receives the same plain list of names.
feature_names = X.columns.tolist()

# Export as dot file (Graphviz source), with filled/coloured nodes
dot_file = 'tree.dot'
export_graphviz(tree, out_file=dot_file, feature_names=feature_names, filled=True)

print(f"Exported the decision tree to a dot file: {dot_file}")



# Optional: render the exported file to PNG from a shell with
#   dot -Tpng tree.dot -o tree.png




import graphviz

# Load the Graphviz source text that was written to `dot_file`
with open(dot_file, 'r') as dot_handle:
    dot_graph = dot_handle.read()

# Wrap the source in a graphviz.Source object; as the last expression of the
# cell it is what the notebook displays
graphviz.Source(dot_graph)
