<a href="https://colab.research.google.com/github/toitoi11/Electricity-Usage-Prediction-Using-Weather-Data/blob/main/regression_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [3]:
# extract data
dataset = pd.read_csv("/content/drive/MyDrive/project/regression/weather_data_final.csv")
dataset.columns = ["date","temp_mean","temp_high","temp_low","rain","humid","day","light","sunrise","elec"]

def convert_to_minutes(time_str):
    if pd.notnull(time_str):  # Only preprocess when there are no missing values
        hour, minute = map(int, str(time_str).split(":"))
        return hour * 60 + minute
    else:
        return time_str
# store transformed partial time unit data
dataset["sunrise"] = [convert_to_minutes(time_str) for time_str in dataset["sunrise"]]
dataset['date'] = pd.to_datetime(dataset['date'])
# filtered_df = dataset[(dataset['date'].dt.month >= 6)&(dataset['date'].dt.month <= 10)]
dataset = dataset[(dataset['date'].dt.month >=7)&(dataset['date'].dt.month <=9)]
dataset

Unnamed: 0,date,temp_mean,temp_high,temp_low,rain,humid,day,light,sunrise,elec
61,2019-07-01,23.9,20.6,28.9,0.0,64.4,0,10.9,314.0,26.901
62,2019-07-02,24.4,20.8,30.1,0.0,63.5,1,11.9,315.0,26.922
63,2019-07-03,24.1,21.2,27.4,0.0,64.6,2,3.3,315.0,26.868
64,2019-07-04,25.4,20.0,32.8,0.0,59.6,3,12.7,316.0,27.479
65,2019-07-05,27.6,20.2,35.0,0.0,42.1,4,11.8,316.0,29.582
...,...,...,...,...,...,...,...,...,...,...
1244,2022-09-26,20.0,14.8,25.4,0.0,58.5,0,8.7,383.0,20.513
1245,2022-09-27,20.9,14.0,28.0,0.0,59.0,1,11.2,383.0,20.560
1246,2022-09-28,20.4,16.1,26.1,0.0,71.0,2,5.0,384.0,20.752
1247,2022-09-29,20.4,16.4,26.5,0.0,73.5,3,7.9,385.0,20.873


In [4]:
# Sort the dataset by date
dataset = dataset.sort_values(by='date')

# Calculate the index to split at (80% training, 20% testing)
split_index = int(len(dataset) * 0.8)

# Split the dataset into training and testing sets based on the calculated index
train_dataset = dataset.iloc[:split_index]
test_dataset = dataset.iloc[split_index:]

# Separate features and target for training and testing sets
X_train = train_dataset[["temp_mean", "temp_high", "temp_low", "rain", "humid", "day", "light", "sunrise"]]
y_train = train_dataset[["elec"]]
X_test = test_dataset[["temp_mean", "temp_high", "temp_low", "rain", "humid", "day", "light", "sunrise"]]
y_test = test_dataset[["elec"]]

# Print the sizes of training and testing sets
print("Training set size:", len(train_dataset))
print("Testing set size:", len(test_dataset))

Training set size: 294
Testing set size: 74


In [7]:
import itertools
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

# Sort the dataset by date
dataset = dataset.sort_values(by='date')

# Calculate the index to split at (80% training, 20% testing)
split_index = int(len(dataset) * 0.8)

# Split the dataset into training and testing sets based on the calculated index
train_dataset = dataset.iloc[:split_index]
test_dataset = dataset.iloc[split_index:]

# Generate all combinations of variables
variables = ["temp_mean", "temp_high", "temp_low", "rain", "humid", "day", "light", "sunrise"]
combinations = []
for r in range(1, len(variables) + 1):
    combinations.extend(list(itertools.combinations(variables, r)))

# Perform training and evaluation in a loop for all selected combinations
result_dict = {}

# Initialize list to store R2 values
r2df = []


# Save the result
result_file_name = "/content/drive/MyDrive/project/regression/result_output_sorted.txt"
with open(result_file_name, "w") as result_file:
    result_file.write("Sort in descending order by highest accuracy.\n")

    for comb in combinations:
        selected_columns = list(comb) + ["elec"]
        X_train = train_dataset[selected_columns[:-1]]
        y_train = train_dataset[selected_columns[-1:]]
        X_test = test_dataset[selected_columns[:-1]]
        y_test = test_dataset[selected_columns[-1:]]

        regressor = LinearRegression()
        regressor.fit(X_train, y_train)

        y_pred = regressor.predict(X_test)
        df = pd.DataFrame({'Actual': np.ravel(y_test), 'Predicted': np.ravel(y_pred)})
        df["Difference"] = df["Actual"] - df["Predicted"]
        df["Correct"] = df["Difference"].apply(lambda x: 1 if abs(x) <= 4 else 0)

        r2 = r2_score(y_test, y_pred)
        correct_mean_1_31 = df.loc[0:30, "Correct"].mean()
        correct_percentage_1_31 = correct_mean_1_31 * 100

        result_dict[tuple(selected_columns)] = (r2, correct_percentage_1_31)

        # Store r2 values at r2df
        r2df.append(r2)


    # Define the target range to scale
    min_target = 0.1
    max_target = 0.95

    # Find the minimum and maximum values in the original list
    min_original = min(r2df)
    max_original = max(r2df)

    # Calculate the scaling factor
    scale_factor = (max_target - min_target) / (max_original - min_original)

    # Scale the original values to the target range
    scaled_list = [min_target + (x - min_original) * scale_factor for x in r2df]

    scaled_list.sort(reverse=True)


    counter = 0
    # List in order of highest R-squared (R2) values
    sorted_result = sorted(result_dict.items(), key=lambda x: x[1][0], reverse=True)
    result_file.write("\nList in order of highest R-squared (R2) values:\n")
    for selected_columns, (r2, correct_percentage_1_31) in sorted_result:
        result_file.write(f"Selected Columns: {selected_columns}, R-squared (R2) score: {r2:.4f}\n")
        result_file.write(f"Scaled R-squared (R2) score: {scaled_list[counter] * 100 : .2f}\n")
        result_file.write(f"Percentage of correct predictions for rows 1-31: {correct_percentage_1_31:.2f}%\n")
        counter = counter + 1