<a href="https://colab.research.google.com/github/themathedges/3YP-Standalone-Kennington/blob/main/Ravi/Sanford_Generation_models/Model_3_Multivariate_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model 3: Multivariate Regression

**Author:** Ravi Kohli

**Date:** December 21st, 2020

**College:** Christ Church

**Goal:**
- To build a very simple monthly multivariate linear regression model that forecasts the future generation profile of Sandford Hydro

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# data files read by the later cells are reachable through the file system
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# import the modules
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from scipy.stats import pearsonr
import pickle
from sklearn.linear_model import LinearRegression

## Collecting the Data

### Total Monthly Precipitation

In [None]:
# unpickle the precipitation monthly/ annual dataframe
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
path = '/content/drive/My Drive/3YP/data/'
filename = 'precipitation_data_processed'
# the context manager closes the file handle even if unpickling raises
with open(path+filename, 'rb') as infile:
    precipitation_monthly_df = pickle.load(infile)

In [None]:
# Row 250, position 12 holds the December 2018 value
december_2018_row = precipitation_monthly_df.iloc[250]
precip_overlay_part1 = december_2018_row[12:13]

# Row 251, positions 1-10 hold the 2019 values used by the model
year_2019_row = precipitation_monthly_df.iloc[251]
precip_overlay_part2 = year_2019_row[1:11]

# Join the two pieces end-to-end to form one continuous monthly series
model_3_precip = pd.concat([precip_overlay_part1, precip_overlay_part2])

In [None]:
# Forecast monthly precipitation totals fed to the fitted model (one entry per month)
precip_target = np.array([
    88.276, 67.976, 27.956, 83.4,
    52.8, 86.2, 2, 18.56,
    34.56, 30.3, 50.1,
])

### Average Monthly Temperature

In [None]:
# unpickle the temperature daily dataframe
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
path = '/content/drive/My Drive/3YP/data/'
filename = 'radcliffe_daily_temperature_data_processed'
# the context manager closes the file handle even if unpickling raises
with open(path+filename, 'rb') as infile:
    temperature_daily_df = pickle.load(infile)

In [None]:
# bucket the daily rows by month using the 'Date' column, then average each bucket
monthly_temperature_groups = temperature_daily_df.resample('M', on='Date')
temperature_monthly_df = monthly_temperature_groups.mean()

In [None]:
# Inclusive date window for the modelling period (format: YYYY-MM-DD)
model_3_temp_mask_start = pd.to_datetime('2018-12-01')
model_3_temp_mask_end = pd.to_datetime('2019-10-31')

# Boolean mask that is True for index entries that fall inside the window
monthly_index = temperature_monthly_df.index
on_or_after_start = monthly_index >= model_3_temp_mask_start
on_or_before_end = monthly_index <= model_3_temp_mask_end
model_3_temp_mask = on_or_after_start & on_or_before_end

# Keep only the rows selected by the mask
model_3_temp_extract = temperature_monthly_df.loc[model_3_temp_mask]

In [None]:
# Forecast mean monthly temperatures in degrees celsius (one entry per month)
temp_future = np.array([
    9.75126, 6.419, 9.40114, 8.88065,
    9.66667, 12.6581, 17.3167, 20.7897,
    20.2316, 17.2, 12.6348,
])

### Total Monthly Generation

In [None]:
# Monthly generation totals in kWh (one entry per month)
monthly_sandford_generation = np.array([
    181051, 190126, 204585, 207951,
    154112, 49882, 75884, 11172,
    22088, 18483, 96626,
])

## Multivariate Regression Analysis

In [None]:
# create the (not yet fitted) linear regression estimator; no arguments are
# passed, so scikit-learn's default settings apply
model_3 = LinearRegression()

In [None]:
# Assemble the modelling table column-by-column so that each column keeps its
# own dtype (building it as a list of rows and transposing would leave every
# column as `object`). Plain arrays are passed, so the frame gets a default
# 0..n-1 index, and a length mismatch between inputs raises immediately.
model_3_df = pd.DataFrame({
    'Date': model_3_temp_extract.index,
    'Mean_temp': model_3_temp_extract['Daily Tmean °C'].to_numpy(),
    'Total_precip': model_3_precip.values,
    'Total_monthly_generation': monthly_sandford_generation,
    'Future_precip': precip_target,
    'Future_temp': temp_future,
})

In [None]:
# display the assembled modelling dataframe
model_3_df

Unnamed: 0,Date,Mean_temp,Total_precip,Total_monthly_generation,Future_precip,Future_temp
0,2018-12-31,7.43226,76.1,181051,88.276,9.75126
1,2019-01-31,4.1,58.6,190126,67.976,6.419
2,2019-02-28,7.08214,24.1,204585,27.956,9.40114
3,2019-03-31,8.88065,83.4,207951,83.4,8.88065
4,2019-04-30,9.66667,52.8,154112,52.8,9.66667
5,2019-05-31,12.6581,86.2,49882,86.2,12.6581
6,2019-06-30,15.6367,2.5,75884,2.0,17.3167
7,2019-07-31,19.1097,23.2,11172,18.56,20.7897
8,2019-08-31,18.5516,43.2,22088,34.56,20.2316
9,2019-09-30,15.52,30.3,18483,30.3,17.2


In [None]:
# predictors: observed mean temperature and total precipitation
feature_columns = ['Mean_temp', 'Total_precip']
x_train = model_3_df[feature_columns]

# response: observed total monthly generation
y_train = model_3_df['Total_monthly_generation']

In [None]:
# fit the regression on the training data, then report the learned parameters
model_3.fit(x_train, y_train)
fitted_intercept = model_3.intercept_
fitted_coefficients = model_3.coef_
print(fitted_intercept, fitted_coefficients)

302030.3743351451 [-15347.02854018   -229.0541033 ]


In [None]:
# make predictions on the future
# The model was fitted on columns named 'Mean_temp' and 'Total_precip'.
# Recent scikit-learn releases check that the column names passed to
# predict() match those seen during fit(), so the forecast columns are
# relabelled (same order: temperature first, precipitation second).
future_features = model_3_df[['Future_temp', 'Future_precip']].rename(
    columns={'Future_temp': 'Mean_temp', 'Future_precip': 'Total_precip'}
)
y_pred = model_3.predict(future_features)

In [None]:
# display the monthly predictions (one value per row of the forecast inputs)
y_pred

array([132157.52878926, 187947.61640961, 151347.37393296, 146635.67311433,
       141581.65730223,  88021.68866597,  35812.37700675, -21280.989064  ,
       -16380.67808856,  31121.14411393,  96648.12756018])

In [None]:
# Floor negative predicted generation at zero. np.maximum does this in one
# vectorised step and keeps the result as an ndarray, so later element-wise
# arithmetic does not rely on implicit list-to-array conversion.
y_pred_corrected = np.maximum(y_pred, 0)

# total predicted generation over all forecast months
print(y_pred_corrected.sum())

1011273.1868952075


In [None]:
# signed percentage difference between each corrected prediction and the
# recorded generation for the same position
generation_difference = y_pred_corrected - monthly_sandford_generation
generation_difference / monthly_sandford_generation * 100

array([-2.70053583e+01, -1.14575786e+00, -2.60222529e+01, -2.94854686e+01,
       -8.13067295e+00,  7.64598225e+01, -5.28064190e+01, -1.00000000e+02,
       -1.00000000e+02,  6.83771255e+01,  2.29002134e-02])

In [None]:
# total recorded generation across all months in the array
monthly_sandford_generation.sum()

1211960