<a href="https://colab.research.google.com/github/themathedges/3YP-Standalone-Kennington/blob/main/Ravi/Old_regression/Model_4_Multivariate_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model 4: Multivariate Regression

**Author:** Ravi Kohli

**Date:** December 27th, 2020

**College:** Christ Church

**Goal:**
- To build a very simple monthly multivariate linear regression model that forecasts the future generation profile of Sandford Hydro

In [None]:
# Mount Google Drive (Colab-specific) at /content/drive so that the pickled
# data files referenced by the later cells can be read from 'My Drive'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# import the modules
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from scipy.stats import pearsonr
import pickle
from sklearn.linear_model import LinearRegression

## Collecting the Data

### Total Monthly Precipitation

In [None]:
# Unpickle the current (observed) monthly precipitation dataframe.
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# that come from a trusted source.
path = '/content/drive/My Drive/3YP/data/'
filename = 'precipitation_data_processed'
# The context manager closes the file even if unpickling raises.
with open(path + filename, 'rb') as infile:
    precipitation_monthly_df = pickle.load(infile)

In [None]:
# Build the training-period precipitation series by purely positional indexing:
# one value from row 250 followed by ten values from row 251 (11 values in total).
# NOTE(review): the row numbers and column slices are hard-coded positions; they
# presumably address the 2018 and 2019 rows of a year-by-month table - confirm
# against the pickled frame before reusing with a different data file.
precip_overlay_part1 = precipitation_monthly_df.iloc[250][12:13]          # getting the December 2018 data
precip_overlay_part2 = precipitation_monthly_df.iloc[251][1:11]           # getting the 2019 data (positions 1-10 only)
model_4_precip = pd.concat([precip_overlay_part1, precip_overlay_part2])  # combining the precipitation data to form monthly data

In [None]:
# Unpickle the future (Met Office projection) monthly precipitation dataframe.
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# that come from a trusted source.
path = '/content/drive/My Drive/3YP/data/met_office/'
filename = 'precip_2335_met_monthly'
# The context manager closes the file even if unpickling raises.
with open(path + filename, 'rb') as infile:
    precipitation_future_df = pickle.load(infile)

In [None]:
# Restrict the precipitation projection to the forecast window.
# The bounds are parsed from YYYY-MM-DD strings and are both inclusive.
model_4_future_mask_start = pd.to_datetime('2050-01-01')
model_4_future_mask_end = pd.to_datetime('2050-12-31')

# boolean row selector: True where 'Date' falls inside the window
model_4_precip_future_mask = precipitation_future_df['Date'].between(
    model_4_future_mask_start, model_4_future_mask_end
)

# keep only the rows flagged by the selector
precipitation_future = precipitation_future_df[model_4_precip_future_mask]

In [None]:
# Display the rows selected for the forecast window as a visual sanity check
precipitation_future

Unnamed: 0,Date,Precipitation Rate
3330,2050-01-16,3.122402
3334,2050-02-16,2.776208
3338,2050-03-16,3.617622
3342,2050-04-16,1.736092
3346,2050-05-16,1.108913
3350,2050-06-16,1.9039
3354,2050-07-16,1.971584
3358,2050-08-16,0.550804
3362,2050-09-16,0.236867
3366,2050-10-16,1.071648


In [None]:
# Number of days in each calendar month (January..December) of the forecast year.
# Derived from the calendar instead of being typed by hand so that February is
# right for any year; the year must match the one selected by the forecast mask.
model_4_forecast_year = 2050
number_of_days = np.array(
    pd.date_range(
        start=pd.Timestamp(year=model_4_forecast_year, month=1, day=1),
        periods=12,
        freq='MS',   # one timestamp per month start
    ).days_in_month,
    dtype=np.int64,
)

In [None]:
# Future monthly precipitation totals.
# The projection stores an average per day, so scale each month's value by the
# number of days in that month, then hand back a plain NumPy array.
precip_monthly_totals = precipitation_future['Precipitation Rate'].mul(number_of_days)
precip_target = precip_monthly_totals.to_numpy(copy=True)

In [None]:
# Re-order the calendar-year values so that the series starts with December and
# leaves out November: rotate right by one place (the last element moves to the
# front), then drop the final element.
precip_target_ordered = np.roll(precip_target, 1)[:-1]

### Average Monthly Temperature

In [None]:
# Unpickle the current (observed) daily temperature dataframe.
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# that come from a trusted source.
path = '/content/drive/My Drive/3YP/data/'
filename = 'radcliffe_daily_temperature_data_processed'
# The context manager closes the file even if unpickling raises.
with open(path + filename, 'rb') as infile:
    temperature_daily_df = pickle.load(infile)

In [None]:
# Collapse the daily readings into monthly means: group the rows into monthly
# bins keyed on the 'Date' column, then average within each bin
monthly_bins = temperature_daily_df.resample('M', on='Date')
temperature_monthly_df = monthly_bins.mean()

In [None]:
# Pick out the monthly means that cover the training period.
# The bounds are parsed from YYYY-MM-DD strings and are both inclusive.
model_4_temp_mask_start = pd.to_datetime('2018-12-01')
model_4_temp_mask_end = pd.to_datetime('2019-10-31')

# element-wise test of the frame's index against both bounds
monthly_index = temperature_monthly_df.index
model_4_temp_mask = np.logical_and(
    monthly_index >= model_4_temp_mask_start,
    monthly_index <= model_4_temp_mask_end,
)

# keep only the rows flagged by the mask
model_4_temp_extract = temperature_monthly_df[model_4_temp_mask]

In [None]:
# Unpickle the future (Met Office projection) monthly temperature dataframe.
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# that come from a trusted source.
path = '/content/drive/My Drive/3YP/data/met_office/'
# NOTE(review): the original line carried a trailing '#2335' remark, and the
# precipitation projection is loaded from a '2335' file - confirm that '2242'
# is the intended file for temperature.
filename = 'temp_2242_met_monthly' #2335
# The context manager closes the file even if unpickling raises.
with open(path + filename, 'rb') as infile:
    temperature_future_df = pickle.load(infile)

In [None]:
# Display the loaded temperature projection to inspect its columns and date range
temperature_future_df

Unnamed: 0,Date,Mean Air Tempeature
14,1980-12-16,-0.447383
18,1981-01-16,-1.803063
22,1981-02-16,5.492897
26,1981-03-16,6.562324
30,1981-04-16,9.374220
...,...,...
3406,2051-08-16,21.363884
3410,2051-09-16,17.817898
3414,2051-10-16,13.470314
3418,2051-11-16,9.025274


In [None]:
# Select the forecast-window rows of the temperature projection, using the same
# window bounds that were defined for the precipitation projection.
model_4_temp_future_mask = (temperature_future_df['Date'] >= model_4_future_mask_start) & (temperature_future_df['Date'] <= model_4_future_mask_end)

# Index with the temperature frame's OWN mask. The earlier version passed
# model_4_precip_future_mask here, which is built from a different dataframe and
# only selects the right rows when both frames happen to share index labels.
# 'Mean Air Tempeature' (sic) is the column name as stored in the pickled frame.
temp_future = np.array(temperature_future_df.loc[model_4_temp_future_mask, 'Mean Air Tempeature'])    # data in degrees celsius

In [None]:
# Re-order the calendar-year temperatures so that the series starts with the last
# value (December) and leaves out the second-to-last (November): rotate right by
# one place, then drop the final element.
temp_future_ordered = np.roll(temp_future, 1)[:-1]

In [None]:
# Print the calendar-ordered and the re-ordered arrays side by side to check the rotation
print(temp_future, temp_future_ordered)

[ 6.61692565  7.33836532  5.78745175 11.82022925 14.328254   17.83871075
 20.807478   22.87336775 18.087347   12.85819375 11.1655725   5.97792705] [ 5.97792705  6.61692565  7.33836532  5.78745175 11.82022925 14.328254
 17.83871075 20.807478   22.87336775 18.087347   12.85819375]


### Total Monthly Generation

In [None]:
# Recorded monthly generation, hard-coded as 11 values (kWh).
# NOTE(review): the values are lined up row by row with the December 2018 -
# October 2019 temperature/precipitation data further down; the source of these
# figures is not shown in this notebook - confirm the month alignment.
monthly_sandford_generation = np.array([181051, 190126, 204585, 207951, 154112, 49882, 75884, 11172, 22088, 18483, 96626]) # data is in kWh

## Multivariate Regression Analysis

In [None]:
# Ordinary least-squares linear regression with scikit-learn's default settings
model_4 = LinearRegression()

In [None]:
# Assemble the regression table, one row per month. The future arrays have
# already been re-ordered to start at December and exclude November so that
# they line up with the training months.
#
# Building from a dict of columns keeps every column's own dtype (the previous
# list-of-rows + transpose construction produced object-dtype columns) and
# raises straight away if the inputs differ in length instead of silently
# padding with NaN. Series inputs are converted to arrays so the frame keeps a
# plain 0..n-1 index rather than aligning on their labels.
model_4_df = pd.DataFrame({
    'Date': model_4_temp_extract.index,
    'Mean_temp': model_4_temp_extract['Daily Tmean °C'].to_numpy(),
    'Total_precip': model_4_precip.to_numpy(dtype=float),
    'Total_monthly_generation': monthly_sandford_generation,
    'Future_precip': precip_target_ordered,
    'Future_temp': temp_future_ordered,
})

In [None]:
# Display the assembled regression table for inspection
model_4_df

Unnamed: 0,Date,Mean_temp,Total_precip,Total_monthly_generation,Future_precip,Future_temp
0,2018-12-31,7.43226,76.1,181051,81.9643,5.97793
1,2019-01-31,4.1,58.6,190126,96.7945,6.61693
2,2019-02-28,7.08214,24.1,204585,77.7338,7.33837
3,2019-03-31,8.88065,83.4,207951,112.146,5.78745
4,2019-04-30,9.66667,52.8,154112,52.0828,11.8202
5,2019-05-31,12.6581,86.2,49882,34.3763,14.3283
6,2019-06-30,15.6367,2.5,75884,57.117,17.8387
7,2019-07-31,19.1097,23.2,11172,61.1191,20.8075
8,2019-08-31,18.5516,43.2,22088,17.0749,22.8734
9,2019-09-30,15.52,30.3,18483,7.106,18.0873


In [None]:
# Predictors (mean temperature, total precipitation) and response (generation)
# used to fit the model
feature_columns = ['Mean_temp', 'Total_precip']
x_train = model_4_df[feature_columns]
y_train = model_4_df['Total_monthly_generation']

In [None]:
# Fit the regression on the training months, then print the intercept followed
# by the coefficients (one per x_train column: Mean_temp, Total_precip)
model_4.fit(x_train, y_train)
print(model_4.intercept_, model_4.coef_)

302030.3743351451 [-15347.02854018   -229.0541033 ]


In [None]:
# Make predictions on the future data.
# The model was fitted on columns named 'Mean_temp' and 'Total_precip';
# scikit-learn checks column names at predict time, so the future columns are
# renamed to the fitted names (same order: temperature first, precipitation
# second) instead of being passed under different names.
x_future = model_4_df[['Future_temp', 'Future_precip']].rename(
    columns={'Future_temp': 'Mean_temp', 'Future_precip': 'Total_precip'}
)
y_pred = model_4.predict(x_future)

In [None]:
# Display the monthly predictions (same units as the training target, kWh)
y_pred

array([191512.69362759, 178309.05582047, 171603.01865674, 187522.62002563,
       108695.21034068,  74260.21516736,  15176.28708574, -31302.16381126,
       -52918.93351512,  22815.68517744,  97085.87984367])

In [None]:
# Floor every monthly prediction at zero (a negative generation figure is not
# meaningful), then print the total predicted generation over all months
y_pred_corrected = [0 if monthly_kwh < 0 else monthly_kwh for monthly_kwh in y_pred]
print(sum(y_pred_corrected))

1046980.6657453115


In [None]:
# Per-month percentage difference of the clamped prediction relative to the
# recorded generation; months clamped to zero show up as -100
((y_pred_corrected - monthly_sandford_generation) / monthly_sandford_generation ) * 100

array([   5.77831309,   -6.21532256,  -16.12140741,   -9.82365075,
        -29.46998914,   48.87176771,  -80.00067592, -100.        ,
       -100.        ,   23.44146068,    0.47593799])

In [None]:
# Total recorded generation over all months in the array (kWh)
sum(monthly_sandford_generation)

1211960

In [None]:
# Percentage difference between total predicted and total recorded generation.
# Computed from the variables rather than from numbers pasted out of earlier
# cell outputs, so the figure stays correct whenever the data or model change.
total_predicted = sum(y_pred_corrected)
total_actual = sum(monthly_sandford_generation)
(total_predicted - total_actual) / total_actual * 100

-13.612605552550288