In [343]:
# Libraries for data loading, data manipulation and data visulisation
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Libraries for data preparation and model building
import statsmodels.graphics.api as sga
import statsmodels.formula.api as sfa
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# print multiple outputs in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Setting global constants to ensure notebook results are reproducible
# PARAMETER_CONSTANT = ###

## Rationale to selecting predictor variables

In [431]:
df1 = pd.read_csv("df_train.csv")
print(f"There are {df1.shape[0]} rows and {df1.shape[1]} columns before")

# Remove unnecessary column(s)

df_train = df1.drop(labels="Unnamed: 0", axis=1)
print(f"There are {df_train.shape[0]} rows and {df_train.shape[1]} columns after")
df_train.head(1)
print('', end="\n\n")

There are 8763 rows and 49 columns before
There are 8763 rows and 48 columns after


Unnamed: 0,time,Madrid_wind_speed,Valencia_wind_deg,Bilbao_rain_1h,Valencia_wind_speed,Seville_humidity,Madrid_humidity,Bilbao_clouds_all,Bilbao_wind_speed,Seville_clouds_all,...,Madrid_temp_max,Barcelona_temp,Bilbao_temp_min,Bilbao_temp,Barcelona_temp_min,Bilbao_temp_max,Seville_temp_min,Madrid_temp,Madrid_temp_min,load_shortfall_3h
0,2015-01-01 03:00:00,0.666667,level_5,0.0,0.666667,74.333333,64.0,0.0,1.0,0.0,...,265.938,281.013,269.338615,269.338615,281.013,269.338615,274.254667,265.938,265.938,6715.666667






In [433]:
# Function to describe variable (including mode and median)

def describe(df):
    d = {0:[df.mean(), df.median(), df.mode()[0]]}
    dat = pd.DataFrame(data=d).rename(index={0: "Mean", 1: "Median", 2: "Mode"})
    return pd.concat([df.describe(), dat])

# Deal with null containing column(s)

df_train_clean = df_train.copy()
df_train_clean["Valencia_pressure"] = df_train_clean["Valencia_pressure"].fillna(df_train_clean["Valencia_pressure"].mode()[0])
print('', end="\n\n")

# Convert object dtypes to float

df_train_clean["Valencia_wind_deg"] = df_train_clean["Valencia_wind_deg"].str.extract("(\d+)").astype(int)
df_train_clean["Seville_pressure"] = df_train_clean["Seville_pressure"].str.extract("(\d+)").astype(int)
df_train_clean["time"] = pd.to_datetime(df_train_clean["time"])

# extract features from date

df_train_clean["time_year"] = df_train_clean["time"].dt.year.astype(int)
df_train_clean["time_month"] = df_train_clean["time"].dt.month.astype(int)
df_train_clean["time_day"] = df_train_clean["time"].dt.day.astype(int)
df_train_clean['time_dayofyear'] = df_train_clean['time'].dt.dayofyear.astype(int)
df_train_clean["time_hour"] = df_train_clean["time"].dt.hour.astype(int)
df_train_clean["time_weekday"] = df_train_clean["time"].dt.weekday.astype(int) # Monday is 0 and Sunday is 6
df_train_clean["time_weeknumber"] = df_train_clean["time"].dt.week.astype(int)

# Sort columns and drop noise ("time")

df_train_clean_sort = df_train_clean[sorted(df_train_clean)]
df_train_clean_sort = df_train_clean_sort.drop(labels="time", axis=1)
df_train_clean_sort.head(1)





Unnamed: 0,Barcelona_pressure,Barcelona_rain_1h,Barcelona_rain_3h,Barcelona_temp,Barcelona_temp_max,Barcelona_temp_min,Barcelona_weather_id,Barcelona_wind_deg,Barcelona_wind_speed,Bilbao_clouds_all,...,Valencia_wind_deg,Valencia_wind_speed,load_shortfall_3h,time_day,time_dayofyear,time_hour,time_month,time_weekday,time_weeknumber,time_year
0,1036.333333,0.0,0.0,281.013,281.013,281.013,800.0,42.666667,6.333333,0.0,...,5,0.666667,6715.666667,1,1,3,1,3,1,2015


In [262]:
# Variable selection
x_0=df_train_clean_sort.copy()

x_temp_min = x_0.filter(regex="min$", axis=1)
x_temp_min["Mean_temp_min"] = x_temp_min.mean(axis=1)
x_temp_min.head(1)

x_wind_deg1 = x_0.filter(regex=r'(Barcelona_wind_deg|Bilbao_wind_deg)', axis=1)
x_wind_deg1["Mean_wind_deg1"] = x_wind_deg1.mean(axis=1)
x_wind_deg1.head(1)

x_humidity = x_0.filter(regex="humidity", axis=1)
x_humidity["Mean_humidity"] = x_humidity.mean(axis=1)
x_humidity.head(1)

x_clouds_all = x_0.filter(regex="clouds", axis=1)
x_clouds_all["Mean_clouds_all"] = x_clouds_all.mean(axis=1)
x_clouds_all.head(1)

x_weather_id = x_0.filter(regex="weather", axis=1)
x_weather_id["Mean_weather_id"] = x_weather_id.mean(axis=1)
x_weather_id.head(1)

x_time = x_0.filter(regex="time", axis=1)
x_time["time_hourofyear"] = x_time["time_hour"] + ((x_time["time_dayofyear"] - 1) * 24)
x_time.tail(1)

Unnamed: 0,Barcelona_temp_min,Bilbao_temp_min,Madrid_temp_min,Seville_temp_min,Valencia_temp_min,Mean_temp_min
0,281.013,269.338615,265.938,274.254667,269.888,272.086456


Unnamed: 0,Barcelona_wind_deg,Bilbao_wind_deg,Mean_wind_deg1
0,42.666667,223.333333,133.0


Unnamed: 0,Madrid_humidity,Seville_humidity,Valencia_humidity,Mean_humidity
0,64.0,74.333333,75.666667,71.333333


Unnamed: 0,Bilbao_clouds_all,Madrid_clouds_all,Seville_clouds_all,Mean_clouds_all
0,0.0,0.0,0.0,0.0


Unnamed: 0,Barcelona_weather_id,Bilbao_weather_id,Madrid_weather_id,Seville_weather_id,Mean_weather_id
0,800.0,800.0,800.0,800.0,800.0


Unnamed: 0,time_day,time_dayofyear,time_hour,time_month,time_weekday,time_weeknumber,time_year,time_hourofyear
8762,31,365,21,12,6,52,2017,8757


In [263]:
x_select = x_time[["time_hourofyear"]].join(other = [x_temp_min["Mean_temp_min"], x_wind_deg1["Mean_wind_deg1"],
                                                         x_humidity["Mean_humidity"], x_clouds_all["Mean_clouds_all"],
                                                         x_weather_id["Mean_weather_id"]])
x_select.head(1)
x_select.shape

Unnamed: 0,time_hourofyear,Mean_temp_min,Mean_wind_deg1,Mean_humidity,Mean_clouds_all,Mean_weather_id
0,3,272.086456,133.0,71.333333,0.0,800.0


(8763, 6)

## Transform data function

In [114]:
def transform_alltime(csv):
    a = pd.read_csv(csv)
    
    # Convert object dtypes to float
    a["time"] = pd.to_datetime(a["time"])
    
    # extract features from date
    a["time_year"] = a["time"].dt.year.astype(int)
    a["time_month"] = a["time"].dt.month.astype(int)
    a["time_day"] = a["time"].dt.day.astype(int)
    a['time_dayofyear'] = a['time'].dt.dayofyear.astype(int)
    a["time_hour"] = a["time"].dt.hour.astype(int)
    a["time_weekday"] = a["time"].dt.weekday.astype(int) # Monday is 0 and Sunday is 6
    a["time_weeknumber"] = a["time"].dt.week.astype(int)
    
    # Sort columns and drop noise ("time")
    b = a[sorted(a)]
    b = b.drop(labels="time", axis=1)
    
    # Mean of variables
    c = b.filter(regex="min$", axis=1)
    c["Mean_temp_min"] = c.mean(axis=1)
    
    d = b.filter(regex=r'(Barcelona_wind_deg|Bilbao_wind_deg)', axis=1)
    d["Mean_wind_deg1"] = d.mean(axis=1)
    
    e = b.filter(regex="humidity", axis=1)
    e["Mean_humidity"] = e.mean(axis=1)
    
    f = b.filter(regex="clouds", axis=1)
    f["Mean_clouds_all"] = f.mean(axis=1)
    
    g = b.filter(regex="weather", axis=1)
    g["Mean_weather_id"] = g.mean(axis=1)
    
    h = b.filter(regex="time", axis=1)
    h["time_hourofyear"] = h["time_hour"] + ((h["time_dayofyear"] - 1) * 24)
    
    final = h.join(other = [c["Mean_temp_min"], d["Mean_wind_deg1"],
                                                 e["Mean_humidity"], f["Mean_clouds_all"],
                                                 g["Mean_weather_id"]])
    
    return final

def transform_HofY(csv):
    a = pd.read_csv(csv)
    
    # Convert object dtypes to float
    a["time"] = pd.to_datetime(a["time"])
    
    # extract features from date
    a["time_year"] = a["time"].dt.year.astype(int)
    a["time_month"] = a["time"].dt.month.astype(int)
    a["time_day"] = a["time"].dt.day.astype(int)
    a['time_dayofyear'] = a['time'].dt.dayofyear.astype(int)
    a["time_hour"] = a["time"].dt.hour.astype(int)
    a["time_weekday"] = a["time"].dt.weekday.astype(int) # Monday is 0 and Sunday is 6
    a["time_weeknumber"] = a["time"].dt.week.astype(int)
    
    # Sort columns and drop noise ("time")
    b = a[sorted(a)]
    b = b.drop(labels="time", axis=1)
    
    # Mean of variables
    c = b.filter(regex="min$", axis=1)
    c["Mean_temp_min"] = c.mean(axis=1)
    
    d = b.filter(regex=r'(Barcelona_wind_deg|Bilbao_wind_deg)', axis=1)
    d["Mean_wind_deg1"] = d.mean(axis=1)
    
    e = b.filter(regex="humidity", axis=1)
    e["Mean_humidity"] = e.mean(axis=1)
    
    f = b.filter(regex="clouds", axis=1)
    f["Mean_clouds_all"] = f.mean(axis=1)
    
    g = b.filter(regex="weather", axis=1)
    g["Mean_weather_id"] = g.mean(axis=1)
    
    h = b.filter(regex="time", axis=1)
    h["time_hourofyear"] = h["time_hour"] + ((h["time_dayofyear"] - 1) * 24)
    
    final = h[["time_hourofyear"]].join(other = [c["Mean_temp_min"], d["Mean_wind_deg1"],
                                                 e["Mean_humidity"], f["Mean_clouds_all"],
                                                 g["Mean_weather_id"]])
    
    return final

In [497]:
def transform_v001(csv):
    fldname = 'time'
    a = pd.read_csv(csv, parse_dates=[fldname])
    
    fld = a[fldname]
    targ_pre = re.sub('[Dd]ates','', fldname)
    for n in ("Year", "Month", "Week", "Day", "Hour", "Dayofweek", "Dayofyear", "Is_month_end", "Is_month_start", 
                  "Is_quarter_end", "Is_quarter_start", "Is_year_end", "Is_year_start"):
        a[targ_pre+n] = getattr(fld.dt, n.lower())
    a[targ_pre+'Elapsed' ] = (fld - fld.min()).dt.days
    a.drop(fldname, axis=1, inplace=True)
    a['Valencia_pressure']= a.groupby(['timeYear', 'timeMonth'])['Valencia_pressure'].apply(lambda x: x.fillna(round(x.mean(), 1)))
    a['Valencia_wind_deg']= a["Valencia_wind_deg"].str.extract("(\d+)").astype(int)
    a["Seville_pressure"] = a["Seville_pressure"].str.extract("(\d+)").astype(int)
    
    return a

In [409]:
feature_vector_df = pd.read_csv("df_test.csv")
feature_vector_df["time"] = pd.to_datetime(feature_vector_df["time"])
targ_pre = re.sub('[Dd]ates','', "time")
fld = feature_vector_df["time"]
for n in ("Year", "Month", "Week", "Day", "Hour", "Dayofweek", "Dayofyear", "Is_month_end", "Is_month_start", 
          "Is_quarter_end", "Is_quarter_start", "Is_year_end", "Is_year_start"):
    feature_vector_df[targ_pre+n] = getattr(fld.dt, n.lower())
feature_vector_df[targ_pre+'Elapsed' ] = (fld - fld.min()).dt.days
feature_vector_df.drop("time", axis=1, inplace=True)
feature_vector_df['Valencia_pressure']= feature_vector_df.groupby(['timeYear', 'timeMonth'])['Valencia_pressure'].apply(lambda x: x.fillna(round(x.mean(), 1)))
feature_vector_df['Valencia_wind_deg']= feature_vector_df["Valencia_wind_deg"].str.extract("(\d+)").astype(int)
feature_vector_df["Seville_pressure"] = feature_vector_df["Seville_pressure"].str.extract("(\d+)").astype(int)
predict_vector = feature_vector_df.copy()

array([[8763, 5.0, 8, ..., False, True, 0],
       [8764, 4.6666666667, 8, ..., False, True, 0],
       [8765, 2.3333333333, 7, ..., False, True, 0],
       ...,
       [11680, 1.0, 6, ..., True, False, 364],
       [11681, 1.0, 6, ..., True, False, 364],
       [11682, 1.3333333333, 10, ..., True, False, 364]], dtype=object)

In [498]:
x = transform_v001("df_train.csv").drop('load_shortfall_3h', axis=1)
y = pd.read_csv("df_train.csv")[["load_shortfall_3h"]]
x

Unnamed: 0.1,Unnamed: 0,Madrid_wind_speed,Valencia_wind_deg,Bilbao_rain_1h,Valencia_wind_speed,Seville_humidity,Madrid_humidity,Bilbao_clouds_all,Bilbao_wind_speed,Seville_clouds_all,...,timeHour,timeDayofweek,timeDayofyear,timeIs_month_end,timeIs_month_start,timeIs_quarter_end,timeIs_quarter_start,timeIs_year_end,timeIs_year_start,timeElapsed
0,0,0.666667,5,0.0,0.666667,74.333333,64.000000,0.000000,1.000000,0.000000,...,3,3,1,False,True,False,True,False,True,0
1,1,0.333333,10,0.0,1.666667,78.333333,64.666667,0.000000,1.000000,0.000000,...,6,3,1,False,True,False,True,False,True,0
2,2,1.000000,9,0.0,1.000000,71.333333,64.333333,0.000000,1.000000,0.000000,...,9,3,1,False,True,False,True,False,True,0
3,3,1.000000,8,0.0,1.000000,65.333333,56.333333,0.000000,1.000000,0.000000,...,12,3,1,False,True,False,True,False,True,0
4,4,1.000000,7,0.0,1.000000,59.000000,57.000000,2.000000,0.333333,0.000000,...,15,3,1,False,True,False,True,False,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,8758,1.000000,6,0.0,2.666667,89.000000,95.666667,56.666667,4.333333,80.000000,...,9,6,365,True,False,True,False,True,False,1095
8759,8759,5.000000,6,0.0,2.000000,82.000000,85.000000,26.666667,8.000000,75.000000,...,12,6,365,True,False,True,False,True,False,1095
8760,8760,6.333333,9,0.4,7.333333,67.666667,71.000000,63.333333,8.333333,33.333333,...,15,6,365,True,False,True,False,True,False,1095
8761,8761,7.333333,8,0.2,7.333333,67.666667,79.000000,63.333333,2.666667,51.666667,...,18,6,365,True,False,True,False,True,False,1095


In [411]:
X_train, X_test, y_train, y_test= train_test_split(x,
                                                   y,
                                                   test_size=0.2,
                                                   random_state=6)

In [456]:
list(pd.read_csv("df_train.csv").info())#columns.values)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 49 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            8763 non-null   int64  
 1   time                  8763 non-null   object 
 2   Madrid_wind_speed     8763 non-null   float64
 3   Valencia_wind_deg     8763 non-null   object 
 4   Bilbao_rain_1h        8763 non-null   float64
 5   Valencia_wind_speed   8763 non-null   float64
 6   Seville_humidity      8763 non-null   float64
 7   Madrid_humidity       8763 non-null   float64
 8   Bilbao_clouds_all     8763 non-null   float64
 9   Bilbao_wind_speed     8763 non-null   float64
 10  Seville_clouds_all    8763 non-null   float64
 11  Bilbao_wind_deg       8763 non-null   float64
 12  Barcelona_wind_speed  8763 non-null   float64
 13  Barcelona_wind_deg    8763 non-null   float64
 14  Madrid_clouds_all     8763 non-null   float64
 15  Seville_wind_speed   

TypeError: 'NoneType' object is not iterable

In [501]:
heyo = pd.read_csv("df_train.csv")
x = heyo[['Seville_temp', 'Valencia_temp_min', 'Barcelona_temp_max', 'Madrid_temp_min',
          'Madrid_temp', 'Seville_temp_max', 'Barcelona_temp_min',
          'Valencia_temp_max', 'Valencia_temp', 'Bilbao_temp_max', 'Seville_temp_min',
          'Madrid_temp_max', 'Barcelona_temp', 'Bilbao_temp_min', 'Bilbao_temp']]
x

Unnamed: 0,Seville_temp,Valencia_temp_min,Barcelona_temp_max,Madrid_temp_min,Madrid_temp,Seville_temp_max,Barcelona_temp_min,Valencia_temp_max,Valencia_temp,Bilbao_temp_max,Seville_temp_min,Madrid_temp_max,Barcelona_temp,Bilbao_temp_min,Bilbao_temp
0,274.254667,269.888000,281.013000,265.938000,265.938000,274.254667,281.013000,269.888000,269.888000,269.338615,274.254667,265.938000,281.013000,269.338615,269.338615
1,274.945000,271.728333,280.561667,266.386667,266.386667,274.945000,280.561667,271.728333,271.728333,270.376000,274.945000,266.386667,280.561667,270.376000,270.376000
2,278.792000,278.008667,281.583667,272.708667,272.708667,278.792000,281.583667,278.008667,278.008667,275.027229,278.792000,272.708667,281.583667,275.027229,275.027229
3,285.394000,284.899552,283.434104,281.895219,281.895219,285.394000,283.434104,284.899552,284.899552,281.135063,285.394000,281.895219,283.434104,281.135063,281.135063
4,285.513719,283.015115,284.213167,280.678437,280.678437,285.513719,284.213167,283.015115,283.015115,282.252063,285.513719,280.678437,284.213167,282.252063,282.252063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,285.076667,282.483333,282.150000,278.483333,279.686667,285.483333,280.483333,282.483333,282.483333,290.150000,284.816667,280.816667,281.276667,285.150000,287.573333
8759,287.673333,291.816667,287.816667,280.150000,282.400000,288.483333,287.150000,291.816667,291.816667,291.150000,287.150000,283.483333,287.483333,286.483333,288.616667
8760,290.070000,290.706333,290.483333,281.150000,283.956667,291.483333,289.150000,290.706333,290.706333,286.816667,289.150000,285.150000,289.816667,283.816667,285.330000
8761,290.070000,290.483333,288.150000,280.816667,282.666667,291.483333,286.816667,290.483333,290.483333,284.150000,289.150000,283.483333,287.523333,278.816667,281.410000


In [502]:
# beyo = pd.read_csv("df_test.csv")
# a_unseen = beyo[['Unnamed: 0', 'Madrid_wind_speed', 'Valencia_wind_speed', 'Seville_humidity',
#           'Madrid_humidity', 'Bilbao_clouds_all', 'Bilbao_wind_speed', 'Madrid_temp_min',
#           'Seville_clouds_all', 'Bilbao_wind_deg', 'Barcelona_wind_speed', 'Barcelona_wind_deg',
#           'Madrid_clouds_all', 'Seville_wind_speed', 'Barcelona_pressure', 'Bilbao_pressure',
#           'Madrid_temp', 'Seville_temp_max', 'Madrid_pressure', 'Barcelona_temp_min',
#           'Valencia_temp_max', 'Valencia_temp', 'Bilbao_temp_max', 'Seville_temp_min',
#           'Seville_temp', 'Valencia_humidity', 'Valencia_temp_min', 'Barcelona_temp_max',
#           'Madrid_temp_max', 'Barcelona_temp', 'Bilbao_temp_min', 'Bilbao_temp']]
# a_unseen

In [503]:
beyo = pd.read_csv("df_test.csv")
# predict_vector = feature_vector_df[['Madrid_wind_speed','Bilbao_rain_1h','Valencia_wind_speed']]
a_unseen = beyo[['Seville_temp', 'Valencia_temp_min', 'Barcelona_temp_max', 'Madrid_temp_min',
          'Madrid_temp', 'Seville_temp_max', 'Barcelona_temp_min',
          'Valencia_temp_max', 'Valencia_temp', 'Bilbao_temp_max', 'Seville_temp_min',
          'Madrid_temp_max', 'Barcelona_temp', 'Bilbao_temp_min', 'Bilbao_temp']]
a_unseen

Unnamed: 0,Seville_temp,Valencia_temp_min,Barcelona_temp_max,Madrid_temp_min,Madrid_temp,Seville_temp_max,Barcelona_temp_min,Valencia_temp_max,Valencia_temp,Bilbao_temp_max,Seville_temp_min,Madrid_temp_max,Barcelona_temp,Bilbao_temp_min,Bilbao_temp
0,283.673333,287.483333,287.816667,279.150000,279.866667,284.483333,286.816667,287.483333,287.483333,285.150000,283.150000,280.816667,287.356667,276.150000,280.380000
1,281.673333,284.150000,284.816667,278.150000,279.193333,282.483333,283.483333,284.150000,284.150000,284.150000,281.150000,280.483333,284.190000,277.816667,281.010000
2,280.613333,282.816667,284.483333,276.150000,276.340000,280.816667,281.816667,282.816667,282.816667,282.150000,280.483333,276.483333,283.150000,276.816667,279.196667
3,279.936667,283.483333,284.150000,274.483333,275.953333,281.150000,282.150000,283.483333,283.483333,284.483333,279.150000,277.150000,283.190000,279.150000,281.740000
4,285.570000,287.150000,287.483333,280.150000,280.686667,287.150000,286.150000,287.150000,287.150000,286.816667,284.483333,281.150000,286.816667,281.816667,284.116667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,280.210000,278.816667,279.816667,271.150000,274.910000,280.816667,276.483333,279.816667,279.323333,276.150000,279.816667,281.483333,278.140000,270.816667,273.210000
2916,287.280000,289.816667,286.483333,280.483333,283.156667,287.483333,285.816667,290.816667,290.323333,278.816667,287.150000,287.816667,286.150000,278.150000,278.443333
2917,290.816667,293.483333,289.483333,286.483333,287.733333,290.816667,288.150000,293.483333,293.483333,285.816667,290.816667,288.816667,288.820000,284.150000,285.073333
2918,288.300000,285.483333,285.816667,282.150000,283.813333,288.816667,283.150000,286.483333,285.976667,282.816667,287.483333,285.150000,284.473333,280.150000,281.626667


In [475]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [511]:
# Train_model.py

# Fetch training data and preprocess for modeling
train = pd.read_csv('df_train.csv')

y_train = train[['load_shortfall_3h']]
X_train = train[['Seville_temp', 'Valencia_temp_min', 'Barcelona_temp_max', 'Madrid_temp_min',
                 'Madrid_temp', 'Seville_temp_max', 'Barcelona_temp_min',
                 'Valencia_temp_max', 'Valencia_temp', 'Bilbao_temp_max', 'Seville_temp_min',
                 'Madrid_temp_max', 'Barcelona_temp', 'Bilbao_temp_min', 'Bilbao_temp']]

# Fit model
lm_regression = LinearRegression(normalize=True)
print ("Training Model...")
lm_regression.fit(X_train, y_train)

# Pickle model for use within our API
save_path = r"C:\Users\ADEBOWALE Tosin\Desktop\tee\Explore\predict\05 - Advanced Regression\Spain_Electricity_Shortfall_Challenge_Team_ZF1\3_AWS_and_Flask_webserver\assets\trained-models\ZF1_LM_model.pkl"
print (f"Training completed. Saving model to: {save_path}")
pickle.dump(lm_regression, open(save_path,'wb'))

Training Model...


LinearRegression(normalize=True)

Training completed. Saving model to: C:\Users\ADEBOWALE Tosin\Desktop\tee\Explore\predict\05 - Advanced Regression\Spain_Electricity_Shortfall_Challenge_Team_ZF1\3_AWS_and_Flask_webserver\assets\trained-models\ZF1_LM_model.pkl


In [506]:
model_load_path = r"C:\Users\ADEBOWALE Tosin\Desktop\tee\Explore\predict\05 - Advanced Regression\Spain_Electricity_Shortfall_Challenge_Team_ZF1\3_AWS_and_Flask_webserver\assets\trained-models\ZF1_LM2_model.pkl"
with open(model_load_path,'rb') as file:
    unpickled_model = pickle.load(file)

In [507]:
y_pred = unpickled_model.predict(a_unseen)
y_pred

array([[11424.78922537],
       [10464.95577335],
       [10511.04440819],
       ...,
       [10514.56866786],
       [ 9926.35608978],
       [ 8848.80128894]])

In [415]:
prep_data = predict_vector.copy()
prediction = unpickled_model.predict(prep_data)
prediction[0].tolist()
prediction

10311.226666666664

array([10311.22666667,  7542.88333333,  7251.13333333, ...,
       13057.42      , 12478.62333333, 12845.00666667])

In [484]:
kag_pred_check = unpickled_model.predict(a_unseen)
kag_pred_check

array([[ 9387.1188303 ],
       [ 9518.3749324 ],
       [10725.00294979],
       ...,
       [11609.82148646],
       [11717.75972494],
       [11514.5447972 ]])

In [339]:
a_unseen = transform_v001("df_test.csv")
kag_pred = RF.predict(a_unseen)

In [340]:
kag_pred

array([10311.22666667,  7542.88333333,  7251.13333333, ...,
       13057.42      , 12478.62333333, 12845.00666667])

In [452]:
save_sub(series=y_pred, time="df_test.csv", file="sub_RF_v002.csv")

## Functions

In [273]:
# function to display all rows in a df
def display_all(df):
    with pd.option_context("display.max_rows", 1000):
        with pd.option_context("display.max_columns", 1000):
            display(df)

In [52]:
# Normalize Data
def Normalize(df):
    scaler = MinMaxScaler()
    x_scaled = scaler.fit_transform(df)
    x_normalize = pd.DataFrame(x_scaled, columns=df.columns)
    return x_normalize

In [35]:
# Fit model
def fit_model(df, y=y_0):
    df_fit = df.copy()
    y_name = ''.join([col for col in y.columns])
    X_name = [col for col in df_fit.columns]

    # Build OLS formula string " y ~ X "

    formula_str = y_name+" ~ "+" + ".join(X_name)

    model = sfa.ols(formula=formula_str, data=df_fit.join(y))
    fitted = model.fit()
    print(fitted.summary())

In [53]:
# Standardize
def standardize(df):
    # Create scaler object
    scaler = StandardScaler()

    # Create scaled version of the predictors (there is no need to scale the response)
    X_scaled = scaler.fit_transform(df)

    # Convert the scaled predictor values into a dataframe
    X_standardise = pd.DataFrame(X_scaled,columns=df.columns)
    return X_standardise

In [239]:
# Train data using Ridge
def ridge_train(x, y):
    X_train, X_test, y_train, y_test = train_test_split(x,
                                                        y["load_shortfall_3h"], 
                                                        test_size=0.2, 
                                                        shuffle=False)
    ridge = Ridge()
    ridge.fit(X_train, y_train)
    b0 = float(ridge.intercept_)
    coeff = pd.DataFrame(ridge.coef_, x.columns, columns=['Coefficient'])

    print("Intercept:", float(b0))
    return coeff

In [264]:
# Fit a basic linear model
def model_accuracy_unseen_pred(x, y, df_unseen, choice=LinearRegression()):
    X_train, X_test, y_train, y_test = train_test_split(x,
                                                        y["load_shortfall_3h"], 
                                                        test_size=0.2, 
                                                        shuffle=False)
    model_object = choice
    model_object.fit(X_train, y_train)
    
    train_pred = model_object.predict(X_train)
    test_pred = model_object.predict(X_test)
    
    # print("Intercept:\t", float(model_object.intercept_))
    # print(pd.DataFrame(model_object.coef_, x.columns, columns=['Coefficient']))
    
    print('', end="\n")
    print('Training RMSE:', np.sqrt(metrics.mean_squared_error(y_train, train_pred)))
    print('Model is, on average,', round(np.sqrt(metrics.mean_squared_error(y_train, train_pred))),'off prediction for training data')
    
    print('', end="\n")
    print('Training R_squared:', r2_score(y_train, train_pred))
    print('Model is good', "{:.0%}".format(r2_score(y_train, train_pred)), 'of the time')

    print('', end="\n")
    print('Testing RMSE:', np.sqrt(metrics.mean_squared_error(y_test, test_pred)))
    print('Model is , on average,', round(np.sqrt(metrics.mean_squared_error(y_test, test_pred))),'off prediction for testing data')
    
    print('', end="\n")
    print('Testing R_squared:', r2_score(y_test, test_pred))
    print('Model is good', "{:.0%}".format(r2_score(y_test, test_pred)), 'of the time')
    print('', end="\n")
    
    unseen_predictions = model_object.predict(df_unseen)
    return unseen_predictions

In [246]:
# Predict Unseen
def predict_unseen(x, y, df_unseen, choice=LinearRegression()):
    X_train = x
    y_train = y["load_shortfall_3h"]
    X_test = df_unseen
    model_object = choice
    model_object.fit(X_train, y_train)
    predictions = model_object.predict(df_unseen)
    return predictions

In [66]:
# Save submission
def save_sub(series, time="df_test.csv", file="sample_submission_load_shortfall.csv"):
    my_series = pd.Series(series)
    load_shortfall_3h = my_series.to_frame()
    load_shortfall_3h = load_shortfall_3h.rename(columns = {0:'load_shortfall_3h'})
    time = pd.read_csv(time)[["time"]]
    sample = time.join(other = load_shortfall_3h)
    sample.to_csv(file, index=False)

## Work

In [487]:
y_0 = pd.read_csv("df_train.csv")[["load_shortfall_3h"]]

y_0.head(1)
y_0[(y_0[y_0.columns] < 0).any(axis=1)].T

Unnamed: 0,load_shortfall_3h
0,6715.666667


Unnamed: 0,8,9,10,13,17,18,21,191,192,230,...,8717,8724,8725,8733,8741,8749,8750,8757,8758,8761
load_shortfall_3h,-1850.333333,-4002.333333,-2893.0,-1317.0,-2679.333333,-1478.0,-2013.0,-1007.333333,-298.0,-2950.666667,...,-238.666667,-3467.0,-6292.666667,-3311.666667,-2451.333333,-4277.0,-1874.333333,-380.666667,-28.333333,-760.0


In [102]:
x_seen = transform_HofY("df_train.csv")
x_seen.head(1)
x_seen.shape

Unnamed: 0,time_hourofyear,Mean_temp_min,Mean_wind_deg1,Mean_humidity,Mean_clouds_all,Mean_weather_id
0,3,272.086456,133.0,71.333333,0.0,800.0


(8763, 6)

In [101]:
x_unseen = transform_HofY("df_test.csv")
x_unseen.head(1)
x_unseen.shape

Unnamed: 0,time_hourofyear,Mean_temp_min,Mean_wind_deg1,Mean_humidity,Mean_clouds_all,Mean_weather_id
0,0,282.55,185.0,68.222222,6.666667,800.25


(2920, 6)

In [204]:
Normalize(x_seen).head(1)

Unnamed: 0,time_hourofyear,Mean_temp_min,Mean_wind_deg1,Mean_humidity,Mean_clouds_all,Mean_weather_id
0,0.000342,0.02766,0.387191,0.661842,0.0,0.988095


In [205]:
standardize(x_seen).head(1)

Unnamed: 0,time_hourofyear,Mean_temp_min,Mean_wind_deg1,Mean_humidity,Mean_clouds_all,Mean_weather_id
0,-1.731947,-2.340125,-0.621239,0.513853,-1.302834,0.744832


In [240]:
ridge_train(x_seen, y_0)

Intercept: -23143.27963783583


Unnamed: 0,Coefficient
time_hourofyear,0.237741
Mean_temp_min,98.086223
Mean_wind_deg1,-10.429098
Mean_humidity,18.171724
Mean_clouds_all,-7.109671
Mean_weather_id,6.626397


In [245]:
model_accuracy_unseen_pred(x_seen, y_0, x_unseen, choice=RandomForestRegressor(n_estimators=100, max_depth=5, random_state=137))

Unnamed: 0,load_shortfall_3h
0,8865.392992
1,5565.331627
2,6295.504918
3,10309.918830
4,5489.051669
...,...
2915,9327.143514
2916,12371.751266
2917,13464.443846
2918,13519.096800


In [247]:
# Get predictions of unseen data

# series_L = predict_unseen(x=x_seen, y=y_0, df_unseen=x_unseen, choice=LinearRegression())
# series_R = predict_unseen(x=x_seen, y=y_0, df_unseen=x_unseen, choice=Ridge())
# series_Lasso = predict_unseen(x=x_seen, y=y_0, df_unseen=x_unseen, choice=Lasso())
# series_RF = predict_unseen(x=x_seen, y=y_0, df_unseen=x_unseen, choice=RandomForestRegressor(random_state=137))
series_RFpar = predict_unseen(x=x_seen, y=y_0, df_unseen=x_unseen,
                              choice=RandomForestRegressor(n_estimators=100, max_depth=5, random_state=137))

**array([ 9839.23550254,  9874.22328293, 10176.95654912, ...,13713.97623095, 14099.32865803, 13610.03186923])**

In [248]:
# View predictions of unseen data 

# series_L
# series_R
# series_Lasso
# series_RF
series_RFpar

array([ 8914.14910466,  5783.71195901,  6852.21524057, ...,
       12821.04906139, 12776.99571039,  7031.37717153])

In [234]:
# Check for negative values

check = series_RFpar
y_check =  pd.Series(check).to_frame().rename(columns = {0:'load_shortfall_3h'})
y_check[(y_check[y_check.columns] < 0).any(axis=1)].T

load_shortfall_3h


In [258]:
# Save in submission format

# save_sub(series=series_RFpar, time="df_test.csv", file="sub_RFpar_HofY_5topmean.csv")