In [424]:
import pandas as pd
import requests as req
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# la = pd.read_csv('los angeles.csv')
# la.rename(columns={'Unnamed: 0': 'Timestamp'}, inplace=True)
# la.set_index('Timestamp', inplace=True)
# la = la.reindex(pd.date_range(la.index.min(), la.index.max(), freq="H"))

In [232]:
ta = pd.read_csv('tallahassee.csv')
ta.set_index('Timestamp', inplace=True)
ta.index = pd.to_datetime(ta.index, format="%d/%m/%Y %H:%M")

In [233]:
def fetch_segmented_data(year, from_month=0, to_month=0):
    if from_month > 0 and to_month > 0:
        return ta[(ta.index.year == year) & (ta.index.from_month >= from_month) & (ta.index.to_month <= to_month)]
    else:
        return ta[(ta.index.year == year)]

In [234]:
def fetch_daily_avg_temperatures(year):
    first_half = pd.read_csv('https://www.ncei.noaa.gov/access/services/data/v1?dataset=daily-summaries&startDate={}-01-01T00:00:00z&endDate={}-05-31&stations=USW00093805'.format(year, year))
    second_half = pd.read_csv('https://www.ncei.noaa.gov/access/services/data/v1?dataset=daily-summaries&startDate={}-06-01T00:00:00z&endDate={}-12-31&stations=USW00093805'.format(year, year))
    relevant_first_half = first_half[['DATE', 'TMAX', 'TMIN']]
    relevant_second_half = second_half[['DATE', 'TMAX', 'TMIN']]
    combined_per_year = pd.concat([relevant_first_half, relevant_second_half])
    combined_per_year['TMAX'] = combined_per_year['TMAX']/10
    combined_per_year['TMIN'] = combined_per_year['TMIN']/10
    combined_per_year['DATE'] = pd.to_datetime(combined_per_year['DATE'])
    return combined_per_year.set_index('DATE')


In [235]:
def fetch_hourly_temperatures(year):
    first_half = pd.read_csv('https://www.ncei.noaa.gov/access/services/data/v1?dataset=normals-hourly&startDate={}-01-01T00:00:00z&endDate={}-05-31&stations=USW00093805'.format(year, year))
    second_half = pd.read_csv('https://www.ncei.noaa.gov/access/services/data/v1?dataset=normals-hourly&startDate={}-06-01T00:00:00z&endDate={}-12-31&stations=USW00093805'.format(year, year))
    relevant_first_half = first_half[['DATE', 'HLY-TEMP-NORMAL', 'HLY-TEMP-10PCTL', 'HLY-TEMP-90PCTL', 'HLY-HIDX-NORMAL', 'HLY-HTDH-NORMAL']]
    relevant_second_half = second_half[['DATE', 'HLY-TEMP-NORMAL', 'HLY-TEMP-10PCTL', 'HLY-TEMP-90PCTL', 'HLY-HIDX-NORMAL', 'HLY-HTDH-NORMAL']]
    combined_per_year = pd.concat([relevant_first_half, relevant_second_half])
    combined_per_year['HLY-TEMP-10PCTL'] = combined_per_year['HLY-TEMP-10PCTL']/10
    combined_per_year['HLY-TEMP-90PCTL'] = combined_per_year['HLY-TEMP-90PCTL']/10
    combined_per_year['DATE'] = str(year) + '-' + combined_per_year['DATE']
    combined_per_year['DATE'] = pd.to_datetime(combined_per_year['DATE'])
    return combined_per_year.set_index('DATE')


In [236]:
ta_2015 = fetch_segmented_data(2015)
ta_2016 = fetch_segmented_data(2016)
ta_2017 = fetch_segmented_data(2017)
ta_2018 = fetch_segmented_data(2018)
ta_2019 = fetch_segmented_data(2019)

In [237]:
temp_daily_2015 = fetch_daily_avg_temperatures(2015)
temp_daily_2016 = fetch_daily_avg_temperatures(2016)
temp_daily_2017 = fetch_daily_avg_temperatures(2017)
temp_daily_2018 = fetch_daily_avg_temperatures(2018)
temp_daily_2019 = fetch_daily_avg_temperatures(2019)

In [238]:
temp_hourly_2015 = fetch_hourly_temperatures(2015)
temp_hourly_2016 = fetch_hourly_temperatures(2016)
temp_hourly_2017 = fetch_hourly_temperatures(2017)
temp_hourly_2018 = fetch_hourly_temperatures(2018)
temp_hourly_2019 = fetch_hourly_temperatures(2019)

In [239]:
# fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 7))
# temp_daily_2015.plot(ax=axes[0,0])
# ta_2015['demand'].plot(ax=axes[0,1], title='TA Demand - 2015')
# temp_hourly_2015[['HLY-TEMP-10PCTL', 'HLY-TEMP-90PCTL']].plot(ax=axes[1,0])
# ta_2015['forecast'].plot(ax=axes[1,1], title='TA Forecast - 2015');

In [240]:
# fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 7))
# temp_daily_2016.plot(ax=axes[0,0])
# ta_2016['demand'].plot(ax=axes[0,1], title='TA Demand - 2016')
# temp_hourly_2016[['HLY-TEMP-10PCTL', 'HLY-TEMP-90PCTL']].plot(ax=axes[1,0])
# ta_2016['forecast'].plot(ax=axes[1,1], title='TA Forecast - 2016');

In [241]:
# fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 7))
# temp_daily_2017.plot(ax=axes[0,0])
# ta_2017['demand'].plot(ax=axes[0,1], title='TA Demand - 2017')
# temp_hourly_2017[['HLY-TEMP-10PCTL', 'HLY-TEMP-90PCTL']].plot(ax=axes[1,0])
# ta_2017['forecast'].plot(ax=axes[1,1], title='TA Forecast - 2017');

In [242]:
# fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 7))
# temp_daily_2018.plot(ax=axes[0,0])
# ta_2018['demand'].plot(ax=axes[0,1], title='LA Demand - 2018')
# temp_hourly_2018[['HLY-TEMP-10PCTL', 'HLY-TEMP-90PCTL']].plot(ax=axes[1,0])
# ta_2018['forecast'].plot(ax=axes[1,1], title='LA Forecast - 2018');

In [243]:
# fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 7))
# temp_daily_2019.plot(ax=axes[0,0])
# ta_2019['demand'].plot(ax=axes[0,1], title='LA Demand - 2019')
# temp_hourly_2019[['HLY-TEMP-10PCTL', 'HLY-TEMP-90PCTL']].plot(ax=axes[1,0])
# ta_2019['forecast'].plot(ax=axes[1,1], title='LA Forecast - 2019');

In [18]:
# fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 7))
# ta_2016[(ta_2016.index.month == 2)]['demand'].plot(ax=axes[0,0], title='LA-2016')
# ta_2017[(ta_2017.index.month == 2)]['demand'].plot(ax=axes[0,1], title='LA-2017')
# ta_2018[(ta_2018.index.month == 2)]['demand'].plot(ax=axes[1,0], title='LA-2018')
# ta_2019[(ta_2019.index.month == 2)]['demand'].plot(ax=axes[1,1], title='LA-2019');

In [244]:
ta['moving_avg_3'] = ta['demand'].shift().rolling(min_periods=3, window=3).mean()

In [245]:
# Null values in ta demand/forecast
ta[ta['forecast'].isnull()]

Unnamed: 0_level_0,demand,forecast,moving_avg_3
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-03-13 02:00:00,,,240.666667
2017-03-12 02:00:00,,,217.0
2018-03-11 02:00:00,,,229.666667
2019-03-10 02:00:00,,,234.666667
2019-10-13 01:00:00,270.0,,319.0
2019-10-13 02:00:00,250.0,,294.0
2019-10-13 03:00:00,236.0,,271.0
2019-10-13 04:00:00,226.0,,252.0
2019-10-13 05:00:00,219.0,,237.333333


In [246]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [247]:
rmse(ta['moving_avg_3'], ta['demand'])

35.89529198871812

In [248]:
rmse(ta['forecast'], ta['demand'])

22.0874896698288

In [249]:
past_3_years = pd.concat([ta_2016, ta_2017, ta_2018])
mvg_avg_3_lag365 = past_3_years.groupby([past_3_years.index.month, past_3_years.index.day, past_3_years.index.hour]).mean()

In [250]:
mvg_avg_3_lag365 = mvg_avg_3_lag365.iloc[0:6846]
mvg_avg_3_lag365.index = ta_2019.index

In [251]:
mvg_avg_3_lag365.head()

Unnamed: 0_level_0,demand,forecast
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01 00:00:00,258.666667,262.333333
2019-01-01 01:00:00,246.333333,243.666667
2019-01-01 02:00:00,239.333333,236.666667
2019-01-01 03:00:00,232.666667,230.0
2019-01-01 04:00:00,230.666667,228.333333


In [252]:
rmse(mvg_avg_3_lag365['demand'], ta_2019['demand'])

53.77335786342343

### OUTLIER ANALYSIS

In [361]:
ta['diff'] = ta['demand'] - ta['forecast']

In [362]:
outliers = ta[(ta['diff'] > 100) | (ta['diff'] < -100)]
outliers#.groupby([outliers.index.day])[['demand', 'forecast', 'diff']]
# outliers.reset_index().sort_values(by='Timestamp').to_csv("ta_outliers.csv")

Unnamed: 0_level_0,demand,forecast,moving_avg_3,diff
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-03-19 14:00:00,534.0,266.0,253.666667,268.0
2016-04-27 17:00:00,540.0,397.0,368.666667,143.0
2016-06-17 17:00:00,386.0,509.0,457.666667,-123.0
2016-06-17 18:00:00,365.0,498.0,417.666667,-133.0
2016-06-17 19:00:00,357.0,479.0,388.333333,-122.0
2016-09-02 02:00:00,83.0,203.0,237.666667,-120.0
2016-09-02 03:00:00,55.0,196.0,164.666667,-141.0
2016-09-02 04:00:00,51.0,196.0,101.666667,-145.0
2016-09-02 05:00:00,49.0,201.0,63.000000,-152.0
2016-09-02 06:00:00,47.0,218.0,51.666667,-171.0


In [363]:
# Exactly equal demand and forecast
len(ta[ta['diff']==0])

1047

In [364]:
# variation when demand is greater than forecast
ta[ta['diff']>0]['diff'].std()

13.645014199849276

In [365]:
# variation when demand is less than forecast
ta[ta['diff']<0]['diff'].std()

28.13463651099432

## REGRESSION ANALYSIS ON TIME SERIES

In [391]:
# Timestamp information as features for regression
ta['year'] = ta.index.year
ta['month'] = ta.index.month
ta['day'] = ta.index.day
ta['hour'] = ta.index.hour

In [392]:
ta.head()

Unnamed: 0_level_0,demand,forecast,moving_avg_3,diff,year,month,day,hour
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-07-01 01:00:00,225.0,274.0,,-49.0,2015,7,1,1
2015-07-01 02:00:00,222.0,255.0,,-33.0,2015,7,1,2
2015-07-01 03:00:00,213.0,243.0,,-30.0,2015,7,1,3
2015-07-01 04:00:00,206.0,237.0,220.0,-31.0,2015,7,1,4
2015-07-01 05:00:00,207.0,238.0,213.666667,-31.0,2015,7,1,5


In [393]:
# Fetch hourly temperatures for only the dates having energy data
temp_hly_total = pd.concat([temp_hourly_2015[temp_hourly_2015.index >= '2015-07-01 01:00:00'], 
                            temp_hourly_2016, temp_hourly_2017, temp_hourly_2018, 
                            temp_hourly_2019[temp_hourly_2019.index  <= '2019-10-13 5:0:0']])

In [394]:
# check start and end dates of both energy and temp data
print('Start date : ', temp_hly_total.index.min(), ta.index.min())
print('End date   : ', temp_hly_total.index.max(), ta.index.max())

Start date :  2015-07-01 01:00:00 2015-07-01 01:00:00
End date   :  2019-10-13 05:00:00 2019-10-13 05:00:00


In [395]:
temp_hly_total[(temp_hly_total.index.year == 2016) & (temp_hly_total.index.month == 2)].tail()

Unnamed: 0_level_0,HLY-TEMP-NORMAL,HLY-TEMP-10PCTL,HLY-TEMP-90PCTL,HLY-HIDX-NORMAL,HLY-HTDH-NORMAL
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-02-28 19:00:00,619,51.1,71.1,619,46
2016-02-28 20:00:00,584,46.9,68.0,584,74
2016-02-28 21:00:00,560,43.0,66.9,560,95
2016-02-28 22:00:00,542,41.0,66.0,542,112
2016-02-28 23:00:00,531,39.0,66.0,531,122


In [396]:
# 2016's data of temperatures for 29th Feb 2019 is missing (leap year) so dropping demand for that day too
ta = ta[ta.index.isin(ta.index.drop(set(ta.index).difference(set(temp_hly_total.index))))]

In [397]:
daylight_hours = pd.read_csv("ta_daylight_hours.csv")
daylight_hours.rename(columns = {'Unnamed: 0': 'Timestamp'}, inplace=True)
daylight_hours.set_index('Timestamp', inplace=True)
daylight_hours.index = pd.to_datetime(daylight_hours.index)

In [398]:
daylight_hours['sunrise'] = pd.to_datetime(daylight_hours['sunrise'], format="%I:%M:%S %p")
daylight_hours['sunset'] = pd.to_datetime(daylight_hours['sunset'], format="%I:%M:%S %p")

In [400]:
daylight_hours['sunrise'] = (daylight_hours['sunrise'] - timedelta(hours=5))
daylight_hours['sunset'] = (daylight_hours['sunset'] - timedelta(hours=5))

In [401]:
# Fetching only sunrise/sunset timings and relevant dates from daylight_hours data
daylight_relevant_hrs = daylight_hours[(daylight_hours.index >= '2015-07-01') & 
                                       (daylight_hours.index <= '2019-10-13')][['sunrise', 'sunset']]

In [402]:
temps_merged_with_ta = ta.merge(temp_hly_total, left_index=True, right_index=True)

In [403]:
complete = temps_merged_with_ta.merge(daylight_relevant_hrs, left_index=True, right_index=True, how='outer')
complete['sunrise'] = complete['sunrise'].fillna(method='ffill')
complete['sunset'] = complete['sunset'].fillna(method='ffill')

In [404]:
is_daylight = ((complete.index.hour >= complete['sunrise'].dt.hour) & (complete.index.hour <= complete['sunset'].dt.hour))

In [405]:
is_daylight = is_daylight.astype(int)

In [406]:
complete['is_daylight'] = is_daylight

In [407]:
complete.head(n=25)

Unnamed: 0_level_0,demand,forecast,moving_avg_3,diff,year,month,day,hour,HLY-TEMP-NORMAL,HLY-TEMP-10PCTL,HLY-TEMP-90PCTL,HLY-HIDX-NORMAL,HLY-HTDH-NORMAL,sunrise,sunset,is_daylight
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2015-07-01 00:00:00,,,,,,,,,,,,,,1900-01-01 05:38:44,1899-12-31 19:43:25,0
2015-07-01 01:00:00,225.0,274.0,,-49.0,2015.0,7.0,1.0,1.0,741.0,71.1,78.1,742.0,-7777.0,1900-01-01 05:38:44,1899-12-31 19:43:25,0
2015-07-01 02:00:00,222.0,255.0,,-33.0,2015.0,7.0,1.0,2.0,735.0,70.0,77.0,736.0,-7777.0,1900-01-01 05:38:44,1899-12-31 19:43:25,0
2015-07-01 03:00:00,213.0,243.0,,-30.0,2015.0,7.0,1.0,3.0,730.0,70.0,77.0,730.0,-7777.0,1900-01-01 05:38:44,1899-12-31 19:43:25,0
2015-07-01 04:00:00,206.0,237.0,220.0,-31.0,2015.0,7.0,1.0,4.0,725.0,69.1,75.9,725.0,-7777.0,1900-01-01 05:38:44,1899-12-31 19:43:25,0
2015-07-01 05:00:00,207.0,238.0,213.666667,-31.0,2015.0,7.0,1.0,5.0,722.0,69.1,75.9,722.0,-7777.0,1900-01-01 05:38:44,1899-12-31 19:43:25,1
2015-07-01 06:00:00,219.0,252.0,208.666667,-33.0,2015.0,7.0,1.0,6.0,721.0,69.1,75.9,721.0,-7777.0,1900-01-01 05:38:44,1899-12-31 19:43:25,1
2015-07-01 07:00:00,245.0,277.0,210.666667,-32.0,2015.0,7.0,1.0,7.0,743.0,71.1,77.0,743.0,0.0,1900-01-01 05:38:44,1899-12-31 19:43:25,1
2015-07-01 08:00:00,266.0,302.0,223.666667,-36.0,2015.0,7.0,1.0,8.0,782.0,75.0,82.0,799.0,0.0,1900-01-01 05:38:44,1899-12-31 19:43:25,1
2015-07-01 09:00:00,292.0,322.0,243.333333,-30.0,2015.0,7.0,1.0,9.0,818.0,77.0,86.0,868.0,0.0,1900-01-01 05:38:44,1899-12-31 19:43:25,1


In [408]:
from datetime import timedelta
date_set = set(temp_hly_total.index[0] + timedelta(x) for x in range((temp_hly_total.index[-1] - temp_hly_total.index[0]).days))
missing = sorted(date_set - set(temp_hly_total.index))
missing

[Timestamp('2016-02-29 01:00:00')]

In [409]:
date_set = set(ta.index[0] + timedelta(x) for x in range((ta.index[-1] - ta.index[0]).days))
missing_ta = sorted(date_set - set(ta.index))
missing_ta

[Timestamp('2016-02-29 01:00:00')]

In [410]:
temp_hly_total.isnull().sum()

HLY-TEMP-NORMAL    0
HLY-TEMP-10PCTL    0
HLY-TEMP-90PCTL    0
HLY-HIDX-NORMAL    0
HLY-HTDH-NORMAL    0
dtype: int64

In [411]:
temp_hly_total.iloc[5830:5840]

Unnamed: 0_level_0,HLY-TEMP-NORMAL,HLY-TEMP-10PCTL,HLY-TEMP-90PCTL,HLY-HIDX-NORMAL,HLY-HTDH-NORMAL
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-02-28 23:00:00,531,39.0,66.0,531,122
2016-03-01 00:00:00,530,39.0,66.0,530,123
2016-03-01 01:00:00,512,37.0,64.9,512,140
2016-03-01 02:00:00,505,36.0,64.9,505,147
2016-03-01 03:00:00,499,35.1,64.9,499,153
2016-03-01 04:00:00,494,34.0,64.9,494,158
2016-03-01 05:00:00,490,34.0,64.9,490,162
2016-03-01 06:00:00,485,33.1,64.9,485,167
2016-03-01 07:00:00,482,32.0,64.9,482,170
2016-03-01 08:00:00,499,35.1,64.9,499,154


In [412]:
lag24 = pd.read_csv("lag24.csv")
lag24.set_index('date', inplace=True)
lag24.drop(['Unnamed: 0', 'demand'], axis=1, inplace=True)

In [413]:
complete= complete.merge(lag24, left_index=True, right_index=True)

In [414]:
complete.head(n=1)

Unnamed: 0,demand,forecast,moving_avg_3,diff,year,month,day,hour,HLY-TEMP-NORMAL,HLY-TEMP-10PCTL,HLY-TEMP-90PCTL,HLY-HIDX-NORMAL,HLY-HTDH-NORMAL,sunrise,sunset,is_daylight,demand24
2015-07-01 01:00:00,225.0,274.0,,-49.0,2015.0,7.0,1.0,1.0,741.0,71.1,78.1,742.0,-7777.0,1900-01-01 05:38:44,1899-12-31 19:43:25,0,307.0


In [415]:
complete.reset_index(inplace=True)
complete.drop(['index', 'sunrise', 'sunset', 'forecast', 
               'HLY-HIDX-NORMAL', 'HLY-HTDH-NORMAL'], axis=1, inplace=True)
complete.drop(0, inplace=True)
complete['demand'].fillna(method='ffill', inplace=True)
complete.dropna(inplace=True)

In [416]:
complete.head(n=1)

Unnamed: 0,demand,moving_avg_3,diff,year,month,day,hour,HLY-TEMP-NORMAL,HLY-TEMP-10PCTL,HLY-TEMP-90PCTL,is_daylight,demand24
3,206.0,220.0,-31.0,2015.0,7.0,1.0,4.0,725.0,69.1,75.9,0,260.0


In [445]:
X = complete.drop('demand', axis=1)
y = complete['demand']

In [446]:
def month_num_to_name(x):
    if x == 1.0:
        return "Jan"
    elif x == 2.0:
        return "Feb"
    elif x == 3.0:
        return "Mar"
    elif x == 4.0:
        return "Apr"
    elif x == 5.0:
        return "May"
    elif x == 6.0:
        return "Jun"
    elif x == 7.0:
        return "Jul"
    elif x == 8.0:
        return "Aug"
    elif x == 9.0:
        return "Sep"
    elif x == 10.0:
        return "Oct"
    elif x == 11.0:
        return "Nov"
    elif x == 12.0:
        return "Dec"

In [447]:
X['day'] = "day" + X['day'].astype(str)
X['hour'] = "hour" + X['hour'].astype(str)
X['month'] = X['month'].apply(lambda x: month_num_to_name(x))

In [448]:
def encoding_cat_features(X, column):
    return X.merge(pd.get_dummies(X[column]),  left_index=True, right_index=True)

In [449]:
year_cat = encoding_cat_features(X, 'year')
month_cat = encoding_cat_features(year_cat, 'month')
day_cat = encoding_cat_features(month_cat, 'day')
X = encoding_cat_features(day_cat, 'hour')

In [451]:
X.drop(['year', 'month', 'day', 'hour', 'diff'],axis=1, inplace=True)

In [452]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [465]:
rgr = LinearRegression()
rgr.fit(X_train, y_train)
y_pred_lr = rgr.predict(X_test)
print("R2 : ", r2_score(y_test, y_pred_lr), "RMSE : ", np.sqrt(mean_squared_error(y_test, y_pred_lr)))

R2 :  0.9553547413645739 RMSE :  17.95682714827267


In [466]:
rgressor = RandomForestRegressor()
rgressor.fit(X_train, y_train)
y_pred_rf = rgressor.predict(X_test)
print("R2 : ", r2_score(y_test, y_pred_rf), "RMSE : ", np.sqrt(mean_squared_error(y_test, y_pred_rf)))



R2 :  0.9711706211371766 RMSE :  14.429775465227971


In [461]:
# x = y_test.tolist()
# for i in range(len(y_pred)):
#     print(x[i], y_pred[i])

In [459]:
# complete.to_csv("data_for_modeling1.0.csv")
# X.to_csv("features_with_dummies1.0.csv")

In [460]:
# from sklearn.svm import OneClassSVM
# clf = OneClassSVM(degree=1, gamma='auto')
# clf.fit([complete['demand']])
# clf.predict([complete['demand']])

In [440]:
correlations = X.corr()

In [441]:
correlations.to_csv("correlations.csv")