### The description of data can be found here: 
http://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption

### Attribute Information:

#### 1.date: Date in format dd/mm/yyyy 
#### 2.time: time in format hh:mm:ss 
#### 3.global_active_power: household global minute-averaged active power (in kilowatt) 
#### 4.global_reactive_power: household global minute-averaged reactive power (in kilowatt) 
#### 5.voltage: minute-averaged voltage (in volt) 
#### 6.global_intensity: household global minute-averaged current intensity (in ampere) 
#### 7.sub_metering_1: energy sub-metering No. 1 (in watt-hour of active energy). It corresponds to the kitchen, containing mainly a dishwasher, an oven and a microwave (hot plates are not electric but gas powered). 
#### 8.sub_metering_2: energy sub-metering No. 2 (in watt-hour of active energy). It corresponds to the laundry room, containing a washing-machine, a tumble-drier, a refrigerator and a light. 
#### 9.sub_metering_3: energy sub-metering No. 3 (in watt-hour of active energy). It corresponds to an electric water-heater and an air-conditioner.


In [1]:

import sys 
import numpy as np # linear algebra
from scipy.stats import randint
import matplotlib.pyplot as plt # this is used for the plot the graph 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv), data manipulation as in SQL
import seaborn as sns # used for plot interactive graph. 
from sklearn.model_selection import train_test_split # to split the data into two parts
from sklearn.model_selection import KFold # use for cross validation
from sklearn.preprocessing import StandardScaler # for normalization
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline # pipeline making
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics # for the check the error and accuracy of the model
from sklearn.metrics import mean_squared_error,r2_score

%matplotlib inline
import matplotlib.pyplot as plt
from tqdm import trange     
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2


### Importing the data 

In [2]:

df = pd.read_csv('household_power_consumption.txt', sep=';', 
                 parse_dates={'dt' : ['Date', 'Time']}, infer_datetime_format=True, 
                 low_memory=False, na_values=['nan','?'], index_col='dt')

In [3]:
df.agg({'Voltage':'mean','Global_intensity':'std'})

Voltage             240.839858
Global_intensity      4.444396
dtype: float64

In [4]:
df.index

DatetimeIndex(['2006-12-16 17:24:00', '2006-12-16 17:25:00',
               '2006-12-16 17:26:00', '2006-12-16 17:27:00',
               '2006-12-16 17:28:00', '2006-12-16 17:29:00',
               '2006-12-16 17:30:00', '2006-12-16 17:31:00',
               '2006-12-16 17:32:00', '2006-12-16 17:33:00',
               ...
               '2010-11-26 20:53:00', '2010-11-26 20:54:00',
               '2010-11-26 20:55:00', '2010-11-26 20:56:00',
               '2010-11-26 20:57:00', '2010-11-26 20:58:00',
               '2010-11-26 20:59:00', '2010-11-26 21:00:00',
               '2010-11-26 21:01:00', '2010-11-26 21:02:00'],
              dtype='datetime64[ns]', name='dt', length=2075259, freq=None)

In [5]:
df.iloc[0:10,:]

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0
2006-12-16 17:29:00,3.52,0.522,235.02,15.0,0.0,2.0,17.0
2006-12-16 17:30:00,3.702,0.52,235.09,15.8,0.0,1.0,17.0
2006-12-16 17:31:00,3.7,0.52,235.22,15.8,0.0,1.0,17.0
2006-12-16 17:32:00,3.668,0.51,233.99,15.8,0.0,1.0,17.0
2006-12-16 17:33:00,3.662,0.51,233.86,15.8,0.0,2.0,16.0


In [6]:
df.describe()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
count,2049280.0,2049280.0,2049280.0,2049280.0,2049280.0,2049280.0,2049280.0
mean,1.091615,0.1237145,240.8399,4.627759,1.121923,1.29852,6.458447
std,1.057294,0.112722,3.239987,4.444396,6.153031,5.822026,8.437154
min,0.076,0.0,223.2,0.2,0.0,0.0,0.0
25%,0.308,0.048,238.99,1.4,0.0,0.0,0.0
50%,0.602,0.1,241.01,2.6,0.0,0.0,1.0
75%,1.528,0.194,242.89,6.4,0.0,1.0,17.0
max,11.122,1.39,254.15,48.4,88.0,80.0,31.0


### mean value imputation for missingness

In [7]:
## finding all columns that have nan:

droping_list_all=[]
for j in range(0,7):
    if not df.iloc[:, j].notnull().all():
        droping_list_all.append(j)        
        #print(df.iloc[:,j].unique())
droping_list_all

[0, 1, 2, 3, 4, 5, 6]

In [8]:
# filling nan with mean in any columns
for j in range(0,7):        
        df.iloc[:,j]=df.iloc[:,j].fillna(df.iloc[:,j].mean())


In [9]:
pd.DataFrame.dropna(df).shape

(2075259, 7)

In [10]:
# another sanity check to make sure that there are not more any nan
df.isnull().sum()

Global_active_power      0
Global_reactive_power    0
Voltage                  0
Global_intensity         0
Sub_metering_1           0
Sub_metering_2           0
Sub_metering_3           0
dtype: int64

In [11]:
df.describe()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
count,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0
mean,1.091615,0.1237145,240.8399,4.627759,1.121923,1.29852,6.458447
std,1.050655,0.1120142,3.219643,4.41649,6.114397,5.78547,8.384178
min,0.076,0.0,223.2,0.2,0.0,0.0,0.0
25%,0.31,0.048,239.02,1.4,0.0,0.0,0.0
50%,0.63,0.102,240.96,2.8,0.0,0.0,1.0
75%,1.52,0.192,242.86,6.4,0.0,1.0,17.0
max,11.122,1.39,254.15,48.4,88.0,80.0,31.0


In [12]:
df['Global_active_power'].resample('Y').mean()

dt
2006-12-31    1.901148
2007-12-31    1.116818
2008-12-31    1.072102
2009-12-31    1.078743
2010-12-31    1.062136
Freq: A-DEC, Name: Global_active_power, dtype: float64

### resample to days

In [13]:
## resampling of data over hour
df_resample = df.resample('D').mean() 
df_resample.shape

(1442, 7)

In [14]:
df_resample.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16,3.053475,0.088187,236.243763,13.082828,0.0,1.378788,12.439394
2006-12-17,2.354486,0.156949,240.087028,9.999028,1.411806,2.907639,9.264583
2006-12-18,1.530435,0.112356,241.231694,6.421667,0.738194,1.820139,9.734722
2006-12-19,1.157079,0.104821,241.999313,4.926389,0.582639,5.279167,4.303472
2006-12-20,1.545658,0.111804,242.308063,6.467361,0.0,1.838889,9.765972


In [15]:
df_resample.describe()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
count,1442.0,1442.0,1442.0,1442.0,1442.0,1442.0,1442.0
mean,1.092609,0.123694,240.8375,4.632038,1.121336,1.298476,6.461578
std,0.416643,0.033785,2.020303,1.725747,1.097471,1.445626,2.541839
min,0.173818,0.057811,231.088229,0.808333,0.0,0.0,0.894444
25%,0.82721,0.099349,240.105012,3.519167,0.489757,0.301562,4.686632
50%,1.084211,0.119222,240.95539,4.57366,0.779861,0.500347,6.458447
75%,1.314143,0.140795,241.859247,5.524132,1.53316,1.88486,8.135764
max,3.314851,0.290162,247.435007,14.028056,7.763279,8.409028,16.488194


### Normalize First

In [16]:
df1=(df_resample-df_resample.mean())/df_resample.std()
df1.describe()


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
count,1442.0,1442.0,1442.0,1442.0,1442.0,1442.0,1442.0
mean,-1.842878e-15,8.672366e-16,-1.340275e-15,6.455e-16,-9.263664e-16,1.21216e-15,-2.917069e-15
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.205221,-1.950062,-4.825648,-2.215681,-1.021746,-0.89821,-2.190199
25%,-0.6369923,-0.7205808,-0.3625636,-0.6448637,-0.5754863,-0.6896066,-0.698292
50%,-0.02015556,-0.1323659,0.05835256,-0.03382788,-0.3111475,-0.5520989,-0.001231669
75%,0.5317127,0.5061772,0.5057391,0.5169318,0.3752475,0.4056264,0.6586513
max,5.33368,4.927265,3.265603,5.444608,6.052044,4.918667,3.94463


### use all previous measurement to predict the next global_active power

In [17]:
dfx=df1.iloc[:-1,:]

In [18]:
dfy=df1.iloc[1:,0]

In [19]:
dfnew=dfx

In [20]:
y=(np.array(dfy)).copy()

In [21]:
dfnew['y']=y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
dfnew.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,y
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2006-12-16,4.706341,-1.050976,-2.273787,4.896888,-1.021746,0.055555,2.351768,3.028674
2006-12-17,3.028674,0.984291,-0.371465,3.109951,0.264671,1.113126,1.102747,1.050841
2006-12-18,1.050841,-0.335611,0.195116,1.037017,-0.349114,0.360856,1.287707,0.154738
2006-12-19,0.154738,-0.55863,0.575068,0.170564,-0.490854,2.753611,-0.849033,1.08738
2006-12-20,1.08738,-0.351932,0.727892,1.063495,-1.021746,0.373827,1.300001,0.242431


In [23]:
dfnew.shape

(1441, 8)

In [24]:
train=dfnew.sample(frac=0.6,random_state=223)
test=dfnew.drop(train.index)

In [25]:
dt=np.r_[train,test]

In [26]:
np.save("energydt",dt)