### A description of the dataset can be found here:
http://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption

### Attribute Information:

#### 1.date: Date in format dd/mm/yyyy 
#### 2.time: time in format hh:mm:ss 
#### 3.global_active_power: household global minute-averaged active power (in kilowatt) 
#### 4.global_reactive_power: household global minute-averaged reactive power (in kilowatt) 
#### 5.voltage: minute-averaged voltage (in volt) 
#### 6.global_intensity: household global minute-averaged current intensity (in ampere) 
#### 7.sub_metering_1: energy sub-metering No. 1 (in watt-hour of active energy). It corresponds to the kitchen, containing mainly a dishwasher, an oven and a microwave (hot plates are not electric but gas powered). 
#### 8.sub_metering_2: energy sub-metering No. 2 (in watt-hour of active energy). It corresponds to the laundry room, containing a washing-machine, a tumble-drier, a refrigerator and a light. 
#### 9.sub_metering_3: energy sub-metering No. 3 (in watt-hour of active energy). It corresponds to an electric water-heater and an air-conditioner.


In [None]:

import sys 
import numpy as np # linear algebra
from scipy.stats import randint
import matplotlib.pyplot as plt # this is used for the plot the graph 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv), data manipulation as in SQL
import seaborn as sns # used for plot interactive graph. 
from sklearn.model_selection import train_test_split # to split the data into two parts
from sklearn.model_selection import KFold # use for cross validation
from sklearn.preprocessing import StandardScaler # for normalization
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline # pipeline making
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics # for the check the error and accuracy of the model
from sklearn.metrics import mean_squared_error,r2_score

%matplotlib inline
import matplotlib.pyplot as plt
from tqdm import trange     
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2


### Importing the data 

In [None]:

# Load the raw semicolon-separated readings; both 'nan' and '?' mark missing values.
df = pd.read_csv('household_power_consumption.txt', sep=';',
                 low_memory=False, na_values=['nan', '?'])

# Build the timestamp index explicitly. The dates are day-first (dd/mm/yyyy),
# so spelling out the format avoids relying on format inference
# (`infer_datetime_format` and dict-style `parse_dates` are deprecated in
# pandas 2.x) and rules out a silent day/month swap.
df['dt'] = pd.to_datetime(df.pop('Date') + ' ' + df.pop('Time'),
                          format='%d/%m/%Y %H:%M:%S')
df = df.set_index('dt')

In [None]:
# Quick check: mean of Voltage and standard deviation of Global_intensity.
df.agg({'Voltage':'mean','Global_intensity':'std'})

In [None]:
# Display the 'dt' index of the loaded frame.
df.index

In [None]:
# Preview the first ten rows (all columns).
df.head(10)

In [None]:
# Summary statistics of the raw data, before missing values are imputed.
df.describe()

### Mean-value imputation of missing values

In [None]:
## Find all columns that contain at least one NaN.
# Collects positional column indices. Every column of df is scanned rather
# than a hard-coded count of 7, so the check still covers the whole frame
# if the set of columns changes.
droping_list_all = [
    col_pos for col_pos in range(df.shape[1])
    if df.iloc[:, col_pos].isnull().any()
]
droping_list_all

In [None]:
# Replace every NaN with the mean of its own column.
# df.mean() yields one mean per column and fillna aligns it by column label,
# so all columns are covered without a hard-coded range(0, 7) loop.
df = df.fillna(df.mean())


In [None]:
# Shape after dropping any row that still holds a NaN; compare with df.shape
# to confirm the imputation left nothing behind.
df.dropna().shape

In [None]:
# Second sanity check: per-column count of remaining NaN values.
df.isna().sum()

In [None]:
# Summary statistics again, now that missing values have been filled.
df.describe()

In [None]:
# Mean Global_active_power per yearly ('Y') resampling bin.
df['Global_active_power'].resample('Y').mean()

### resample to days

In [None]:
## Resample the data to daily ('D') bins, taking the mean of each column per day.
df_resample = df.resample('D').mean() 
# Number of daily rows and columns after resampling.
df_resample.shape

In [None]:
# Preview the first daily-mean rows.
df_resample.head()

In [None]:
# Summary statistics of the daily means.
df_resample.describe()

### Normalize First

In [None]:
# Z-score every column: subtract the column mean, then divide by the column
# standard deviation.
# NOTE(review): these statistics are computed over the whole daily series,
# i.e. before the train/test split performed further down.
df1 = df_resample.sub(df_resample.mean()).div(df_resample.std())

In [None]:
# Summary statistics of the normalized daily data.
df1.describe()

### Use all of the previous day's measurements to predict the next day's global_active_power

In [None]:
# Preview the normalized daily data.
df1.head()

In [None]:
# Features: every row except the last one, which has no following row to predict.
dfx = df1.iloc[:-1]

In [None]:
# Targets: the first column, shifted one row forward (rows 1..end), so that
# target i lines up positionally with feature row i.
dfy = df1.iloc[:, 0].iloc[1:]

In [None]:
# Work on an independent copy. dfx is a slice of df1, and adding the 'y'
# column to a plain alias of that slice would also modify dfx and can raise
# pandas' SettingWithCopyWarning.
dfnew = dfx.copy()

In [None]:
# Detach the targets from their datetime index as a standalone NumPy array,
# so they can be attached to the feature rows by position.
y = np.array(dfy, copy=True)

In [None]:
# Attach the next-row target as column 'y', assigned by position.
# NOTE(review): the target is stored multiplied by 2; nothing visible here
# explains the factor — confirm it is intentional before relying on 'y'.
dfnew['y']=2*y

In [None]:
# Preview the feature rows together with the new 'y' column.
dfnew.head()

In [None]:
# Rows x columns of the supervised table (the columns now include 'y').
dfnew.shape

In [None]:
# Random 60/40 split: draw 60% of the rows with a fixed seed for the training
# set; the rows whose index labels were not drawn form the test set.
train = dfnew.sample(frac=0.6, random_state=223)
test = dfnew.drop(index=train.index)

In [None]:
# Split the 'y' column off each frame (pop removes it in place) and keep it
# as a NumPy array; train/test are left holding features only.
trainy, testy = (np.array(frame.pop('y')) for frame in (train, test))

In [None]:
# Shape of the test features after 'y' has been removed.
test.shape

In [None]:
# Convert both feature frames to plain NumPy arrays.
train, test = np.array(train), np.array(test)

In [None]:
# Shapes of the feature and target arrays, in the order
# train, trainy, test, testy.
tuple(arr.shape for arr in (train, trainy, test, testy))

In [None]:
# Append each target vector as the last column of its feature matrix.
Train = np.column_stack((train, trainy))
Test = np.column_stack((test, testy))


In [None]:
# Persist the combined feature+target arrays with np.save under the names
# 'Train' and 'Test'.
np.save('Train',Train)
np.save('Test',Test)
