# Machine Learning Nanodegree
## Capstone Project: Electric Vehicle Detection 

### Setup

In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from IPython.display import display 

# Pretty display for notebooks
%matplotlib inline

### Data Clean Up

In [2]:
ev_train_raw = pd.read_csv('EV_train.csv')
ev_train_labels_raw = pd.read_csv('EV_train_labels.csv')

In [3]:
print(ev_train_raw.shape)
print(ev_train_labels_raw.shape)
display(ev_train_raw.head())
display(ev_train_labels_raw.head())

(1590, 2881)
(1590, 2881)


Unnamed: 0,House ID,Interval_1,Interval_2,Interval_3,Interval_4,Interval_5,Interval_6,Interval_7,Interval_8,Interval_9,...,Interval_2871,Interval_2872,Interval_2873,Interval_2874,Interval_2875,Interval_2876,Interval_2877,Interval_2878,Interval_2879,Interval_2880
0,11655099,0.95,0.826,0.361,0.238,0.342,0.233,0.351,0.194,0.292,...,0.664,0.783,0.601,0.639,0.417,0.439,0.226,0.19,0.71,0.728
1,11633257,0.353,0.327,0.358,0.292,0.285,0.304,0.361,0.342,0.355,...,0.536,0.558,0.622,0.634,0.513,0.421,0.273,0.296,0.291,0.289
2,11651552,0.15,0.181,0.15,0.15,0.131,0.125,0.088,0.106,0.094,...,2.125,0.881,0.481,1.194,0.138,0.119,0.038,0.088,0.056,0.113
3,11636092,2.088,2.075,2.121,2.098,2.046,2.081,1.847,0.42,0.399,...,0.62,0.487,0.563,0.419,0.379,0.359,0.347,0.325,0.33,0.34
4,11647239,1.416,1.25,1.27,1.258,1.239,1.753105,4.609256,4.619256,4.075151,...,1.596,1.667,1.569,1.664,1.58,1.635,1.568,1.565,1.575,1.571


Unnamed: 0,House ID,Interval_1,Interval_2,Interval_3,Interval_4,Interval_5,Interval_6,Interval_7,Interval_8,Interval_9,...,Interval_2871,Interval_2872,Interval_2873,Interval_2874,Interval_2875,Interval_2876,Interval_2877,Interval_2878,Interval_2879,Interval_2880
0,11655099,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,11633257,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11651552,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,11636092,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11647239,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [4]:
ev_train_pivot = ev_train_raw.copy()
ev_train_pivot = pd.wide_to_long(ev_train_pivot, stubnames='Interval_', i='House ID', j='kWh').reset_index()
ev_train_pivot.columns = ['House ID', 'Interval', 'kWh']
ev_train_pivot['Interval'] = pd.to_numeric(ev_train_pivot['Interval'], downcast='integer')
ev_train_pivot = ev_train_pivot.sort_values(['House ID', 'Interval']).reset_index(drop=True)

In [5]:
ev_train_labels_pivot = ev_train_labels_raw.copy()
ev_train_labels_pivot = pd.wide_to_long(ev_train_labels_pivot, stubnames='Interval_', i='House ID', j='label').reset_index()
ev_train_labels_pivot.columns = ['House ID', 'Interval', 'Label']
ev_train_labels_pivot['Interval'] = pd.to_numeric(ev_train_labels_pivot['Interval'], downcast='integer')
ev_train_labels_pivot = ev_train_labels_pivot.sort_values(['House ID', 'Interval']).reset_index(drop=True)

In [6]:
ev_train_pivot_label = pd.concat([ev_train_pivot, ev_train_labels_pivot['Label']], axis=1)

In [7]:
##Checking Label Concatenation (pd.merge was failing due to memory limits?)

import random

display(ev_train_pivot_label[ev_train_pivot_label['House ID']==11647239].head(9))

all_raw_house_ids  = ev_train_raw['House ID'].unique()

random_house_id = random.choice(all_raw_house_ids)
random_interval = random.randint(1,2880)


lcr1 = ev_train_pivot_label['House ID']==random_house_id
lcc1 = ev_train_pivot_label['Interval']==random_interval
display(ev_train_pivot_label[lcr1&lcc1])

lcr2 = ev_train_raw.index[ev_train_raw['House ID']==random_house_id].values[0]
raw_kwh = ev_train_raw.iloc[lcr2,random_interval]
raw_label = ev_train_labels_raw.iloc[lcr2,random_interval]

rand_dict = {
    "House ID": random_house_id,
    "Interval": random_interval,
    "kWh": raw_kwh,
    "Label": raw_label 
}
print(rand_dict)


Unnamed: 0,House ID,Interval,kWh,Label
3173760,11647239,1,1.416,0
3173761,11647239,2,1.25,0
3173762,11647239,3,1.27,0
3173763,11647239,4,1.258,0
3173764,11647239,5,1.239,0
3173765,11647239,6,1.753105,1
3173766,11647239,7,4.609256,1
3173767,11647239,8,4.619256,1
3173768,11647239,9,4.075151,1


Unnamed: 0,House ID,Interval,kWh,Label
3293729,11647861,1890,2.843,0


{'House ID': 11647861, 'Interval': 1890, 'kWh': 2.843, 'Label': 0}


In [8]:
ev_train_raw.iloc[lcr2,random_interval-1:random_interval+1]

Interval_1889    3.785
Interval_1890    2.843
Name: 473, dtype: float64

In [9]:
print(ev_train_pivot.shape)
print(2*24*60*1590)

temp_dim_dict = {
    "Day": np.tile(np.repeat(np.arange(1,61),48),1590),
    "Hour": np.tile(np.tile(np.repeat(np.arange(1,25),2),60),1590),
    "Half Hour": np.tile(np.tile(np.arange(1,49),60),1590)
}

temp_dim_df = pd.DataFrame.from_dict(temp_dim_dict)

(4579200, 3)
4579200


In [10]:
ev_train_piv_lab_tmp = pd.concat([ev_train_pivot_label, temp_dim_df], axis=1)
ev_train_piv_lab_tmp_cols = ['House ID','Day','Hour','Half Hour','Interval', 'kWh', 'Label']
ev_train_piv_lab_tmp = ev_train_piv_lab_tmp[ev_train_piv_lab_tmp_cols] 
ev_train_piv_lab_tmp.head()

Unnamed: 0,House ID,Day,Hour,Half Hour,Interval,kWh,Label
0,11628280,1,1,1,1,1.114,0
1,11628280,1,1,2,2,0.845,0
2,11628280,1,2,3,3,0.463,0
3,11628280,1,2,4,4,0.453,0
4,11628280,1,3,5,5,0.61,0


In [11]:
##Checking Temporal Concatenation (pd.merge was failing due to memory limits?)

check_aggs =['max', 'min','mean','count']

check_dict = {
    'Day' : check_aggs, 
    'Hour': check_aggs,
    'Half Hour': check_aggs,
}

temp_merge_check_df = ev_train_piv_lab_tmp.groupby('House ID').agg(check_dict)

unique_records = len(temp_merge_check_df.drop_duplicates())
total_records = len(temp_merge_check_df)

assert unique_records == 1, "Problem with temporal concatenation"

display(temp_merge_check_df.head())

Unnamed: 0_level_0,Day,Day,Day,Day,Hour,Hour,Hour,Hour,Half Hour,Half Hour,Half Hour,Half Hour
Unnamed: 0_level_1,max,min,mean,count,max,min,mean,count,max,min,mean,count
House ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
11628280,60,1,30.5,2880,24,1,12.5,2880,48,1,24.5,2880
11628291,60,1,30.5,2880,24,1,12.5,2880,48,1,24.5,2880
11628301,60,1,30.5,2880,24,1,12.5,2880,48,1,24.5,2880
11628319,60,1,30.5,2880,24,1,12.5,2880,48,1,24.5,2880
11628335,60,1,30.5,2880,24,1,12.5,2880,48,1,24.5,2880


In [12]:
display(ev_train_piv_lab_tmp.iloc[45:50,])
display(ev_train_piv_lab_tmp[ev_train_piv_lab_tmp['House ID']==11647239].head(9))

Unnamed: 0,House ID,Day,Hour,Half Hour,Interval,kWh,Label
45,11628280,1,23,46,46,1.146,0
46,11628280,1,24,47,47,1.13,0
47,11628280,1,24,48,48,1.17,0
48,11628280,2,1,1,49,1.239,0
49,11628280,2,1,2,50,0.952,0


Unnamed: 0,House ID,Day,Hour,Half Hour,Interval,kWh,Label
3173760,11647239,1,1,1,1,1.416,0
3173761,11647239,1,1,2,2,1.25,0
3173762,11647239,1,2,3,3,1.27,0
3173763,11647239,1,2,4,4,1.258,0
3173764,11647239,1,3,5,5,1.239,0
3173765,11647239,1,3,6,6,1.753105,1
3173766,11647239,1,4,7,7,4.609256,1
3173767,11647239,1,4,8,8,4.619256,1
3173768,11647239,1,5,9,9,4.075151,1


### Validation Split

In [31]:
from sklearn.model_selection import train_test_split

all_raw_house_ids

h_train, h_test = train_test_split(all_raw_house_ids, test_size=0.25, random_state=42)

print("Trainig Houses:{0}, Validation Houses:{1}, Total Houses:{2}".format(
    len(h_train), 
    len(h_test), 
    len(h_train)+len(h_test))
     )

Trainig Houses:1192, Validation Houses:398, Total Houses:1590


In [46]:
X_train_records = ev_train_piv_lab_tmp['House ID'].isin(h_train)

#training
X_train = ev_train_piv_lab_tmp[X_train_records]
y_train = X_train['Label']
X_train.drop(['Label'], axis=1, inplace=True)
display(X_train.head())
display(y_train.head())

#validation
X_test = ev_train_piv_lab_tmp[~X_train_records]
y_test = X_test['Label']
X_test.drop(['Label'], axis=1, inplace=True)
display(X_test.head())
display(y_test.head())

print(X_train.shape)
print(len(h_train)*2880)

print(X_test.shape)
print(len(h_test)*2880)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,House ID,Day,Hour,Half Hour,Interval,kWh
0,11628280,1,1,1,1,1.114
1,11628280,1,1,2,2,0.845
2,11628280,1,2,3,3,0.463
3,11628280,1,2,4,4,0.453
4,11628280,1,3,5,5,0.61


0    0
1    0
2    0
3    0
4    0
Name: Label, dtype: int64

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Unnamed: 0,House ID,Day,Hour,Half Hour,Interval,kWh
8640,11628319,1,1,1,1,0.712
8641,11628319,1,1,2,2,0.763
8642,11628319,1,2,3,3,0.709
8643,11628319,1,2,4,4,0.771
8644,11628319,1,3,5,5,0.729


8640    0
8641    0
8642    0
8643    0
8644    0
Name: Label, dtype: int64

(3432960, 6)
3432960
(1146240, 6)
1146240


### Feature Engineering