# Machine Learning Nanodegree
## Capstone Project: Electric Vehicle Detection 

# Introduction

### Setup

In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from IPython.display import display, Image 
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier


# Pretty display for notebooks
%matplotlib inline

# Data Clean Up

### Import

In [2]:
ev_train_raw = pd.read_csv('EV_train.csv')
ev_train_labels_raw = pd.read_csv('EV_train_labels.csv')

In [3]:
print(ev_train_raw.shape)
print(ev_train_labels_raw.shape)
display(ev_train_raw.head())
display(ev_train_labels_raw.head())
print(ev_train_raw.isnull().values.any())  # Null values in training dataset?
print(ev_train_labels_raw.isnull().values.any())  #Null values in training labels?

(1590, 2881)
(1590, 2881)


Unnamed: 0,House ID,Interval_1,Interval_2,Interval_3,Interval_4,Interval_5,Interval_6,Interval_7,Interval_8,Interval_9,...,Interval_2871,Interval_2872,Interval_2873,Interval_2874,Interval_2875,Interval_2876,Interval_2877,Interval_2878,Interval_2879,Interval_2880
0,11655099,0.95,0.826,0.361,0.238,0.342,0.233,0.351,0.194,0.292,...,0.664,0.783,0.601,0.639,0.417,0.439,0.226,0.19,0.71,0.728
1,11633257,0.353,0.327,0.358,0.292,0.285,0.304,0.361,0.342,0.355,...,0.536,0.558,0.622,0.634,0.513,0.421,0.273,0.296,0.291,0.289
2,11651552,0.15,0.181,0.15,0.15,0.131,0.125,0.088,0.106,0.094,...,2.125,0.881,0.481,1.194,0.138,0.119,0.038,0.088,0.056,0.113
3,11636092,2.088,2.075,2.121,2.098,2.046,2.081,1.847,0.42,0.399,...,0.62,0.487,0.563,0.419,0.379,0.359,0.347,0.325,0.33,0.34
4,11647239,1.416,1.25,1.27,1.258,1.239,1.753105,4.609256,4.619256,4.075151,...,1.596,1.667,1.569,1.664,1.58,1.635,1.568,1.565,1.575,1.571


Unnamed: 0,House ID,Interval_1,Interval_2,Interval_3,Interval_4,Interval_5,Interval_6,Interval_7,Interval_8,Interval_9,...,Interval_2871,Interval_2872,Interval_2873,Interval_2874,Interval_2875,Interval_2876,Interval_2877,Interval_2878,Interval_2879,Interval_2880
0,11655099,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,11633257,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11651552,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,11636092,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11647239,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


True
False


### Quantify Missign Values

In [4]:
# Missing value frequency by interval
ev_train_raw.isnull().sum(axis=0)[ev_train_raw.isnull().sum(axis=0)>0].value_counts()

1    288
4     48
3     48
2     48
dtype: int64

In [5]:
# Missing value frequency by house
ev_train_raw.isnull().sum(axis=1)[ev_train_raw.isnull().sum(axis=1)>0].value_counts()

432    1
48     1
144    1
96     1
dtype: int64

### Add Temporal Dimensions

In [6]:
ev_train_pivot = ev_train_raw.copy()
ev_train_pivot = pd.wide_to_long(ev_train_pivot, stubnames='Interval_', i='House ID', j='kWh').reset_index()
ev_train_pivot.columns = ['House ID', 'Interval', 'kWh']
ev_train_pivot['Interval'] = pd.to_numeric(ev_train_pivot['Interval'], downcast='integer')
ev_train_pivot = ev_train_pivot.sort_values(['House ID', 'Interval']).reset_index(drop=True)

In [7]:
ev_train_labels_pivot = ev_train_labels_raw.copy()
ev_train_labels_pivot = pd.wide_to_long(ev_train_labels_pivot, stubnames='Interval_', i='House ID', j='label').reset_index()
ev_train_labels_pivot.columns = ['House ID', 'Interval', 'Label']
ev_train_labels_pivot['Interval'] = pd.to_numeric(ev_train_labels_pivot['Interval'], downcast='integer')
ev_train_labels_pivot = ev_train_labels_pivot.sort_values(['House ID', 'Interval']).reset_index(drop=True)

In [8]:
ev_train_pivot_label = pd.concat([ev_train_pivot, ev_train_labels_pivot['Label']], axis=1)

In [9]:
##Checking Label Concatenation (pd.merge was failing due to memory limits?)

import random

display(ev_train_pivot_label[ev_train_pivot_label['House ID']==11647239].head(9))

all_raw_house_ids  = ev_train_raw['House ID'].unique()

random_house_id = random.choice(all_raw_house_ids)
random_interval = random.randint(1,2880)


lcr1 = ev_train_pivot_label['House ID']==random_house_id
lcc1 = ev_train_pivot_label['Interval']==random_interval
display(ev_train_pivot_label[lcr1&lcc1])

lcr2 = ev_train_raw.index[ev_train_raw['House ID']==random_house_id].values[0]
raw_kwh = ev_train_raw.iloc[lcr2,random_interval]
raw_label = ev_train_labels_raw.iloc[lcr2,random_interval]

rand_dict = {
    "House ID": random_house_id,
    "Interval": random_interval,
    "kWh": raw_kwh,
    "Label": raw_label 
}
print(rand_dict)


Unnamed: 0,House ID,Interval,kWh,Label
3173760,11647239,1,1.416,0
3173761,11647239,2,1.25,0
3173762,11647239,3,1.27,0
3173763,11647239,4,1.258,0
3173764,11647239,5,1.239,0
3173765,11647239,6,1.753105,1
3173766,11647239,7,4.609256,1
3173767,11647239,8,4.619256,1
3173768,11647239,9,4.075151,1


Unnamed: 0,House ID,Interval,kWh,Label
2161022,11640914,1023,0.269,0


{'House ID': 11640914, 'Interval': 1023, 'kWh': 0.26899999999999996, 'Label': 0}


In [10]:
ev_train_raw.iloc[lcr2,random_interval-1:random_interval+1]

Interval_1022    0.212
Interval_1023    0.269
Name: 403, dtype: float64

In [11]:
print(ev_train_pivot.shape)
print(2*24*60*1590)

temp_dim_dict = {
    "Day": np.tile(np.repeat(np.arange(1,61),48),1590),
    "Hour": np.tile(np.tile(np.repeat(np.arange(1,25),2),60),1590),
    "Half Hour": np.tile(np.tile(np.arange(1,49),60),1590)
}

temp_dim_df = pd.DataFrame.from_dict(temp_dim_dict)

(4579200, 3)
4579200


In [12]:
ev_train_piv_lab_tmp = pd.concat([ev_train_pivot_label, temp_dim_df], axis=1)
ev_train_piv_lab_tmp_cols = ['House ID','Day','Hour','Half Hour','Interval', 'kWh', 'Label']
ev_train_piv_lab_tmp = ev_train_piv_lab_tmp[ev_train_piv_lab_tmp_cols] 
ev_train_piv_lab_tmp.head()

Unnamed: 0,House ID,Day,Hour,Half Hour,Interval,kWh,Label
0,11628280,1,1,1,1,1.114,0
1,11628280,1,1,2,2,0.845,0
2,11628280,1,2,3,3,0.463,0
3,11628280,1,2,4,4,0.453,0
4,11628280,1,3,5,5,0.61,0


In [13]:
# Checking Temporal Concatenation (pd.merge was failing due to memory limits?)

check_aggs =['max', 'min','mean','count']

check_dict = {
    'Day' : check_aggs, 
    'Hour': check_aggs,
    'Half Hour': check_aggs,
}

temp_merge_check_df = ev_train_piv_lab_tmp.groupby('House ID').agg(check_dict)

unique_records = len(temp_merge_check_df.drop_duplicates())
total_records = len(temp_merge_check_df)

assert unique_records == 1, "Problem with temporal concatenation"

display(temp_merge_check_df.head())

Unnamed: 0_level_0,Day,Day,Day,Day,Hour,Hour,Hour,Hour,Half Hour,Half Hour,Half Hour,Half Hour
Unnamed: 0_level_1,max,min,mean,count,max,min,mean,count,max,min,mean,count
House ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
11628280,60,1,30.5,2880,24,1,12.5,2880,48,1,24.5,2880
11628291,60,1,30.5,2880,24,1,12.5,2880,48,1,24.5,2880
11628301,60,1,30.5,2880,24,1,12.5,2880,48,1,24.5,2880
11628319,60,1,30.5,2880,24,1,12.5,2880,48,1,24.5,2880
11628335,60,1,30.5,2880,24,1,12.5,2880,48,1,24.5,2880


In [14]:
display(ev_train_piv_lab_tmp.iloc[45:50,])
display(ev_train_piv_lab_tmp[ev_train_piv_lab_tmp['House ID']==11647239].head(9))

Unnamed: 0,House ID,Day,Hour,Half Hour,Interval,kWh,Label
45,11628280,1,23,46,46,1.146,0
46,11628280,1,24,47,47,1.13,0
47,11628280,1,24,48,48,1.17,0
48,11628280,2,1,1,49,1.239,0
49,11628280,2,1,2,50,0.952,0


Unnamed: 0,House ID,Day,Hour,Half Hour,Interval,kWh,Label
3173760,11647239,1,1,1,1,1.416,0
3173761,11647239,1,1,2,2,1.25,0
3173762,11647239,1,2,3,3,1.27,0
3173763,11647239,1,2,4,4,1.258,0
3173764,11647239,1,3,5,5,1.239,0
3173765,11647239,1,3,6,6,1.753105,1
3173766,11647239,1,4,7,7,4.609256,1
3173767,11647239,1,4,8,8,4.619256,1
3173768,11647239,1,5,9,9,4.075151,1


### Fill in Missing Values

In [15]:
all_missing_rows = ev_train_piv_lab_tmp['kWh'].isnull()
display(ev_train_piv_lab_tmp[all_missing_rows].head())

Unnamed: 0,House ID,Day,Hour,Half Hour,Interval,kWh,Label
757296,11632700,58,1,1,2737,,0
757297,11632700,58,1,2,2738,,0
757298,11632700,58,2,3,2739,,0
757299,11632700,58,2,4,2740,,0
757300,11632700,58,3,5,2741,,0


In [16]:
house_hh_mean_df = ev_train_piv_lab_tmp.groupby(['House ID','Half Hour']).agg({'kWh':'mean'}).reset_index()

In [None]:
house_hh_mean_df.head()

Unnamed: 0,House ID,Half Hour,kWh
0,11628280,1,1.03495
1,11628280,2,0.990733
2,11628280,3,0.904383
3,11628280,4,0.940583
4,11628280,5,0.95835


In [None]:
ev_train_piv_lab_tmp_filled = ev_train_piv_lab_tmp.copy()

for idx, row in ev_train_piv_lab_tmp_filled[all_missing_rows].iterrows():

    missing_interval = row['Interval']
    missing_house = row['House ID']
    missing_half_hour = row['Half Hour']
    
    house_rm = house_hh_mean_df['House ID'] == missing_house
    half_hour_rm = house_hh_mean_df['Half Hour'] == missing_half_hour
    house_half_hour_average = house_hh_mean_df[house_rm&half_hour_rm]['kWh'].values[0]
    
    house_rf = ev_train_piv_lab_tmp_filled['House ID'] == missing_house
    interval_rf = ev_train_piv_lab_tmp_filled['Interval'] == missing_interval
    ev_train_piv_lab_tmp_filled['kWh'][house_rf&interval_rf] = house_half_hour_average

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [None]:
display(ev_train_piv_lab_tmp.iloc[757292:757299,:])
display(ev_train_piv_lab_tmp_filled.iloc[757292:757299,:])

In [None]:
print(ev_train_piv_lab_tmp_filled.shape)
print(sum(ev_train_piv_lab_tmp_filled['Label']))
print(sum(ev_train_piv_lab_tmp_filled['Label'])/ev_train_piv_lab_tmp_filled.shape[0])

# House Classification

### Transformation

In [None]:
house_data_summary = ev_train_piv_lab_tmp_filled.groupby(['House ID', 'Half Hour']
                                                ).agg({'kWh':['mean',np.std]}
                                                     ).pivot_table(values='kWh',
                                                                   index=['House ID'],
                                                                   columns='Half Hour').reset_index()

In [None]:
mean_cols = ['u_{}'.format(n) for n in np.arange(1,49)]
sd_cols = ['s_{}'.format(n) for n in np.arange(1,49)]
hds_cols = ['House ID'] + mean_cols + sd_cols
house_data_summary.columns = hds_cols

In [None]:
pct_cols = ['p_{}'.format(n) for n in np.arange(1,49)]
house_data_summary[pct_cols] = house_data_summary[mean_cols].div(house_data_summary[mean_cols].sum(axis=1),axis=0)

In [None]:
house_data_labels = ev_train_piv_lab_tmp_filled.groupby(['House ID']).agg({'Label':sum}).reset_index()
house_data_labels['House Label'] = [1 if x > 0 else 0 for x in house_data_labels['Label']]
house_data_labels.drop(['Label'], axis=1, inplace=True)
house_data_df = house_data_summary.merge(house_data_labels, how='left', on='House ID')

In [None]:
X_house = house_data_df.copy()
y_house = X_house['House Label']
X_house.drop(['House Label'], axis=1, inplace=True)
display(X_house.head())
display(y_house.head())

### House Training split

In [None]:
from sklearn.model_selection import train_test_split

X_train_house, X_valid_house, y_train_house, y_valid_house = train_test_split(X_house,
                                                                            y_house,
                                                                            test_size=0.25,
                                                                            random_state=42)

### F1 Score for Houses

In [None]:
zero_zero_h = np.tile([0,0],int(len(y_valid_house)/2))
zero_one_h = np.tile([0,1],int(len(y_valid_house)/2))
one_one_h = np.tile([1,1],int(len(y_valid_house)/2))

In [None]:
print(f1_score(y_valid_house, zero_zero_h))
print(f1_score(y_valid_house, zero_one_h))
print(f1_score(y_valid_house, one_one_h))

### K-Means Clustering

In [None]:
k_values = np.arange(2,20)

centers_and_plot_dfs = []

for k in k_values:
    
    kmeans_model = KMeans(n_clusters=k, random_state=42).fit(X_train_house[pct_cols].as_matrix())
    
    k_centers = kmeans_model.cluster_centers_
    
    k_col_name = "k_{}".format(k)
    
    X_train_house[k_col_name] = kmeans_model.labels_
    
    k_cols = pct_cols.copy()
    
    k_cols.insert(0, k_col_name)
    
    print(X_train_house[k_col_name].value_counts())
    
    k_cluster_mean_df = X_train_house.reset_index()[k_cols].groupby(k_col_name).mean()
    
    k_plot_df = k_cluster_mean_df.transpose()
    
    k_tuple = (k_centers, k_plot_df)
    
    centers_and_plot_dfs.append(k_tuple) 

In [None]:
for c, df in centers_and_plot_dfs:
    df.plot(xticks=np.arange(1,49), figsize=(12,6))

In [None]:
count_test = X_train_house.copy()
count_test['Label'] = y_train_house

In [None]:
for k in k_values:
    k_col_name = "k_{}".format(k)
    count_test_percent = count_test.groupby(k_col_name)['Label'].sum()/count_test.groupby(k_col_name)['Label'].count()
    print(k_col_name, (max(count_test_percent) / min(count_test_percent)))

In [None]:
k_list = np.arange(2,20).tolist()
k_list.remove(9)
columns_to_remove = ["k_{}".format(k) for k in k_list]
X_train_house.drop(columns_to_remove, axis=1, inplace=True)

In [None]:
X_train_house.iloc[:, 1:145].head()

### Apply Predict k-means cluster based on k-nearest neighbors

In [None]:
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train_house.iloc[:, 1:145], X_train_house.iloc[:,145]) 
X_valid_house['k_9'] = neigh.predict(X_valid_house.iloc[:, 1:145])

### House Prediction

In [None]:
h_clf1 = GradientBoostingClassifier(random_state = 42)
h_clf1.fit(X_train_house, y_train_house)

In [None]:
h_clf1.feature_importances_

In [None]:
y_train_house_pred = h_clf1.predict(X_train_house)

In [None]:
f1_score(y_train_house, y_train_house_pred)

In [None]:
confusion_matrix(y_train_house, y_train_house_pred)

In [None]:
y_valid_house_pred = h_clf1.predict(X_valid_house)

In [None]:
f1_score(y_valid_house, y_valid_house_pred)

In [None]:
confusion_matrix(y_valid_house, y_valid_house_pred)

# Time Classification

In [None]:
ev_train_piv_lab_tmp_filled_tmp_transformed = ev_train_piv_lab_tmp_filled.copy()

In [None]:
print(y_train_house.sum())
print(y_valid_house_pred.sum())

In [None]:
h_time_train = X_train_house['House ID'][y_train_house>0].values
h_time_valid = X_valid_house['House ID'][y_valid_house_pred>0].values

In [None]:
X_train_bool = ev_train_piv_lab_tmp_filled_tmp_transformed['House ID'].isin(h_time_train)
X_valid_bool = ev_train_piv_lab_tmp_filled_tmp_transformed['House ID'].isin(h_time_valid)

In [None]:
X_train_time = ev_train_piv_lab_tmp_filled_tmp_transformed[X_train_bool]
y_train_time = X_train_time['Label']
X_train_time.drop(['Label'], axis=1, inplace=True)
display(X_train_time.head())
display(y_train_time.head())

In [None]:
X_valid_time = ev_train_piv_lab_tmp_filled_tmp_transformed[X_valid_bool]
y_valid_time = X_valid_time['Label']
X_valid_time.drop(['Label'], axis=1, inplace=True)
display(X_valid_time.head())
display(y_valid_time.head())

In [None]:
print(X_train_time.shape[0]/2880)
print(X_valid_time.shape[0]/2880)