In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler 
# http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
from sklearn.decomposition import PCA
# http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

import boto3
import sagemaker.amazon.common as smac

<h2>Kaggle Bike Sharing Demand Dataset Preparation For PCA</h2>
<h4>Use PCA to find new components to replace 'temp','atemp','humidity','windspeed' in both training and test datasets</h4>
<h4>To download the dataset, sign in to Kaggle and download it from: https://www.kaggle.com/c/bike-sharing-demand/data</h4>
<br>
Input Features: ['season', 'holiday', 'workingday', 'weather', 'year', 'month', 'day', 'dayofweek','hour', <b>'pca components'</b>]<br>
Target Feature: [log1p('count')]<br>
PCA Training: ['temp','atemp','humidity','windspeed']<br><br>

Objective: <blockquote>You are provided hourly rental data spanning two years. For this competition, the training set is comprised of the first 19 days of each month, while the test set is the 20th to the end of the month. You must predict the total count of bikes rented during each hour covered by the test set, using only information available prior to the rental period (Ref: Kaggle.com)</blockquote>

In [2]:
# The raw numeric readings ('temp', 'atemp', 'humidity', 'windspeed') are not
# used directly as model features; PCA-generated components replace them.
# Target ('count') is listed first, followed by the input features.
columns = [
    'count',
    'season', 'holiday', 'workingday', 'weather',
    'year', 'month', 'day', 'dayofweek', 'hour',
]

# Columns the PCA is trained on
colums_for_pca = ['temp', 'atemp', 'humidity', 'windspeed']

In [3]:
# Load the training and test sets (presumably already normalized, per the file names).
# Both frames end up with an 'Unnamed: 0' column, visible in the previews below.
df = pd.read_csv('train_normalized.csv')
df_test = pd.read_csv('test_normalized.csv')

In [4]:
# Preview the training frame: target 'count' plus the raw numeric columns
df.head(2)

Unnamed: 0,count,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2.833213,1,0,0,1,-1.333661,-1.092737,0.993213,-1.567754,2011,1,1,5,0
1,3.713572,1,0,0,1,-1.438907,-1.182421,0.941249,-1.567754,2011,1,1,5,1


In [5]:
# Preview the test frame: it has 'datetime' and no 'count' column
df_test.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2011-01-20 00:00:00,1,0,1,1,-1.228414,-1.450292,-0.305883,1.617227,2011,1,20,3,0
1,2011-01-20 01:00:00,1,0,1,1,-1.228414,-1.182421,-0.305883,-1.567754,2011,1,20,3,1


In [6]:
# Find PCA
# A fractional n_components asks PCA to keep as many components as are needed
# to reach that share of the explained variance (here 90%).
pca = PCA(n_components=0.9) # Capture 90% total variation

In [7]:
# Find new components
# Fit on the training frame only; the same fitted model is applied to df_test later.
pca.fit(df[colums_for_pca])

PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [8]:
# No. of PCA Components
# n_components is the value passed to the constructor (the requested variance fraction);
# n_components_ (trailing underscore) is the number of components chosen during fit.
print ('Variance: ', pca.n_components)
print ('No. of components to keep: ', pca.n_components_)

Variance:  0.9
No. of components to keep:  3


In [9]:
def transform_with_pca(pca, df, columns):
    """Replace ``columns`` in ``df`` with their PCA projection, in place.

    Parameters
    ----------
    pca : fitted PCA-like object
        Must provide ``transform`` and ``n_components_``.
    df : pandas.DataFrame
        Frame to modify. The component columns are added to, and ``columns``
        are dropped from, this same object (no copy is made).
    columns : list of str
        Names of the columns passed to ``pca.transform``.

    Returns
    -------
    list of str
        Names of the added columns: 'component_0', 'component_1', ...
    """
    # Plain array so the values are attached by row position below
    transformed_data = np.asarray(pca.transform(df[columns]))

    tcols = ['component_' + str(i) for i in range(pca.n_components_)]
    print ('components:',tcols)

    # Assign by position instead of going through an intermediate DataFrame
    # with a fresh 0..n-1 index: label alignment would misplace rows or fill
    # the new columns with NaN whenever df's index is not the default
    # 0..n-1 (e.g. after filtering or shuffling).
    for i, col in enumerate(tcols):
        df[col] = transformed_data[:, i]

    df.drop(columns, inplace=True, axis=1)

    return tcols

In [10]:
# Replace the raw numeric columns in the training frame (df is modified in place)
# and keep the generated component names for the output column list.
new_cols = transform_with_pca(pca, df, colums_for_pca)

components: ['component_0', 'component_1', 'component_2']


In [11]:
# Apply the PCA fitted on the training data to the test frame (df_test is modified in place)
transform_with_pca(pca, df_test, colums_for_pca)

components: ['component_0', 'component_1', 'component_2']


['component_0', 'component_1', 'component_2']

In [12]:
# Training frame after the transform: raw numeric columns replaced by component_* columns
df.head(2)

Unnamed: 0,count,season,holiday,workingday,weather,year,month,day,dayofweek,hour,component_0,component_1,component_2
0,2.833213,1,0,0,1,2011,1,1,5,0,-1.727078,-1.773606,-0.523238
1,3.713572,1,0,0,1,2011,1,1,5,1,-1.861064,-1.733926,-0.569779


In [13]:
# Test frame after the same transform
df_test.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,year,month,day,dayofweek,hour,component_0,component_1,component_2
0,2011-01-20 00:00:00,1,0,1,1,2011,1,20,3,0,-1.920363,1.412416,0.791086
1,2011-01-20 01:00:00,1,0,1,1,2011,1,20,3,1,-1.629284,-0.856109,-1.439095


In [14]:
# Add the PCA component names to the output column list.
# Names already present are skipped so that re-running this cell does not
# duplicate columns in the CSV files written later.
for col in new_cols:
    if col not in columns:
        columns.append(col)

In [15]:
# Output column order: target first, then input features, then PCA components
columns

['count',
 'season',
 'holiday',
 'workingday',
 'weather',
 'year',
 'month',
 'day',
 'dayofweek',
 'hour',
 'component_0',
 'component_1',
 'component_2']

## Training, Validation and Test Set
### Target Variable as first column followed by input features
### Training, Validation files do not have a column header

In [16]:
# Training = 70% of the data
# Validation = 30% of the data
# Randomize the dataset
# A fixed seed keeps the shuffle, and therefore the split, reproducible.
np.random.seed(5)
# The row labels are used as positions by iloc below.
# NOTE(review): this assumes df still has its default 0..n-1 index - confirm.
# NOTE(review): re-running only this cell reshuffles the already shuffled frame,
# giving a different order than a fresh top-to-bottom run.
l = list(df.index)
np.random.shuffle(l)  # shuffles the list in place
df = df.iloc[l]

In [17]:
rows = df.shape[0]
# Number of leading (shuffled) rows that go to the training file
train = int(.7 * rows)
# Informational only: the validation file is written from df[train:],
# i.e. rows - train rows, which may differ from this truncated value.
test = int(.3 * rows)

In [18]:
# Total rows, training rows, and the truncated 30% figure
rows, train, test

(10886, 7620, 3265)

In [19]:
# Column order used for the training and validation files written below
columns

['count',
 'season',
 'holiday',
 'workingday',
 'weather',
 'year',
 'month',
 'day',
 'dayofweek',
 'hour',
 'component_0',
 'component_1',
 'component_2']

In [20]:
# Write Training Set
# First `train` rows of the shuffled frame, in `columns` order,
# without a header row or index column.
df.iloc[:train].to_csv('bike_train_pca.csv', columns=columns, header=False, index=False)

In [21]:
# Write Validation Set
# Every row after the first `train` rows, in `columns` order,
# without a header row or index column.
df.iloc[train:].to_csv('bike_validation_pca.csv', columns=columns, header=False, index=False)

In [22]:
# Test Data has only input features
# No column list is passed, so every column currently in df_test is written
# (including 'Unnamed: 0' and 'datetime', as seen in the preview above),
# and the header row is kept (pandas default).
df_test.to_csv('bike_test_pca.csv',index=False)

In [23]:
# Write Column List
# One comma-separated line (no trailing newline) naming the columns of the
# headerless training/validation files.
with open('bike_train_column_list_pca.txt', mode='w') as f:
    print(*columns, sep=',', end='', file=f)