# Data Processing Notebook

This notebook uses code developed in previous notebooks to combine and clean data from the Seattle Fremont Bridge bike counter and Dark Sky weather. The end result is a single cleaned and encoded Pandas DataFrame for use in regression modeling.

In [198]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
import time
import pickle
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn

### Load Bike Data

In [199]:
# Read the raw counter export and parse its timestamp column.
bike = pd.read_csv("../../data/raw/Fremont_bridge.csv")
bike['Date'] = pd.to_datetime(bike['Date'], format="%m/%d/%Y %I:%M:%S %p")

# Only the combined bridge total is kept; the per-sidewalk columns are dropped.
bike = bike.drop(columns=['Fremont Bridge East Sidewalk', 'Fremont Bridge West Sidewalk'])

# Reduce each timestamp to its calendar date and discard rows after the cutoff day.
bike['Date'] = bike['Date'].dt.date
cutoff_day = datetime.fromisoformat('2019-10-03').date()
bike = bike.loc[~(bike['Date'] > cutoff_day)]

# One row per date, summing every count recorded on that date.
daily_total = bike.groupby('Date', as_index=False).sum()

In [200]:
# Preview the first rows of the per-day totals.
daily_total.head()

Unnamed: 0,Date,Fremont Bridge Total
0,2012-10-03,7042.0
1,2012-10-04,6950.0
2,2012-10-05,6296.0
3,2012-10-06,4012.0
4,2012-10-07,4284.0


### Load Weather Data

In [201]:
# NOTE: pickle.load can execute arbitrary code; only load files this project produced.
with open("../../data/raw/weather.pkl", 'rb') as pkl_in:
    weather = pickle.load(pkl_in)

In [202]:
# List every column present in the raw weather frame.
weather.columns

Index(['apparentTemperatureHigh', 'apparentTemperatureHighTime',
       'apparentTemperatureLow', 'apparentTemperatureLowTime',
       'apparentTemperatureMax', 'apparentTemperatureMaxTime',
       'apparentTemperatureMin', 'apparentTemperatureMinTime', 'cloudCover',
       'dewPoint', 'humidity', 'icon', 'moonPhase', 'precipIntensity',
       'precipIntensityMax', 'precipIntensityMaxTime', 'precipProbability',
       'precipType', 'pressure', 'summary', 'sunriseTime', 'sunsetTime',
       'temperatureHigh', 'temperatureHighTime', 'temperatureLow',
       'temperatureLowTime', 'temperatureMax', 'temperatureMaxTime',
       'temperatureMin', 'temperatureMinTime', 'time', 'uvIndex',
       'uvIndexTime', 'visibility', 'windBearing', 'windGust', 'windGustTime',
       'windSpeed', 'precipAccumulation', 'ozone'],
      dtype='object')

In [203]:
# Names of all columns whose name ends in "Time" or "time".
time_columns = [name for name in weather.columns if name.endswith(('Time', 'time'))]

In [204]:
# Show which columns were picked up by the suffix match.
time_columns

['apparentTemperatureHighTime',
 'apparentTemperatureLowTime',
 'apparentTemperatureMaxTime',
 'apparentTemperatureMinTime',
 'precipIntensityMaxTime',
 'sunriseTime',
 'sunsetTime',
 'temperatureHighTime',
 'temperatureLowTime',
 'temperatureMaxTime',
 'temperatureMinTime',
 'time',
 'uvIndexTime',
 'windGustTime']

In [205]:
# Count missing values in each of the time columns.
weather[time_columns].isna().sum()

apparentTemperatureHighTime      0
apparentTemperatureLowTime       1
apparentTemperatureMaxTime       0
apparentTemperatureMinTime       0
precipIntensityMaxTime         397
sunriseTime                      0
sunsetTime                       0
temperatureHighTime              0
temperatureLowTime               1
temperatureMaxTime               0
temperatureMinTime               0
time                             0
uvIndexTime                      0
windGustTime                     0
dtype: int64

In [206]:
# Look up the day whose low-temperature time is missing. The boolean mask
# yields a Series (one element here), and datetime.fromtimestamp needs a
# scalar, so take the value out with .iloc[0] rather than relying on the
# deprecated implicit Series-to-number conversion.
datetime.fromtimestamp(weather.loc[weather['apparentTemperatureLowTime'].isna(), 'time'].iloc[0])

  datetime.fromtimestamp(weather.loc[weather['apparentTemperatureLowTime'].isna(), 'time'])


datetime.datetime(2018, 10, 29, 0, 0)

In [207]:
#Replace missing low temp time with ~6am, ref: https://www.timeanddate.com/weather/usa/seattle/historic?month=10&year=2018
# Build the epoch value in Seattle's timezone explicitly. time.mktime would
# interpret 06:00 in whatever timezone the kernel's machine happens to use,
# making the filled value depend on where the notebook is run.
low_time = pd.Timestamp('2018-10-29 06:00', tz='America/Los_Angeles').timestamp()
for low_col in ('apparentTemperatureLowTime', 'temperatureLowTime'):
    weather.loc[weather[low_col].isna(), low_col] = low_time

In [208]:
#Drop precipIntensityMaxTime from dataframe
weather = weather.drop(columns='precipIntensityMaxTime')

In [209]:
# Filter instead of list.remove(): remove() raises ValueError if this cell is
# executed again after the name has already been taken out.
time_columns = [name for name in time_columns if name != 'precipIntensityMaxTime']

In [210]:
#convert all time columns from seconds since the epoch to datetime objects
# The conversion names Seattle's timezone explicitly and then strips the tz
# info, giving naive local wall-clock datetimes. datetime.fromtimestamp would
# use the timezone of the machine running the kernel instead, so results would
# shift on a machine outside Pacific time.
for column in time_columns:
    if column in weather.columns:
        weather[column] = (
            pd.to_datetime(weather[column], unit='s', utc=True)
            .dt.tz_convert('America/Los_Angeles')
            .dt.tz_localize(None)
        )

In [211]:
# Inspect the time columns after conversion to datetimes.
weather[time_columns]

Unnamed: 0,apparentTemperatureHighTime,apparentTemperatureLowTime,apparentTemperatureMaxTime,apparentTemperatureMinTime,sunriseTime,sunsetTime,temperatureHighTime,temperatureLowTime,temperatureMaxTime,temperatureMinTime,time,uvIndexTime,windGustTime
0,2012-10-03 15:05:00,2012-10-04 06:55:00,2012-10-03 15:05:00,2012-10-03 06:02:00,2012-10-03 07:13:00,2012-10-03 18:45:00,2012-10-03 15:05:00,2012-10-04 07:46:00,2012-10-03 15:05:00,2012-10-03 07:37:00,2012-10-03,2012-10-03 13:00:00,2012-10-03 14:11:00
1,2012-10-04 15:23:00,2012-10-05 06:56:00,2012-10-04 15:23:00,2012-10-04 06:55:00,2012-10-04 07:15:00,2012-10-04 18:43:00,2012-10-04 15:23:00,2012-10-05 06:56:00,2012-10-04 15:23:00,2012-10-04 07:46:00,2012-10-04,2012-10-04 12:58:00,2012-10-04 15:47:00
2,2012-10-05 15:41:00,2012-10-06 06:53:00,2012-10-05 15:41:00,2012-10-05 06:56:00,2012-10-05 07:16:00,2012-10-05 18:41:00,2012-10-05 15:41:00,2012-10-06 06:53:00,2012-10-05 15:41:00,2012-10-05 06:56:00,2012-10-05,2012-10-05 12:58:00,2012-10-05 16:44:00
3,2012-10-06 14:39:00,2012-10-07 07:46:00,2012-10-06 14:39:00,2012-10-06 06:53:00,2012-10-06 07:18:00,2012-10-06 18:39:00,2012-10-06 14:39:00,2012-10-07 07:46:00,2012-10-06 14:39:00,2012-10-06 06:53:00,2012-10-06,2012-10-06 12:56:00,2012-10-06 16:01:00
4,2012-10-07 15:58:00,2012-10-08 07:39:00,2012-10-07 15:58:00,2012-10-07 07:46:00,2012-10-07 07:19:00,2012-10-07 18:37:00,2012-10-07 15:58:00,2012-10-08 07:39:00,2012-10-07 15:58:00,2012-10-07 07:46:00,2012-10-07,2012-10-07 12:57:00,2012-10-07 14:06:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2552,2019-09-29 16:02:00,2019-09-30 06:50:00,2019-09-29 16:02:00,2019-09-29 06:16:00,2019-09-29 07:07:00,2019-09-29 18:55:00,2019-09-29 16:02:00,2019-09-30 06:51:00,2019-09-29 16:02:00,2019-09-29 03:55:00,2019-09-29,2019-09-29 12:57:00,2019-09-29 15:57:00
2553,2019-09-30 14:57:00,2019-10-01 07:44:00,2019-09-30 14:57:00,2019-09-30 06:50:00,2019-09-30 07:08:00,2019-09-30 18:53:00,2019-09-30 14:57:00,2019-10-01 06:50:00,2019-09-30 14:57:00,2019-09-30 06:51:00,2019-09-30,2019-09-30 12:54:00,2019-09-30 16:05:00
2554,2019-10-01 15:45:00,2019-10-02 05:24:00,2019-10-01 15:45:00,2019-10-01 07:44:00,2019-10-01 07:10:00,2019-10-01 18:51:00,2019-10-01 15:45:00,2019-10-02 06:55:00,2019-10-01 15:45:00,2019-10-01 06:50:00,2019-10-01,2019-10-01 13:00:00,2019-10-01 17:57:00
2555,2019-10-02 15:13:00,2019-10-03 04:04:00,2019-10-02 15:13:00,2019-10-02 05:34:00,2019-10-02 07:11:00,2019-10-02 18:49:00,2019-10-02 15:13:00,2019-10-03 04:04:00,2019-10-02 15:13:00,2019-10-02 06:55:00,2019-10-02,2019-10-02 12:56:00,2019-10-02 16:35:00


In [212]:
# The calendar date of `time` becomes the merge key; the raw column is then dropped.
weather['Date'] = weather['time'].dt.date
weather = weather.drop(columns='time')

In [213]:
# Filter instead of list.remove(): remove() raises ValueError if this cell is
# executed again after 'time' has already been taken out.
time_columns = [name for name in time_columns if name != 'time']

In [214]:
# Keep only the time-of-day part of every remaining time column.
for column in time_columns:
    weather[column] = weather[column].dt.time

### Combine Bike and Weather Data

In [215]:
# Join daily bike totals to the weather rows sharing the same Date
# (merge defaults to an inner join, so unmatched dates are dropped).
df = daily_total.merge(weather, on='Date')
df.head()

Unnamed: 0,Date,Fremont Bridge Total,apparentTemperatureHigh,apparentTemperatureHighTime,apparentTemperatureLow,apparentTemperatureLowTime,apparentTemperatureMax,apparentTemperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,...,temperatureMinTime,uvIndex,uvIndexTime,visibility,windBearing,windGust,windGustTime,windSpeed,precipAccumulation,ozone
0,2012-10-03,7042.0,63.04,15:05:00,46.79,06:55:00,63.04,15:05:00,46.08,06:02:00,...,07:37:00,4.0,13:00:00,10.0,353.0,10.72,14:11:00,4.07,,
1,2012-10-04,6950.0,62.99,15:23:00,45.79,06:56:00,62.99,15:23:00,46.79,06:55:00,...,07:46:00,4.0,12:58:00,10.0,351.0,11.51,15:47:00,3.79,,
2,2012-10-05,6296.0,67.65,15:41:00,47.17,06:53:00,67.65,15:41:00,45.79,06:56:00,...,06:56:00,4.0,12:58:00,10.0,342.0,10.46,16:44:00,3.27,,
3,2012-10-06,4012.0,71.25,14:39:00,47.92,07:46:00,71.25,14:39:00,47.17,06:53:00,...,06:53:00,4.0,12:56:00,9.977,352.0,9.1,16:01:00,2.83,,
4,2012-10-07,4284.0,72.07,15:58:00,46.84,07:39:00,72.07,15:58:00,47.92,07:46:00,...,07:46:00,3.0,12:57:00,9.943,335.0,6.82,14:06:00,2.11,,


In [216]:
#Add Day of Week and Month columns
# weekday() numbers Monday as 0 through Sunday as 6.
df = df.assign(
    DayOfWeek=df['Date'].map(lambda day: day.weekday()),
    Month=df['Date'].map(lambda day: day.month),
)

In [217]:
# Summary statistics for the numeric columns of the merged frame.
df.describe()

Unnamed: 0,Fremont Bridge Total,apparentTemperatureHigh,apparentTemperatureLow,apparentTemperatureMax,apparentTemperatureMin,cloudCover,dewPoint,humidity,moonPhase,precipIntensity,...,temperatureMin,uvIndex,visibility,windBearing,windGust,windSpeed,precipAccumulation,ozone,DayOfWeek,Month
count,2557.0,2557.0,2556.0,2557.0,2557.0,2557.0,2557.0,2557.0,2557.0,2557.0,...,2557.0,2557.0,2557.0,2557.0,2557.0,2557.0,51.0,336.0,2557.0,2557.0
mean,5492.224482,60.459812,47.573521,60.571118,47.233633,0.586742,45.757208,0.760806,0.499769,0.004234,...,47.434415,3.864685,9.430654,212.662495,10.499437,3.190262,0.232353,338.295833,2.999609,6.525616
std,2739.826864,13.28589,9.330478,13.176644,9.414502,0.316249,8.665711,0.108764,0.288219,0.008682,...,8.728461,2.477692,1.388616,96.145979,4.836399,1.622343,0.636027,42.068403,1.999707,3.449024
min,76.0,20.52,14.01,24.72,13.83,0.0,4.61,0.31,0.0,0.0,...,19.86,1.0,1.239,0.0,2.0,0.49,0.0,246.9,0.0,1.0
25%,3572.0,50.62,40.7075,50.78,40.26,0.33,40.32,0.69,0.25,0.0,...,41.17,1.0,9.735,163.0,7.27,2.02,0.0,307.425,1.0,4.0
50%,5146.0,59.53,47.88,59.58,47.37,0.66,46.49,0.77,0.5,0.0006,...,47.56,4.0,10.0,188.0,9.58,2.75,0.01,333.15,3.0,7.0
75%,7656.0,70.96,55.2825,70.98,55.11,0.87,52.44,0.85,0.75,0.0043,...,54.62,6.0,10.0,320.0,12.52,4.06,0.07,361.025,5.0,10.0
max,12856.0,94.89,67.79,94.89,67.79,1.0,62.98,0.97,1.0,0.0925,...,67.3,9.0,10.0,359.0,42.36,11.32,2.44,460.1,6.0,12.0


### Missing Data

In [218]:
# Count missing values per column of the merged frame.
df.isna().sum()

Date                              0
Fremont Bridge Total              0
apparentTemperatureHigh           0
apparentTemperatureHighTime       0
apparentTemperatureLow            1
apparentTemperatureLowTime        0
apparentTemperatureMax            0
apparentTemperatureMaxTime        0
apparentTemperatureMin            0
apparentTemperatureMinTime        0
cloudCover                        0
dewPoint                          0
humidity                          0
icon                              2
moonPhase                         0
precipIntensity                   0
precipIntensityMax                0
precipProbability                 1
precipType                      790
pressure                          0
summary                          12
sunriseTime                       0
sunsetTime                        0
temperatureHigh                   0
temperatureHighTime               0
temperatureLow                    1
temperatureLowTime                0
temperatureMax              

---

Rename "precipAccumulation" to "snowAccumulation" (every row that has a value is a snow day, as shown below) and replace its NaNs with 0.

In [219]:
# Precipitation type on every row that reports an accumulation.
df.loc[df['precipAccumulation'].notna(), 'precipType']

89      snow
91      snow
100     snow
101     snow
102     snow
103     snow
105     snow
106     snow
109     snow
111     snow
152     snow
414     snow
427     snow
770     snow
788     snow
789     snow
790     snow
818     snow
820     snow
882     snow
1149    snow
1150    snow
1152    snow
1183    snow
1184    snow
1185    snow
1186    snow
1526    snow
1527    snow
1534    snow
1552    snow
1561    snow
1905    snow
1909    snow
1965    snow
1966    snow
1967    snow
1968    snow
1969    snow
2315    snow
2316    snow
2317    snow
2318    snow
2319    snow
2320    snow
2321    snow
2322    snow
2338    snow
2343    snow
2344    snow
2348    snow
Name: precipType, dtype: object

In [220]:
df = df.rename(columns={'precipAccumulation': 'snowAccumulation'})

In [221]:
# Assign the filled column back. Calling fillna(inplace=True) on df[col] acts
# on a column selection, which pandas deprecates and which leaves df unchanged
# when Copy-on-Write is enabled.
df['snowAccumulation'] = df['snowAccumulation'].fillna(0)

Drop 'ozone' (incomplete: only 336 of the 2,557 rows have a value)

In [222]:
df = df.drop(columns='ozone')

Drop 'summary' (not useful for modeling)

In [223]:
df = df.drop(columns='summary')

Replace precipType NaNs with "none"

In [224]:
# Distinct precipitation types, including the missing marker.
df['precipType'].unique()

array(['rain', 'snow', nan], dtype=object)

In [225]:
# Precipitation intensity on the rows that have no precipitation type.
df.loc[df['precipType'].isna(), 'precipIntensity']

108     0.0
122     0.0
139     0.0
146     0.0
157     0.0
       ... 
2207    0.0
2208    0.0
2209    0.0
2210    0.0
2212    0.0
Name: precipIntensity, Length: 790, dtype: float64

In [226]:
# Assign the filled column back. Calling fillna(inplace=True) on df[col] acts
# on a column selection, which pandas deprecates and which leaves df unchanged
# when Copy-on-Write is enabled.
df['precipType'] = df['precipType'].fillna('none')

Drop 'icon' column

In [227]:
df = df.drop(columns='icon')

Replace missing 'precipProbability' value with median value for rain days.

In [228]:
# Context columns for the row(s) lacking a precipitation probability.
df.loc[df['precipProbability'].isna(), ['Date', 'precipType', 'precipIntensity', 'cloudCover']]

Unnamed: 0,Date,precipType,precipIntensity,cloudCover
2218,2018-10-30,rain,0.0015,0.83


In [229]:
# Median precipitation probability over rain days.
df.loc[df['precipType'] == 'rain', 'precipProbability'].median()

0.72

In [230]:
# Compute the rain-day median here instead of pasting the number shown by an
# earlier cell, so the fill value stays correct if the input data changes.
# median() skips NaN, so the missing entry does not affect the result.
rain_day_median = df.loc[df['precipType'] == 'rain', 'precipProbability'].median()
df.loc[df['precipProbability'].isna(), 'precipProbability'] = rain_day_median

Replace missing low temps with low temp for day (43F).

In [231]:
# Row(s) where the apparent low temperature is missing.
df.loc[df['apparentTemperatureLow'].isna()]

Unnamed: 0,Date,Fremont Bridge Total,apparentTemperatureHigh,apparentTemperatureHighTime,apparentTemperatureLow,apparentTemperatureLowTime,apparentTemperatureMax,apparentTemperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,...,uvIndex,uvIndexTime,visibility,windBearing,windGust,windGustTime,windSpeed,snowAccumulation,DayOfWeek,Month
2217,2018-10-29,6346.0,51.38,11:00:00,,06:00:00,52.49,19:00:00,44.83,04:29:00,...,1.0,11:00:00,4.324,158.0,10.36,00:02:00,4.81,0.0,0,10


In [232]:
# Fill the missing apparent low with 43.
df['apparentTemperatureLow'] = df['apparentTemperatureLow'].fillna(43)

In [233]:
# Row(s) where the low temperature is missing.
df.loc[df['temperatureLow'].isna()]

Unnamed: 0,Date,Fremont Bridge Total,apparentTemperatureHigh,apparentTemperatureHighTime,apparentTemperatureLow,apparentTemperatureLowTime,apparentTemperatureMax,apparentTemperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,...,uvIndex,uvIndexTime,visibility,windBearing,windGust,windGustTime,windSpeed,snowAccumulation,DayOfWeek,Month
2217,2018-10-29,6346.0,51.38,11:00:00,43.0,06:00:00,52.49,19:00:00,44.83,04:29:00,...,1.0,11:00:00,4.324,158.0,10.36,00:02:00,4.81,0.0,0,10


In [234]:
# Fill the missing low with 43.
df['temperatureLow'] = df['temperatureLow'].fillna(43)

Save cleaned dataframe

In [235]:
# Persist the cleaned (not yet encoded) frame.
with open("../../data/processed/data.pkl", 'wb') as pkl_out:
    pickle.dump(df, pkl_out)

### One-hot encoding for qualitative features

In [125]:
# Column dtypes; the object columns are the candidates for encoding or conversion.
df.dtypes

Date                            object
Fremont Bridge Total           float64
apparentTemperatureHigh        float64
apparentTemperatureHighTime     object
apparentTemperatureLow         float64
apparentTemperatureLowTime      object
apparentTemperatureMax         float64
apparentTemperatureMaxTime      object
apparentTemperatureMin         float64
apparentTemperatureMinTime      object
cloudCover                     float64
dewPoint                       float64
humidity                       float64
moonPhase                      float64
precipIntensity                float64
precipIntensityMax             float64
precipProbability              float64
precipType                      object
pressure                       float64
sunriseTime                     object
sunsetTime                      object
temperatureHigh                float64
temperatureHighTime             object
temperatureLow                 float64
temperatureLowTime              object
temperatureMax           

Need to one-hot encode 'precipType', 'DayOfWeek', and 'Month'

In [166]:
# The three categorical features to be one-hot encoded.
categorical_features = ['precipType', 'DayOfWeek', 'Month']
df_for_ohe = df.loc[:, categorical_features]

In [168]:
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name. Try the current spelling first and fall back for older installs, whose
# constructor rejects the unknown keyword with TypeError.
try:
    ohe = OneHotEncoder(sparse_output = False)
except TypeError:
    ohe = OneHotEncoder(sparse = False)

In [169]:
# Learn the category levels, then encode the same rows.
ohe.fit(df_for_ohe)
transformed = ohe.transform(df_for_ohe)

In [170]:
# Category levels learned for each encoded feature, in input-column order.
ohe.categories_

[array(['none', 'rain', 'snow'], dtype=object),
 array([0, 1, 2, 3, 4, 5, 6]),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])]

In [175]:
# Column names for the encoded matrix: the feature at position 1 gets a "DoW"
# prefix, position 2 a "Month" prefix, and any other position keeps its raw level.
prefix_by_position = {1: 'DoW', 2: 'Month'}
cats = []
for position, levels in enumerate(ohe.categories_):
    prefix = prefix_by_position.get(position)
    cats.extend(level if prefix is None else prefix + str(level) for level in levels)

In [177]:
# Reuse df's index so the column-wise concat that follows lines the encoded
# rows up with their source rows. With a default RangeIndex, any non-default
# index on df would silently misalign rows and introduce NaNs.
df_transformed = pd.DataFrame(transformed, columns = cats, index = df.index)

In [178]:
# Append the encoded columns to the right of the cleaned frame.
ohe_df = pd.concat((df, df_transformed), axis='columns')

In [179]:
# Preview the frame with the one-hot columns appended.
ohe_df.head()

Unnamed: 0,Date,Fremont Bridge Total,apparentTemperatureHigh,apparentTemperatureHighTime,apparentTemperatureLow,apparentTemperatureLowTime,apparentTemperatureMax,apparentTemperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,...,Month3,Month4,Month5,Month6,Month7,Month8,Month9,Month10,Month11,Month12
0,2012-10-03,7042.0,63.04,15:05:00,46.79,06:55:00,63.04,15:05:00,46.08,06:02:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2012-10-04,6950.0,62.99,15:23:00,45.79,06:56:00,62.99,15:23:00,46.79,06:55:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2012-10-05,6296.0,67.65,15:41:00,47.17,06:53:00,67.65,15:41:00,45.79,06:56:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2012-10-06,4012.0,71.25,14:39:00,47.92,07:46:00,71.25,14:39:00,47.17,06:53:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2012-10-07,4284.0,72.07,15:58:00,46.84,07:39:00,72.07,15:58:00,47.92,07:46:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


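Need to drop one of the one-hot encoded columns from each feature to avoid perfect collinearity among the dummy columns; the original categorical columns are dropped as well, since the encoded columns replace them.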

In [181]:
# One dummy column per encoded feature, plus the original categorical columns.
baseline_dummies = ['none', 'DoW0', 'Month1']
encoded_sources = ['DayOfWeek', 'Month', 'precipType']
ohe_df = ohe_df.drop(columns=baseline_dummies + encoded_sources)

### Change times to minutes past midnight

In [187]:
# Check that the stored value exposes a `minute` attribute.
ohe_df.at[0, 'apparentTemperatureHighTime'].minute

5

In [188]:
def time_conversion(x):
    """Return the number of whole minutes between midnight and ``x``.

    ``x`` is any object exposing ``hour`` and ``minute`` attributes, such as
    a ``datetime.time``. Seconds are not taken into account.
    """
    return 60 * x.hour + x.minute

In [189]:
# Spot-check the helper on the first row.
time_conversion(ohe_df.at[0, 'apparentTemperatureHighTime'])

905

In [193]:
# Report, for each time column, whether it is still present in the encoded frame.
present_columns = set(ohe_df.columns)
for each in time_columns:
    print(f"{each} in dataframe" if each in present_columns else f"{each} not in dataframe")

apparentTemperatureHighTime in dataframe
apparentTemperatureLowTime in dataframe
apparentTemperatureMaxTime in dataframe
apparentTemperatureMinTime in dataframe
sunriseTime in dataframe
sunsetTime in dataframe
temperatureHighTime in dataframe
temperatureLowTime in dataframe
temperatureMaxTime in dataframe
temperatureMinTime in dataframe
uvIndexTime in dataframe
windGustTime in dataframe


In [194]:
# Replace each time-of-day value with its minutes-past-midnight equivalent.
for column in time_columns:
    ohe_df[column] = ohe_df[column].map(time_conversion)

In [195]:
# Preview the frame after the time columns were converted to minutes.
ohe_df.head()

Unnamed: 0,Date,Fremont Bridge Total,apparentTemperatureHigh,apparentTemperatureHighTime,apparentTemperatureLow,apparentTemperatureLowTime,apparentTemperatureMax,apparentTemperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,...,Month3,Month4,Month5,Month6,Month7,Month8,Month9,Month10,Month11,Month12
0,2012-10-03,7042.0,63.04,905,46.79,415,63.04,905,46.08,362,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2012-10-04,6950.0,62.99,923,45.79,416,62.99,923,46.79,415,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2012-10-05,6296.0,67.65,941,47.17,413,67.65,941,45.79,416,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2012-10-06,4012.0,71.25,879,47.92,466,71.25,879,47.17,413,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2012-10-07,4284.0,72.07,958,46.84,459,72.07,958,47.92,466,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [196]:
# Summary statistics of the encoded frame; the time columns now appear as numeric.
ohe_df.describe()

Unnamed: 0,Fremont Bridge Total,apparentTemperatureHigh,apparentTemperatureHighTime,apparentTemperatureLow,apparentTemperatureLowTime,apparentTemperatureMax,apparentTemperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,cloudCover,...,Month3,Month4,Month5,Month6,Month7,Month8,Month9,Month10,Month11,Month12
count,2557.0,2557.0,2557.0,2557.0,2557.0,2557.0,2557.0,2557.0,2557.0,2557.0,...,2557.0,2557.0,2557.0,2557.0,2557.0,2557.0,2557.0,2557.0,2557.0,2557.0
mean,5492.224482,60.459812,931.396167,47.571732,407.369183,60.571118,913.687524,47.233633,497.570591,0.586742,...,0.084865,0.082127,0.084865,0.082127,0.084865,0.084865,0.082127,0.085256,0.082127,0.084865
std,2739.826864,13.28589,113.93632,9.329091,254.494495,13.176644,207.277893,9.414502,378.829469,0.316249,...,0.278735,0.274613,0.278735,0.274613,0.278735,0.278735,0.274613,0.279317,0.274613,0.278735
min,76.0,20.52,420.0,14.01,0.0,24.72,0.0,13.83,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3572.0,50.62,879.0,40.71,309.0,50.78,875.0,40.26,314.0,0.33,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5146.0,59.53,953.0,47.88,367.0,59.58,954.0,47.37,381.0,0.66,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7656.0,70.96,1008.0,55.28,429.0,70.98,1010.0,55.11,467.0,0.87,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,12856.0,94.89,1140.0,67.79,1439.0,94.89,1380.0,67.79,1380.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Save encoded data frame

In [197]:
# Persist the encoded frame.
with open("../../data/processed/encoded_data.pkl", 'wb') as pkl_out:
    pickle.dump(ohe_df, pkl_out)