# About

Test to see if we can build a better pipeline. It should handle imputing missing or bad data, extracting attributes, encoding categorical or ordinal features, etc.



In [128]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

%reload_ext autoreload
%autoreload 2
import src.features.build_features as bf

%matplotlib inline
import matplotlib.pyplot as plt

In [129]:
# Read the raw train and test sets straight from their zipped CSV files.
train_pd, test_pd = (
    pd.read_csv(csv_path, compression="zip")
    for csv_path in ("../data/raw/train.csv.zip", "../data/raw/test.csv.zip")
)

Using `shuffle_split_data`:

1. Start with `train_pd` or `test_pd`.
2. Shuffle the rows.
3. Split into data (and labels).

Using `prep_data`:

4. Extract basic features from the data with `SFCCTransformer`.
5. Split the data into numeric and categorical features.
6. From the numeric features, scrub bad values, such as those in X and Y.
7. Min-max scale the numeric features.
8. One-hot encode the categorical features.
9. Combine everything back together (feature union).

In [130]:
# Feature names grouped by type; each group is meant to go through its own
# preprocessing pipeline.

# Numeric features: coordinates, elapsed-time deltas, and sin / cos pairs of
# the cyclical calendar fields.
feat_nums = [
    "X",
    "Y",
    "hour_delta",
    "day_delta",
    "week_delta",
    "month_delta",
    "year_delta",
    "hour_of_day_sin",
    "hour_of_day_cos",
    "day_of_week_sin",
    "day_of_week_cos",
    "day_of_month_sin",
    "day_of_month_cos",
    "day_of_year_sin",
    "day_of_year_cos",
    "week_of_year_sin",
    "week_of_year_cos",
    "month_of_year_sin",
    "month_of_year_cos",
    "quarter_of_year_sin",
    "quarter_of_year_cos",
]

# Categorical features.
# Currently disabled candidates: hour_of_day, day_of_week, day_of_month,
# day_of_year, week_of_year, month_of_year, quarter_of_year.
feat_cat = [
    "year",
    "DayOfWeek",
    "PdDistrict",
]

# Binary flag features.
feat_binary = [
    "is_weekend",
    "is_holiday",
    "is_latenight",
]

# Free-text features.
feat_text = [
    "Address",
]

In [115]:
# NOTE: commented-out draft of prep_data, kept for reference only; the cells
# below call bf.prep_data from src.features.build_features instead.
# def prep_data(data, feat_nums, feat_cat, feat_binary, feat_text):
    
#     # extracts some new basic attributes from the existing attributes
#     sfcc = bf.SFCCTransformer()
#     pipe = Pipeline([
#         ("transformer", sfcc)
#     ])
#     data = pipe.transform(data)
    
#     # splits into numeric, categorical, text dataframes, so that we can feed them through different pipelines
    
#     # feeds numeric features into pipeline that has
#     # SimpleImputer (median), to fill in any missing values (esp the X and Y)
#     # and MinMaxScaler so that they will have similar scale to our other features
#     # NOTE(review): fit_transform below re-fits the imputer and scaler on whatever
#     # `data` is passed in, so train and test calls are fitted independently.
#     imputer = SimpleImputer(missing_values = np.nan, strategy = "median")
#     scaler = MinMaxScaler()
#     pipe_num = Pipeline([
#         ("imputer", imputer),
#         ("scaler", scaler)
#     ])
#     data_num = data[feat_nums]
#     data_num_out = pipe_num.fit_transform(data_num)
#     data_num_out = pd.DataFrame(data_num_out, columns = data_num.columns)
    
#     # feeds categorical features into pipeline, which has
#     # OneHotEncoder, which will turn the categorical features into 1 or 0 per level
#     # NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
#     # -- confirm against the installed version.
#     one_hot = OneHotEncoder(sparse = False)
#     pipe_cat = Pipeline([
#         ("encoder", one_hot)
#     ])
#     data_cat = data[feat_cat]
#     data_cat_out = pipe_cat.fit_transform(data_cat)
#     data_cat_cols = np.concatenate(one_hot.categories_).ravel().tolist()
#     data_cat_out = pd.DataFrame(data_cat_out, columns = data_cat_cols)
    
#     # don't need to do anything to prepare the binary features
#     data_binary_out = data[feat_binary]
    
#     # feeds text features into pipeline, which has
#     # TODO find out how to use count vectorizer here
# #     feat_text

#     # NOTE(review): data_num_out / data_cat_out get a fresh default index while
#     # data_binary_out keeps `data`'s index; concat(axis = 1) aligns on index, so
#     # this relies on `data` having a default 0..n-1 index -- confirm.
#     result = pd.concat([data_num_out, data_cat_out, data_binary_out], axis = 1, sort = False)
    
#     return result

In [131]:
# Note, we don't need a dev set because we will be using kfold cross validation
train_data, train_labels = bf.shuffle_split_data(train_pd)

# Prepare the features with bf.prep_data, using the feature-name lists defined above.
train_data = bf.prep_data(train_data, feat_nums, feat_cat, feat_binary, feat_text)

# Sanity-check the prepared frame: shape, per-column dtypes / null counts, summary stats.
print(train_data.shape)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print() (which would emit a stray "None" line).
train_data.info()
print(train_data.describe())



(878049, 54)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 878049 entries, 0 to 878048
Data columns (total 54 columns):
X                      878049 non-null float64
Y                      878049 non-null float64
hour_delta             878049 non-null float64
day_delta              878049 non-null float64
week_delta             878049 non-null float64
month_delta            878049 non-null float64
year_delta             878049 non-null float64
hour_of_day_sin        878049 non-null float64
hour_of_day_cos        878049 non-null float64
day_of_week_sin        878049 non-null float64
day_of_week_cos        878049 non-null float64
day_of_month_sin       878049 non-null float64
day_of_month_cos       878049 non-null float64
day_of_year_sin        878049 non-null float64
day_of_year_cos        878049 non-null float64
week_of_year_sin       878049 non-null float64
week_of_year_cos       878049 non-null float64
month_of_year_sin      878049 non-null float64
month_of_year_cos      878049 

In [122]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the forest, and therefore the feature importances read from it,
# is reproducible across runs on the same prepared data.
RANDOM_STATE = 42

# The estimator is fed a plain ndarray; train_data.columns is still used by the
# feature-importance cell to label the results.
train_prepared = train_data.to_numpy()

clf = RandomForestClassifier(
    n_estimators = 500,
    max_leaf_nodes = 16,
    n_jobs = -1,  # use all available cores
    random_state = RANDOM_STATE,
)
clf_pipe = Pipeline([
    ("clf", clf)
])
# Pipeline.fit returns the pipeline itself, so `res` is the fitted clf_pipe.
res = clf_pipe.fit(train_prepared, train_labels)



In [127]:
# Pair each prepared column with the importance the fitted forest assigned to
# it, then show the features ranked from most to least important.
importance_pairs = list(zip(train_data.columns, clf.feature_importances_))
res = pd.DataFrame(importance_pairs, columns = ["feature", "importance"])
res.sort_values("importance", ascending = False)

Unnamed: 0,feature,importance
1,Y,0.222234
50,TENDERLOIN,0.184128
0,X,0.135781
48,SOUTHERN,0.052486
42,CENTRAL,0.046582
2,hour_delta,0.046424
41,BAYVIEW,0.045356
3,day_delta,0.0404
7,hour_of_day_sin,0.036486
5,month_delta,0.032025


In [132]:
# Shuffle / split the test set; `is_test = True` is passed and the second
# return value is discarded into `_`.
test_data, _ = bf.shuffle_split_data(test_pd, is_test = True)

# print(test_data.shape)
# print(test_data.info())

# Same prep_data call as used for the training data.
# NOTE(review): if bf.prep_data re-fits its imputer / scaler / encoder on every
# call (the commented-out draft in this notebook uses fit_transform), the test
# features would be scaled and encoded independently of the training set -- confirm.
test_data = bf.prep_data(test_data, feat_nums, feat_cat, feat_binary, feat_text)

# print(test_data.shape)
# print(test_data.info())

