-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
142 lines (109 loc) · 4.41 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# -*- coding: utf-8 -*-
import os

from sklearn.base import TransformerMixin
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, StandardScaler
from sklearn_pandas import DataFrameMapper

import noshow_lib.util as utils
class WeekdayTransform(TransformerMixin):
    """Stateless transformer extracting the day of week (0=Monday .. 6=Sunday)
    from the 'AppointmentDay' datetime column."""

    def fit(self, X, y=None):
        # Nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        # pandas .dt.weekday yields integers 0..6; hand back a plain ndarray.
        weekdays = X['AppointmentDay'].dt.weekday
        return weekdays.values
class DaysAheadTransform(TransformerMixin):
    """Stateless transformer computing how many whole days in advance an
    appointment was booked (AppointmentDay minus ScheduledDay)."""

    def fit(self, X, y=None):
        # No state to fit.
        return self

    def transform(self, X, y=None):
        lead_time = X['AppointmentDay'] - X['ScheduledDay']
        # .dt.days truncates to whole days; cast to float for downstream scaling.
        return lead_time.dt.days.values.astype('float64')
def get_features_pipeline():
    """Build the FeatureUnion that turns the raw appointments DataFrame
    into the numeric feature matrix.

    Produced features:
      * appointment weekday, one-hot encoded (7 columns)
      * standardized days between scheduling and appointment (1 column)
      * five binary indicator columns passed through unchanged
      * standardized age, binarized gender, one-hot handicap,
        binarized neighbourhood
    """
    # Weekday: extract the day of week, then one-hot over the 7 values.
    weekday_pipeline = Pipeline([
        ('weekday_adder', DataFrameMapper(
            [(['AppointmentDay'], WeekdayTransform())],
            input_df=True)),
        ('weekday_encoder', OneHotEncoder(n_values=7)),
    ])

    # Days ahead: compute the scheduling lead time, then standardize it.
    days_ahead_pipeline = Pipeline([
        ('days_ahead_mapper', DataFrameMapper(
            [(['AppointmentDay', 'ScheduledDay'], DaysAheadTransform())],
            input_df=True)),
        ('days_ahead_scaler', StandardScaler()),
    ])

    # Binary flag columns that need no transformation at all.
    pass_through_attributes = [
        'Scholarship',
        'Hypertension',
        'Diabetes',
        'Alcoholism',
        'SMS_received',
    ]
    pass_through_mapper = DataFrameMapper(
        [(attribute, None) for attribute in pass_through_attributes]
    )

    # Remaining columns, each with its own per-column transformer.
    mapper = DataFrameMapper([
        (['Age'], StandardScaler()),
        ('Gender', LabelBinarizer()),
        (['Handicap'], OneHotEncoder(n_values=5)),
        ('Neighbourhood', LabelBinarizer()),
    ])

    return FeatureUnion(transformer_list=[
        ('weekday_pipeline', weekday_pipeline),        # add weekday feature
        ('days_ahead_pipeline', days_ahead_pipeline),  # add days ahead feature
        ('pass_through_mapper', pass_through_mapper),  # pass 5 features through
        ('mapper', mapper),  # age (1), gender (1), handicap (4), neighbourhood (?)
    ])
def get_labels_pipeline():
    """Return a mapper that binarizes the 'No-show' column to +1/-1."""
    label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
    return DataFrameMapper([('No-show', label_binarizer)])
def fit_save_pipelines(config=utils.file_config):
    """Fit the feature and label pipelines on the training set and
    persist both to disk.

    Parameters
    ----------
    config : dict
        Must provide 'processed_data_path', 'train_csv', 'objstore_path',
        'feature_pipeline_file' and 'labels_pipeline_file'.
    """
    train_df = utils.read_csv(config['processed_data_path'],
                              config['train_csv'])

    # Fit and persist the feature pipeline.
    # os.path.join instead of manual '/' concatenation keeps paths portable.
    feature_pipeline = get_features_pipeline().fit(train_df)
    joblib.dump(feature_pipeline,
                os.path.join(config['objstore_path'],
                             config['feature_pipeline_file']))

    # Fit and persist the labels pipeline.
    label_pipeline = get_labels_pipeline().fit(train_df)
    joblib.dump(label_pipeline,
                os.path.join(config['objstore_path'],
                             config['labels_pipeline_file']))
def load_pipelines(config=utils.file_config):
    """Load the previously fitted feature and label pipelines from disk.

    Returns
    -------
    tuple
        (feature_pipeline, label_pipeline) as saved by fit_save_pipelines().
    """
    # os.path.join instead of manual '/' concatenation keeps paths portable.
    feature_pipeline = joblib.load(
        os.path.join(config['objstore_path'],
                     config['feature_pipeline_file']))
    label_pipeline = joblib.load(
        os.path.join(config['objstore_path'],
                     config['labels_pipeline_file']))
    return feature_pipeline, label_pipeline
def preprocess_data(data_frame, config=utils.file_config):
    """Transform a raw appointments DataFrame into dense (X, y) arrays
    using the persisted feature and label pipelines."""
    feature_pipeline, labels_pipeline = load_pipelines(config=config)
    # Feature matrix comes back sparse; densify it for downstream models.
    X = feature_pipeline.transform(data_frame).toarray()
    # Flatten the label column to a 1-D vector matching X's row count.
    y = labels_pipeline.transform(data_frame).reshape((X.shape[0]))
    return X, y
def load_train_data(config=utils.file_config):
    """Read the processed training CSV and return (X, y) feature/label arrays."""
    df = utils.read_csv(config['processed_data_path'],
                        config['train_csv'])
    return preprocess_data(df, config=config)
def load_test_data(config=utils.file_config):
    """Read the processed test CSV and return (X, y) feature/label arrays."""
    df = utils.read_csv(config['processed_data_path'],
                        config['test_csv'])
    return preprocess_data(df, config=config)