In [24]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import numpy as np
import lightgbm as lgbm
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from hyperopt import fmin, rand, tpe, space_eval, STATUS_OK, Trials, hp
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import FunctionTransformer
from joblib import dump,load

## Preprocessing

In [25]:
# Read the raw CSV exports. Exact duplicate rows are removed from the training
# set only; the columns listed below are excluded from everything downstream.
df_train = (
    pd.read_csv('./data_sets/train.csv')
    .drop_duplicates()
    .drop(columns=['Descript', 'Resolution', 'Address', 'DayOfWeek'])
)
df_test = (
    pd.read_csv('./data_sets/test.csv')
    .drop(columns=['Address', 'DayOfWeek'])
)

# Take the 'Id' column out of the test frame and keep it under its own name.
id_test = df_test.pop('Id')

In [26]:
# df_train.replace({'X': -120.5, 'Y': 90.0}, np.NaN, inplace=True)
# df_test.replace({'X': -120.5, 'Y': 90.0}, np.NaN, inplace=True)

# imp = SimpleImputer(strategy='mean')

# for district in df_train['PdDistrict'].unique():
#     df_train.loc[df_train['PdDistrict'] == district, ['X', 'Y']] = imp.fit_transform(
#         df_train.loc[df_train['PdDistrict'] == district, ['X', 'Y']])
#     df_test.loc[df_test['PdDistrict'] == district, ['X', 'Y']] = imp.transform(
#         df_test.loc[df_test['PdDistrict'] == district, ['X', 'Y']])

In [27]:
# dump(imp, 'imputer.joblib')

In [28]:
# Restore the coordinate imputer from disk (the matching dump call is the
# commented-out line in the previous cell).
# NOTE(review): joblib files are pickles and can execute arbitrary code when
# loaded — only load artifacts produced by this project, ideally with the same
# scikit-learn version that wrote them.
imp = load('imputer.joblib')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [29]:
# Records with X == -120.5 / Y == 90.0 carry placeholder coordinates; mark them
# as missing so they can be imputed. (np.nan is used because the np.NaN alias
# no longer exists in NumPy 2.)
df_train = df_train.replace({'X': -120.5, 'Y': 90.0}, np.nan)
df_test = df_test.replace({'X': -120.5, 'Y': 90.0}, np.nan)

# Impute each missing coordinate with the mean of its police district.
# The means are learned from the training data only and then applied to both
# frames. A single fitted SimpleImputer stores just one set of means, so
# calling its transform() district by district fills every district with the
# same values; computing the means per district here avoids that.
coordinate_columns = ['X', 'Y']
district_means = df_train.groupby('PdDistrict')[coordinate_columns].mean()
overall_means = df_train[coordinate_columns].mean()

for frame in (df_train, df_test):
    for coordinate in coordinate_columns:
        # Districts that are unseen in training (or have no valid coordinates
        # there) fall back to the overall training mean.
        fill_values = (
            frame['PdDistrict']
            .map(district_means[coordinate])
            .fillna(overall_means[coordinate])
        )
        frame[coordinate] = frame[coordinate].fillna(fill_values)

In [30]:
# Split the frame into the target column and the remaining feature columns.
y_train = df_train.loc[:, 'Category']
df_train = df_train.drop(columns=['Category'])

In [31]:
# Second name for the feature frame. This binds the same DataFrame object
# (no copy is made), so in-place changes through either name affect both.
base_df  = df_train

In [32]:

class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Replace a timestamp column with integer calendar features.

    Parameters
    ----------
    column : str
        Name of the column that is parsed with ``pd.to_datetime``.

    The transformer adds ``day``, ``month``, ``year``, ``hour``, ``quarter``,
    ``dayofyear``, ``dayofweek``, ``is_weekend`` and ``minute`` and removes
    the source column. ``day``, ``month``, ``quarter`` and ``dayofyear`` are
    shifted to start at 0.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the calendar features in place of the date column."""
        frame = X.copy()
        frame[self.column] = pd.to_datetime(frame[self.column])
        when = frame[self.column].dt

        # Insertion order of this mapping is the column order of the output.
        calendar = {
            'day': when.day - 1,
            'month': when.month - 1,
            'year': when.year,
            'hour': when.hour,
            'quarter': when.quarter - 1,
            'dayofyear': when.dayofyear - 1,
            'dayofweek': when.dayofweek,
            # dayofweek counts from Monday=0, so 5 and 6 are Saturday/Sunday.
            'is_weekend': (when.dayofweek >= 5).astype(int),
            'minute': when.minute,
        }
        for feature_name, values in calendar.items():
            frame[feature_name] = values

        return frame.drop(columns=self.column)


class CosTransformation(BaseEstimator, TransformerMixin):
    """Append a cosine encoding of a cyclic integer column.

    Parameters
    ----------
    column : str
        Name of the column to encode. The result is stored in a new column
        named ``column + 'cos'``; the source column is kept.
    minus : bool, default=False
        When true, the encoded values are multiplied by -1.

    Attributes
    ----------
    period : int or None
        Number of distinct values seen in ``column`` during ``fit``; used as
        the cycle length. ``None`` until ``fit`` has been called.
        NOTE(review): this equals the true cycle length only if every value of
        the cycle occurs in the fitting data.
    sign_ : int
        ``-1`` or ``1``, derived from ``minus`` during ``fit``.
    """

    def __init__(self, column, minus=False):
        self.period = None
        self.minus = minus
        self.column = column

    def fit(self, X, y=None):
        """Learn the cycle length from ``X[column]`` and return ``self``."""
        # The sign is stored separately so that the constructor argument
        # `minus` is never overwritten: overwriting it with 1 made a second
        # fit() (or a clone of a fitted object) read it as truthy and flip
        # the sign.
        self.sign_ = -1 if self.minus else 1
        self.period = X[self.column].nunique()
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the ``column + 'cos'`` column added."""
        # Objects pickled by the earlier implementation have no `sign_`;
        # for those, `minus` already holds the +1/-1 multiplier.
        sign = getattr(self, 'sign_', self.minus)
        X = X.copy()
        X[self.column + 'cos'] = sign * \
            np.cos(X[self.column] * 2 * np.pi / self.period)
        return X


class SinTransformation(BaseEstimator, TransformerMixin):
    """Append a sine encoding of a cyclic integer column.

    Parameters
    ----------
    column : str
        Name of the column to encode. The result is stored in a new column
        named ``column + 'sin'``; the source column is kept.
    minus : bool, default=False
        When true, the encoded values are multiplied by -1.

    Attributes
    ----------
    period : int or None
        Number of distinct values seen in ``column`` during ``fit``; used as
        the cycle length. ``None`` until ``fit`` has been called.
        NOTE(review): this equals the true cycle length only if every value of
        the cycle occurs in the fitting data.
    sign_ : int
        ``-1`` or ``1``, derived from ``minus`` during ``fit``.
    """

    def __init__(self, column, minus=False):
        self.period = None
        self.minus = minus
        self.column = column

    def fit(self, X, y=None):
        """Learn the cycle length from ``X[column]`` and return ``self``."""
        # The sign is stored separately so that the constructor argument
        # `minus` is never overwritten: overwriting it with 1 made a second
        # fit() (or a clone of a fitted object) read it as truthy and flip
        # the sign.
        self.sign_ = -1 if self.minus else 1
        self.period = X[self.column].nunique()
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the ``column + 'sin'`` column added."""
        # Objects pickled by the earlier implementation have no `sign_`;
        # for those, `minus` already holds the +1/-1 multiplier.
        sign = getattr(self, 'sign_', self.minus)
        X = X.copy()
        X[self.column + 'sin'] = sign * \
            np.sin(X[self.column] * 2 * np.pi / self.period)
        return X


class BucketingCoordinatesTransformation(BaseEstimator, TransformerMixin):
    """Replace two coordinate columns with equal-width sector indices.

    Parameters
    ----------
    columns : list of str, default=['X', 'Y']
        The two coordinate columns; the first becomes ``sector_x`` and the
        second ``sector_y``. Both source columns are dropped by ``transform``.
    bins : int, default=10
        Number of equal-width sectors per axis.

    ``fit`` standardises each column with its mean and standard deviation and
    spans the sector edges between the smallest and largest standardised
    value seen during fitting.
    """

    def __init__(self, columns=['X', 'Y'], bins=10):
        self.columns = columns
        self.bins = bins

    def fit(self, X, y=None):
        """Learn the scaling statistics and sector edges; returns ``self``."""
        x_col, y_col = self.columns[0], self.columns[1]

        self.x_mean = X[x_col].mean()
        self.y_mean = X[y_col].mean()
        self.x_std = X[x_col].std()
        self.y_std = X[y_col].std()

        # Standardise once per axis instead of once per statistic.
        x_scaled = (X[x_col] - self.x_mean) / self.x_std
        y_scaled = (X[y_col] - self.y_mean) / self.y_std

        self.x_min = x_scaled.min()
        self.x_max = x_scaled.max()
        self.y_min = y_scaled.min()
        self.y_max = y_scaled.max()

        # `bins` sectors need `bins + 1` edges; passing `bins` as the number
        # of points yielded only `bins - 1` sectors.
        self.bin_edges_x = np.linspace(self.x_min, self.x_max, self.bins + 1)
        self.bin_edges_y = np.linspace(self.y_min, self.y_max, self.bins + 1)

        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``sector_x``/``sector_y`` instead of the coordinates."""
        X = X.copy()
        X['sector_x'] = self._sector_index(
            X[self.columns[0]], self.x_mean, self.x_std, self.bin_edges_x)
        X['sector_y'] = self._sector_index(
            X[self.columns[1]], self.y_mean, self.y_std, self.bin_edges_y)
        X = X.drop(columns=self.columns)
        return X

    @staticmethod
    def _sector_index(values, mean, std, edges):
        """Standardise ``values`` and map them to the index of their sector."""
        # pd.cut returns NaN for values outside the edges, which would happen
        # for any point beyond the range seen in fit(); clipping assigns such
        # points to the outermost sector instead. Missing values stay NaN.
        scaled = ((values - mean) / std).clip(lower=edges[0], upper=edges[-1])
        return pd.cut(scaled, bins=edges, labels=False, include_lowest=True)


def drop_columns(X):
    """Return ``X`` without the raw calendar columns.

    Removes ``day``, ``month``, ``year``, ``hour``, ``quarter``,
    ``dayofweek``, ``minute`` and ``dayofyear``. Every other column
    (``is_weekend`` included) is kept. The input frame is not modified;
    a missing column raises ``KeyError``.
    """
    raw_calendar_columns = [
        'day', 'month', 'year', 'hour',
        'quarter', 'dayofweek', 'minute', 'dayofyear',
    ]
    return X.drop(columns=raw_calendar_columns)


# Wrap drop_columns in a FunctionTransformer so it can be used as a pipeline step.
drop_columns_transformer = FunctionTransformer(drop_columns)

In [33]:
# Calendar columns that receive a cos/sin encoding, in step order.
CYCLIC_COLUMNS = ['hour', 'month', 'quarter', 'dayofyear', 'day', 'dayofweek']


def make_cyclic_pipeline(prefix, transformer_cls):
    """Build a new pipeline with one ``transformer_cls`` step per cyclic column.

    Steps are named ``'<prefix>_transformation_<column>'``. Every call creates
    new transformer instances, so the result shares no state with any other
    pipeline.
    """
    return Pipeline([
        (f'{prefix}_transformation_{column}', transformer_cls(column=column))
        for column in CYCLIC_COLUMNS
    ])


def make_sincos_steps():
    """Return new steps: date features -> cos -> sin -> drop raw calendar columns."""
    return [
        ('date_features', DateFeatureExtractor(column='Dates')),
        ('cos', make_cyclic_pipeline('cos', CosTransformation)),
        ('sin', make_cyclic_pipeline('sin', SinTransformation)),
        ('drop', drop_columns_transformer),
    ]


# Stand-alone encoders, kept under their original names.
cos_transformer = make_cyclic_pipeline('cos', CosTransformation)
sin_transformer = make_cyclic_pipeline('sin', SinTransformation)

base_transformer = Pipeline([
    ('date_features', DateFeatureExtractor(column='Dates')),
])

# A Pipeline fits the step objects it was given rather than copies of them.
# Each composite pipeline therefore gets its own cos/sin steps; with shared
# instances, fitting bucketing_transformer would also refit the steps inside
# sincos_transformer.
sincos_transformer = Pipeline(make_sincos_steps())

bucketing_transformer = Pipeline(make_sincos_steps() + [
    ('bucketing', BucketingCoordinatesTransformation(
        columns=['X', 'Y'], bins=10)),
])

categorical_transformer_label = Pipeline(steps=[
    ('label', OrdinalEncoder())
])

### EVALUATION

In [34]:
# Keyword arguments for xgb.XGBClassifier.
# NOTE(review): the fractional values look like the result of a
# hyper-parameter search — confirm where they came from.
params = dict(
    colsample_bytree=0.8399515080498189,
    gamma=1.8809687874683567,
    learning_rate=0.15019457619783694,
    max_depth=12,
    min_child_weight=9.0,
    reg_lambda=0.3917588749286913,
    subsample=0.8665760243726364,
    objective='multi:softprob',
    # presumably the number of distinct Category values — verify against y
    num_class=39,
    verbosity=0,
    eval_metric='mlogloss',
    tree_method='hist',
    n_estimators=1000,
)

In [35]:
# Display the feature frame as it stands after the target column was split off.
df_train

Unnamed: 0,Dates,PdDistrict,X,Y
0,2015-05-13 23:53:00,NORTHERN,-122.425892,37.774599
1,2015-05-13 23:53:00,NORTHERN,-122.425892,37.774599
2,2015-05-13 23:33:00,NORTHERN,-122.424363,37.800414
3,2015-05-13 23:30:00,NORTHERN,-122.426995,37.800873
4,2015-05-13 23:30:00,PARK,-122.438738,37.771541
...,...,...,...,...
878044,2003-01-06 00:15:00,TARAVAL,-122.459033,37.714056
878045,2003-01-06 00:01:00,INGLESIDE,-122.447364,37.731948
878046,2003-01-06 00:01:00,SOUTHERN,-122.403390,37.780266
878047,2003-01-06 00:01:00,SOUTHERN,-122.390531,37.780607


In [36]:
# df_train_sin = sincos_transformer.fit_transform(df_train)
# df_test_sin = sincos_transformer.transform(df_test)

In [37]:
# dump(sincos_transformer, 'sincos.joblib')

In [38]:
# Replace the sincos_transformer built above with the fitted one saved by an
# earlier run (the fit and dump calls are the commented-out cells above).
# NOTE(review): joblib files are pickles — load only trusted artifacts. The
# saved pipeline presumably contains the custom transformer classes from this
# notebook, so their definitions must be run before this cell — confirm.
sincos_transformer=load('sincos.joblib')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [39]:
# Apply the already-fitted date/cyclic feature pipeline to both frames
# (training frame first, then test frame).
df_train_sin, df_test_sin = (
    sincos_transformer.transform(frame) for frame in (df_train, df_test)
)

In [40]:
# Encode the category names as integer class labels; the fitted encoder is
# kept so the integers can be mapped back to names later.
label_encoder = LabelEncoder().fit(y_train)
y = label_encoder.transform(y_train)

In [41]:
# Only 'PdDistrict' is encoded; every other column is passed through unchanged.
district_encoding = ('cat_label', categorical_transformer_label, ['PdDistrict'])
preprocessor = ColumnTransformer(
    transformers=[district_encoding],
    remainder='passthrough',
)

In [42]:
# Full model: column preprocessing followed by the XGBoost classifier
# configured from `params`.
classifier = xgb.XGBClassifier(**params)
pipeline_ = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

In [43]:
# Fit the preprocessing and the classifier on the transformed training
# features and the integer-encoded labels.
pipeline_.fit(df_train_sin, y)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [44]:
# dump(pipeline_, 'pipeline.joblib')

In [45]:
# Rebind pipeline_ to the model stored on disk; this discards the pipeline
# fitted in the cells above.
# NOTE(review): joblib files are pickles — load only trusted artifacts, and
# confirm the file matches the current preprocessing and `params`.
pipeline_ = load('pipeline.joblib')

In [47]:
# Class-probability predictions for the transformed test frame.
# NOTE(review): the column order presumably follows label_encoder.classes_ —
# confirm before pairing the result with id_test.
pipeline_.predict_proba(df_test_sin)