In [2]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import numpy as np
import lightgbm as lgbm
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from hyperopt import fmin, rand, tpe, space_eval, STATUS_OK, Trials, hp
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import FunctionTransformer

## Preprocessing

In [3]:
df_train = pd.read_csv('./data_sets/train.csv')
df_test = pd.read_csv('./data_sets/test.csv')
df_train.drop_duplicates(inplace=True)
df_train.drop(columns=['Descript', 'Resolution', 'Address','DayOfWeek'], inplace=True)
df_test.drop(columns=['Address','DayOfWeek'], inplace=True)
id_test = df_test.pop('Id')

In [4]:
df_train.replace({'X': -120.5, 'Y': 90.0}, np.NaN, inplace=True)
df_test.replace({'X': -120.5, 'Y': 90.0}, np.NaN, inplace=True)

imp = SimpleImputer(strategy='mean')

for district in df_train['PdDistrict'].unique():
    df_train.loc[df_train['PdDistrict'] == district, ['X', 'Y']] = imp.fit_transform(
        df_train.loc[df_train['PdDistrict'] == district, ['X', 'Y']])
    df_test.loc[df_test['PdDistrict'] == district, ['X', 'Y']] = imp.transform(
        df_test.loc[df_test['PdDistrict'] == district, ['X', 'Y']])

In [5]:
y_train = df_train['Category']
df_train = df_train.drop('Category', axis=1)

In [6]:
base_df  = df_train

In [7]:

class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        X[self.column] = pd.to_datetime(X[self.column])
        X['day'] = X[self.column].dt.day - 1
        X['month'] = X[self.column].dt.month - 1
        X['year'] = X[self.column].dt.year
        X['hour'] = X[self.column].dt.hour
        X['quarter'] = X[self.column].dt.quarter - 1
        X['dayofyear'] = X[self.column].dt.dayofyear - 1
        X['dayofweek'] = X[self.column].dt.dayofweek
        X['is_weekend'] = (
            X[self.column].dt.dayofweek >= 5).astype(int)
        X['minute'] = X[self.column].dt.minute
        X=X.drop(columns=self.column)
        return X


class CosTransformation(BaseEstimator, TransformerMixin):
    def __init__(self, column, minus=False):
        self.period = None
        self.minus = minus
        self.column = column

    def fit(self, X, y=None):
        self.minus = -1 if self.minus else 1
        self.period = X[self.column].nunique()
        return self

    def transform(self, X, y=None):
        X = X.copy()
        X[self.column + 'cos'] = self.minus * \
            np.cos(X[self.column] * 2 * np.pi / self.period)
        return X


class SinTransformation(BaseEstimator, TransformerMixin):
    def __init__(self, column, minus=False):
        self.period = None
        self.minus = minus
        self.column = column

    def fit(self, X, y=None):
        self.minus = -1 if self.minus else 1
        self.period = X[self.column].nunique()
        return self

    def transform(self, X, y=None):
        X = X.copy()
        X[self.column + 'sin'] = self.minus * \
            np.sin(X[self.column] * 2 * np.pi / self.period)
        return X


class BucketingCoordinatesTransformation(BaseEstimator, TransformerMixin):
    def __init__(self, columns=['X', 'Y'], bins=10):
        self.columns = columns
        self.bins = bins

    def fit(self, X, y=None):
        self.x_mean = X[self.columns[0]].mean()
        self.y_mean = X[self.columns[1]].mean()
        self.x_std = X[self.columns[0]].std()
        self.y_std = X[self.columns[1]].std()

        self.x_min = ((X[self.columns[0]] - self.x_mean) / self.x_std).min()
        self.x_max = ((X[self.columns[0]] - self.x_mean) / self.x_std).max()
        self.y_min = ((X[self.columns[1]] - self.y_mean) / self.y_std).min()
        self.y_max = ((X[self.columns[1]] - self.y_mean) / self.y_std).max()

        self.bin_edges_x = np.linspace(self.x_min, self.x_max, self.bins)
        self.bin_edges_y = np.linspace(self.y_min, self.y_max, self.bins)

        return self

    def transform(self, X, y=None):
        X = X.copy()
        X['sector_x'] = pd.cut((X[self.columns[0]] - self.x_mean) / self.x_std,
                               bins=self.bin_edges_x, labels=False, include_lowest=True)
        X['sector_y'] = pd.cut((X[self.columns[1]] - self.y_mean) / self.y_std,
                               bins=self.bin_edges_y, labels=False, include_lowest=True)
        X = X.drop(columns=self.columns)
        return X


def drop_columns(X):
    X = X.drop(['day', 'month', 'year', 'hour', 'quarter',
               'dayofweek', 'minute', 'dayofyear'], axis=1)
    return X


drop_columns_transformer = FunctionTransformer(drop_columns)

In [8]:
cos_transformer = Pipeline([
    ('cos_transformation_hour', CosTransformation(column='hour')),
    ('cos_transformation_month', CosTransformation(column='month')),
    ('cos_transformation_quarter', CosTransformation(column='quarter')),
    ('cos_transformation_dayofyear', CosTransformation(column='dayofyear')),
    ('cos_transformation_day', CosTransformation(column='day')),
    ('cos_transformation_dayofweek', CosTransformation(column='dayofweek'))
])
sin_transformer = Pipeline([
    ('sin_transformation_hour', SinTransformation(column='hour')),
    ('sin_transformation_month', SinTransformation(column='month')),
    ('sin_transformation_quarter', SinTransformation(column='quarter')),
    ('sin_transformation_dayofyear', SinTransformation(column='dayofyear')),
    ('sin_transformation_day', SinTransformation(column='day')),
    ('sin_transformation_dayofweek', SinTransformation(column='dayofweek'))

])

base_transformer = Pipeline([
    ('date_features', DateFeatureExtractor(column='Dates')),
])
sincos_transformer = Pipeline([
    ('date_features', DateFeatureExtractor(column='Dates')),
    ('cos', cos_transformer),
    ('sin', sin_transformer),
    ('drop', drop_columns_transformer)

])

bucketing_transformer = Pipeline([
    ('date_features', DateFeatureExtractor(column='Dates')),
    ('cos', cos_transformer),
    ('sin', sin_transformer),
    ('drop', drop_columns_transformer),
    ('bucketing', BucketingCoordinatesTransformation(
        columns=['X', 'Y'], bins=10))
])

categorical_transformer_label = Pipeline(steps=[
    ('label', OrdinalEncoder())
])

In [9]:
label_encoder=LabelEncoder()
y=label_encoder.fit_transform(y_train)

In [10]:
df_train_base = base_transformer.fit_transform(base_df)
df_test_base = base_transformer.transform(df_test)

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_label', categorical_transformer_label, ['PdDistrict']),
    ], remainder='passthrough')

In [12]:
X_train = preprocessor.fit_transform(df_train_base)
X_test = preprocessor.transform(df_test_base)

In [13]:
X_train = base_df

## Trenowanie

| Algorithm                     | Parameters                  | Logloss            |
|-------------------------------|-----------------------------|--------------------|
| Stochastic Gradient Descent   | Default Parameters          | 20.10143           |
| K-Nearest Neighbors           | Default Parameters          | 26.25314           |
| HistGradientBoostingClassifier| Default Parameters          | 6.59939            |
| XGBoost                       | Default Parameters          | 2.29494            |
| LightGBM                      | Default Parameters          | 2.59236            |
| Random Forest                 | Default Parameters          | 5.03352            |

Ostatecznie, spośród algorytmów, które osiągnęły wynik poniżej 3.0, zdecydowaliśmy się pracować z XGBoost i LightGBM ze względu na ich efektywność i wszechstronność w dostrajaniu hiperparametrów.

### Default Parameters

#### Stochastic Gradient Descent

In [None]:

sgd = SGDClassifier(loss='log_loss')

y_prob = cross_val_predict(sgd, X_train, y, method='predict_proba', n_jobs=-1)


logloss = log_loss(y, y_prob)

print(f'LogLoss: {logloss}')




LogLoss: 20.10143316473642


LogLoss: 20.10143316473642

#### KNearestNeighbours

In [None]:
knn = KNeighborsClassifier()

y_prob = cross_val_predict(knn, X_train, y, method='predict_proba',n_jobs=-1)

logloss = log_loss(y, y_prob)
print(f'LogLoss: {logloss}')



LogLoss: 26.253138564296812


LogLoss: 26.253138564296812

#### HistGradientBoostingClassifier

In [None]:
hgb = HistGradientBoostingClassifier()

y_prob = cross_val_predict(hgb, X_train, y, method='predict_proba', n_jobs=-1)

logloss = log_loss(y, y_prob)

print(f'LogLoss: {logloss}')



LogLoss: 6.599390039999148


LogLoss: 6.599390039999148

#### XGradientBoost

In [None]:
dtrain = xgb.DMatrix(X_train, label=y)

In [None]:
xgb_params = {
    'device':'cuda',
    'tree_method': 'hist',
    'num_class': 39,
    'random_state': 42,
    'eval_metric': 'mlogloss',
    'objective': 'multi:softprob'
}

dtrain = xgb.DMatrix(X_train, label=y)

cv_results = xgb.cv(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    nfold=5,
    metrics='mlogloss',
    early_stopping_rounds=10
)

print('Best score: ', cv_results['test-mlogloss-mean'].min())

Best score:  2.294936056193336


In [None]:
num_boost_round = np.argmin(cv_results['test-mlogloss-mean'].min())
print('Best epoch: ', num_boost_round)

In [None]:
bst = xgb.train(xgb_params, dtrain, num_boost_round=100)

In [None]:
dtest = xgb.DMatrix(X_test)
predictions = bst.predict(dtest)

In [None]:
submission = pd.DataFrame(predictions,
                          columns=label_encoder.classes_,
                          index=id_test)
submission.to_csv('models/XGBM - base model.csv', index_label='Id')

Best score:  2.294936056193336 XGB VALIDATION CV TEST RESULTS: 2.30348 ON KAGGLE

#### LightGradientBoost

In [None]:
train=lgbm.Dataset(X_train, label=y,categorical_feature=[0],free_raw_data=False) 

In [None]:
params_lgbm = {
    'objective': 'multiclass',
    'num_class': 39
}
cv_results = lgbm.cv(params_lgbm, train,
                    metrics='multi_logloss')

In [None]:
print('Best score: ', min(cv_results['valid multi_logloss-mean']))

Best score:  2.592361487579474


In [None]:
bst = lgbm.train(params_lgbm, train, num_boost_round=100)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004795 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 918
[LightGBM] [Info] Number of data points in the train set: 875726, number of used features: 12
[LightGBM] [Info] Start training from score -6.361620
[LightGBM] [Info] Start training from score -2.433653
[LightGBM] [Info] Start training from score -7.676455
[LightGBM] [Info] Start training from score -8.016382
[LightGBM] [Info] Start training from score -3.175005
[LightGBM] [Info] Start training from score -5.313420
[LightGBM] [Info] Start training from score -5.956155
[LightGBM] [Info] Start training from score -2.787570
[LightGBM] [Info] Start training from score -5.321801
[LightGBM] [Info] Start training from score -6.623191
[LightGBM] [Info] Start training from score -8.137631
[LightGBM] [Info] Start training from score -7.492493

In [None]:
predictions = bst.predict(X_test)

In [None]:
submission = pd.DataFrame(predictions,
                          columns=label_encoder.classes_,
                          index=id_test)
submission.to_csv('models/LGBM - base model.csv', index_label='Id')

#### RandomForest

In [None]:
clf = RandomForestClassifier()
y_prob = cross_val_predict(clf, X_train, y, method='predict_proba', n_jobs=-1)

logloss = log_loss(y, y_prob)
print(f'LogLoss: {logloss}')

Best score:  5.033523749445065 RANDOM FOREST VALIDATION CV
TEST RESULTS:  3.81670 ON KAGGLE

## Tunowanie Hiperparametrów

Próbowaliśmy dostrajać hiperparametry przy użyciu Random Grid Search, ale postanowiliśmy skorzystać z optymalizacji bayesowskiej. 

### Optymalizacja Bayesowska

Optymalizacja bayesowska to metoda optymalizacji hiperparametrów, która buduje probabilistyczny model funkcji celu i wykorzystuje go do wyboru najbardziej obiecujących hiperparametrów do przetestowania. W przeciwieństwie do Random Grid Search, który losowo próbuje różne kombinacje hiperparametrów, optymalizacja bayesowska inteligentnie nawigując przestrzenią hiperparametrów, aby szybciej znaleźć optymalne ustawienia.

Dzięki zastosowaniu optymalizacji bayesowskiej możemy bardziej efektywnie dostrajać nasze modele XGBoost i LightGBM, co prowadzi do lepszych wyników predykcyjnych przy mniejszej liczbie prób.

### Pipeline'y do Przetwarzania Danych

W celu skutecznego przetwarzania danych i porównania różnych podejść, zastosowaliśmy trzy różne pipeline'y:

1. **Pipeline z rozdzielonymi datami**:
   - W tym pipeline'ie rozdzieliliśmy daty na oddzielne kolumny reprezentujące dni, miesiące i godziny. Pozwoliło to modelowi na lepsze zrozumienie czasowych wzorców w danych.

2. **Pipeline z funkcjami trygonometrycznymi**:
   - W tym podejściu zastosowaliśmy funkcje trygonometryczne (sinus i cosinus) do przekształcenia dni, miesięcy i godzin. Dzięki temu mogliśmy uchwycić cykliczność w danych, co mogło poprawić wydajność modeli.

3. **Pipeline z funkcjami trygonometrycznymi i bucketowaniem**:
   - Ten pipeline łączył funkcje trygonometryczne z techniką grupowania (bucketing) współrzędnych geograficznych. Podzieliliśmy szerokość i długość geograficzną na mniejsze segmenty (wiaderka), aby uwzględnić podobieństwa między miejscami znajdującymi się blisko siebie.

Każdy z tych pipeline'ów został zaprojektowany w celu poprawy wyników predykcyjnych naszych modeli poprzez różne techniki przetwarzania danych.


## XGBoost

| Model           | Metoda                             | Wynik CV                | Wynik Kaggle          |
|-----------------|--------------------------------------|-------------------------|-----------------------|
| XGBoost         | Bez hiperparametrów                  | 2.29494                 | 2.30348               |
| XGBoost         | Pipeline 1                           | 2.28839                 | 2.30605               |
| XGBoost         | Pipeline 2                           | 2.27688                 | 2.29785               |
| XGBoost         | Pipeline 3                           | 2.50038                 | 2.52143               |


In [None]:
def objective(params):

    # Convert hyperparameters to correct types
    params = {
        'learning_rate': float(params['learning_rate']),
        'max_depth': int(params['max_depth']),
        'subsample': float(params['subsample']),
        'colsample_bytree': float(params['colsample_bytree']),
        'reg_lambda': float(params['reg_lambda']),
        'gamma': float(params['gamma']),
        'min_child_weight': int(params['min_child_weight']),

        # Fixed parameters
        'objective': 'multi:softprob',
        'num_class': 39,
        'verbosity': 0,
        'eval_metric': 'mlogloss',
        'device':'cuda',
        'tree_method': 'hist'
    }

    # Train using cross-validation
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=100,
        nfold=5,
        stratified=True,
        metrics='mlogloss',
        early_stopping_rounds=10,
        seed=42
    )
    
    loss = min(cv_results['test-mlogloss-mean'])
    epochs = np.argmin(cv_results['test-mlogloss-mean']) + 1

    print('loss:', loss)

    return {
        'loss': loss,
        'params': params,
        'status': STATUS_OK,
        'epochs': epochs,
    }

In [None]:
space = {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
    'max_depth': hp.quniform('max_depth', 1, 20, 1),
    'subsample': hp.uniform('subsample', 0.0, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    'gamma': hp.uniform('gamma', 0.0, 5.0),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1)
}

### Model Tuned

In [None]:
X_train = base_df
dtrain = xgb.DMatrix(X_train, label=y)

In [None]:
tpe_algo = tpe.suggest
tpe_trials = Trials()

best = fmin(
    fn=objective,
    space=space,
    algo=tpe_algo,
    max_evals=10,
    trials=tpe_trials
)

loss:                                                 
2.334618900317089                                     
loss:                                                                           
2.75388501919925                                                                
loss:                                                                           
2.5812220854607957                                                              
loss:                                                                           
2.2883935014434504                                                              
loss:                                                                            
2.531286980250815                                                                
loss:                                                                            
2.9472151314701294                                                               
loss:                                                                       

In [None]:
best

{'colsample_bytree': 0.6327741097399604,
 'gamma': 2.1259392559055446,
 'learning_rate': 0.09797353834548439,
 'max_depth': 20.0,
 'min_child_weight': 2.0,
 'reg_lambda': 0.7764276919090378,
 'subsample': 0.3738444042140795}

In [None]:
params = {'colsample_bytree': 0.6327741097399604,
 'gamma': 2.1259392559055446,
 'learning_rate': 0.09797353834548439,
 'max_depth': 20,
 'min_child_weight': 2.0,
 'reg_lambda': 0.7764276919090378,
 'subsample': 0.3738444042140795,
         # Fixed parameters
        'objective': 'multi:softprob',
        'num_class': 39,
        'verbosity': 0,
        'eval_metric': 'mlogloss',
        'device':'cuda',
        'tree_method': 'hist'
        }

In [None]:
bst = xgb.train(params, dtrain, num_boost_round=100)

In [None]:
predictions = bst.predict(dtest)
submission = pd.DataFrame(predictions,
                          columns=label_encoder.classes_,
                          index=id_test)
submission.to_csv('models/XGBM - model tuned.csv', index_label='Id')

val accuracy 2.2883935014434504  kaggle 2.30605

### SinCos

In [None]:
X_train = base_df

In [35]:
df_train_xgb3 = sincos_transformer.fit_transform(df_train)
df_test_xgb3 = sincos_transformer.transform(df_test)

In [36]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_train)

In [37]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_label', categorical_transformer_label, ['PdDistrict']),
    ], remainder='passthrough')

In [38]:
X_train = preprocessor.fit_transform(df_train_xgb3)
X_test = preprocessor.transform(df_test_xgb3)

In [45]:
df_train_xgb3

Unnamed: 0,PdDistrict,X,Y,day,month,year,hour,quarter,dayofyear,dayofweek,...,quartercos,dayofyearcos,daycos,dayofweekcos,hoursin,monthsin,quartersin,dayofyearsin,daysin,dayofweeksin
0,NORTHERN,-122.425892,37.774599,12,4,2015,23,1,132,2,...,6.123234e-17,-0.772642,-0.758758,-0.222521,-0.258819,0.866025,1.0,0.634842,0.651372,0.974928
1,NORTHERN,-122.425892,37.774599,12,4,2015,23,1,132,2,...,6.123234e-17,-0.772642,-0.758758,-0.222521,-0.258819,0.866025,1.0,0.634842,0.651372,0.974928
2,NORTHERN,-122.424363,37.800414,12,4,2015,23,1,132,2,...,6.123234e-17,-0.772642,-0.758758,-0.222521,-0.258819,0.866025,1.0,0.634842,0.651372,0.974928
3,NORTHERN,-122.426995,37.800873,12,4,2015,23,1,132,2,...,6.123234e-17,-0.772642,-0.758758,-0.222521,-0.258819,0.866025,1.0,0.634842,0.651372,0.974928
4,PARK,-122.438738,37.771541,12,4,2015,23,1,132,2,...,6.123234e-17,-0.772642,-0.758758,-0.222521,-0.258819,0.866025,1.0,0.634842,0.651372,0.974928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878044,TARAVAL,-122.459033,37.714056,5,0,2003,0,0,5,0,...,1.000000e+00,0.995684,0.528964,1.000000,0.000000,0.000000,0.0,0.092813,0.848644,0.000000
878045,INGLESIDE,-122.447364,37.731948,5,0,2003,0,0,5,0,...,1.000000e+00,0.995684,0.528964,1.000000,0.000000,0.000000,0.0,0.092813,0.848644,0.000000
878046,SOUTHERN,-122.403390,37.780266,5,0,2003,0,0,5,0,...,1.000000e+00,0.995684,0.528964,1.000000,0.000000,0.000000,0.0,0.092813,0.848644,0.000000
878047,SOUTHERN,-122.390531,37.780607,5,0,2003,0,0,5,0,...,1.000000e+00,0.995684,0.528964,1.000000,0.000000,0.000000,0.0,0.092813,0.848644,0.000000


In [41]:
dtrain = xgb.DMatrix(X_train, label=y)
dtest = xgb.DMatrix(X_test)

In [46]:
tpe_algo = tpe.suggest
tpe_trials = Trials()

best = fmin(
    fn=objective,
    space=space,
    algo=tpe_algo,
    max_evals=10,
    trials=tpe_trials
)

loss:                                                 
2.854777294781713                                     
loss:                                                                           
2.421471975816339                                                               
loss:                                                                           
2.3408720883361314                                                              
loss:                                                                            
2.276883080686809                                                                
loss:                                                                            
2.3542853164856172                                                              
loss:                                                                           
2.3324126709732504                                                              
loss:                                                                        

In [47]:
best

{'colsample_bytree': 0.8399515080498189,
 'gamma': 1.8809687874683567,
 'learning_rate': 0.15019457619783694,
 'max_depth': 12.0,
 'min_child_weight': 9.0,
 'reg_lambda': 0.3917588749286913,
 'subsample': 0.8665760243726364}

{'colsample_bytree': 0.8399515080498189,
 'gamma': 1.8809687874683567,
 'learning_rate': 0.15019457619783694,
 'max_depth': 12.0,
 'min_child_weight': 9.0,
 'reg_lambda': 0.3917588749286913,
 'subsample': 0.8665760243726364}

In [55]:
params = {'colsample_bytree': 0.8399515080498189,
 'gamma': 1.8809687874683567,
 'learning_rate': 0.15019457619783694,
 'max_depth': 12,
 'min_child_weight': 9.0,
 'reg_lambda': 0.3917588749286913,
 'subsample': 0.8665760243726364,
  'objective': 'multi:softprob',
  'num_class': 39,
  'verbosity': 0,
  'eval_metric': 'mlogloss',
  'device':'cuda',
  'tree_method': 'hist'
     }

In [56]:
bst = xgb.train(params, dtrain, num_boost_round=100)

In [57]:
predictions = bst.predict(dtest)
submission = pd.DataFrame(predictions,
                          columns=label_encoder.classes_,
                          index=id_test)
submission.to_csv('models/XGBM - sincos model tuned.csv', index_label='Id')

val loss: 2.276883080686809  keggle: 2.27688   

### Bucketing

In [60]:
X_train = base_df

In [None]:
df_train_xgb2 = bucketing_transformer.fit_transform(df_train)
df_test_xgb2 = bucketing_transformer.transform(df_test)

In [None]:
df_test_xgb2.sector_x.fillna(8, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test_xgb2.sector_x.fillna(8, inplace=True)


In [None]:
df_test_xgb2.sector_y.fillna(8, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test_xgb2.sector_y.fillna(8, inplace=True)


In [None]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_train)

In [None]:
df_train_xgb2 = df_train_xgb2.drop(['day', 'month', 'hour','quarter','dayofyear','dayofweek','minute','X','Y'],axis=1)

In [None]:
df_test_xgb2 = df_test_xgb2.drop(['day', 'month', 'hour', 'quarter', 'dayofyear','dayofweek','minute','X','Y'],axis=1)

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_label', categorical_transformer_label, ['PdDistrict']),
    ], remainder='passthrough')

In [None]:
X_train = preprocessor.fit_transform(df_train_xgb2)
X_test = preprocessor.transform(df_test_xgb2)

In [None]:
dtrain = xgb.DMatrix(X_train, label=y)
dtest = xgb.DMatrix(X_test)

In [None]:
tpe_algo = tpe.suggest
tpe_trials = Trials()

best = fmin(
    fn=objective,
    space=space,
    algo=tpe_algo,
    max_evals=10,
    trials=tpe_trials
)

loss:                                                 
2.50343654568066                                      
loss:                                                                          
2.5571448296583887                                                             
loss:                                                                          
2.5003795260570145                                                             
loss:                                                                            
2.8357352817765586                                                               
loss:                                                                            
2.538878462258584                                                                
loss:                                                                            
2.8956564616631333                                                               
loss:                                                                         

In [None]:
best

{'colsample_bytree': 0.7771885113792781,
 'gamma': 4.26023572510469,
 'learning_rate': 0.08290773728323132,
 'max_depth': 11.0,
 'min_child_weight': 5.0,
 'reg_lambda': 0.38119084682817406,
 'subsample': 0.7629235886854302}

{'colsample_bytree': 0.7771885113792781,
 'gamma': 4.26023572510469,
 'learning_rate': 0.08290773728323132,
 'max_depth': 11.0,
 'min_child_weight': 5.0,
 'reg_lambda': 0.38119084682817406,
 'subsample': 0.7629235886854302}

In [None]:
params = {'colsample_bytree': 0.6327741097399604,
 'gamma': 2.1259392559055446,
 'learning_rate': 0.09797353834548439,
 'max_depth': 20,
 'min_child_weight': 2.0,
 'reg_lambda': 0.7764276919090378,
 'subsample': 0.3738444042140795,
         # Fixed parameters
        'objective': 'multi:softprob',
        'num_class': 39,
        'verbosity': 0,
        'eval_metric': 'mlogloss',
        'device':'cuda',
        'tree_method': 'hist'
        }

In [None]:
bst = xgb.train(params, dtrain, num_boost_round=100)

In [None]:
predictions = bst.predict(dtest)
submission = pd.DataFrame(predictions,
                          columns=label_encoder.classes_,
                          index=id_test)
submission.to_csv('models/XGBM - sincos buckieting model tuned.csv', index_label='Id')

val loss: 2.5003795260570145    keggle: 2.52143

## Light GBM

| Model           | Metoda                             | Wynik CV                | Wynik Kaggle          |
|-----------------|--------------------------------------|-------------------------|-----------------------|
| LGBM         | Bez hiperparametrów                  | 2.59236                 | 2.68984               |
| LGBM         | Pipeline 1                           | 2.31692                 | 2.32228               |
| LGBM         | Pipeline 2                           | 2.32257                 | 2.37876               |
| LGBM         | Pipeline 3                           | 2.39088                 | 2.40186               |


In [1]:
def objective(params):

    params = {

        # Search Parameters
        'learning_rate': float(params['learning_rate']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'bagging_fraction': float(params['bagging_fraction']),
        'feature_fraction': float(params['feature_fraction']),
        'reg_lambda': float(params['reg_lambda']),


        # Fixed Parameters
        'force_col_wise': 'true',
        'verbose': -1,
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': 39

    }

    # LightBGM classifier
    cv_results = lgbm.cv(params, train, metrics='multi_logloss',
                         num_boost_round=100, nfold=5, stratified=True, shuffle=True)

    loss = min(cv_results['valid multi_logloss-mean'])
    epochs = np.argmin(cv_results['valid multi_logloss-mean']) + 1

    print('loss: ', loss)

    return {
        'loss': loss,
        'params': params,
        'status': STATUS_OK,
        'epochs': epochs,
    }

In [None]:
space = {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
    'max_depth': hp.quniform('max_depth', 1, 20, 1),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.0, 1.0),
    'feature_fraction': hp.uniform('feature_fraction', 0.0, 1.0),
    'num_leaves': hp.quniform('gbdt_num_leaves', 5, 50, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0)

}

### Model Tuned

In [None]:
X_train = base_df
train = lgbm.Dataset(X_train, label=y, categorical_feature=[0],free_raw_data=False) 

In [None]:
tpe_algo = tpe.suggest

tpe_trials = Trials()


best = fmin(
    fn=objective,
    space=space,
    algo=tpe_algo,
    max_evals=10,
    trials=tpe_trials
)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]
loss:                                                 
2.6677862752093686                                    
loss:                                                                              
2.575902346837997                                                                  
loss:                                                                              
2.418834835955627                                                                 
loss:                                                                             
2.316923463173695                                                               
loss:                                                                           
2.352342391515032                                                               
loss:                                                                           
2.366662449509916                                                               
loss:                                                                           
2.3170386802382943                                                                
loss:                                                                             
2.3636252449281847                                                                
loss:                                                                             
2.3428560390238427                                                                
loss:                                                                             
2.4397639503117317                                                                
100%|██████████| 10/10 [1:25:16<00:00, 511.60s/trial, best loss: 2.316923463173695]

In [3]:
params = {'bagging_fraction': 0.9810700845464868,
          'feature_fraction': 0.42462131607761644,
          'gbdt_num_leaves': 37,
          'learning_rate': 0.2155168538629674,
          'max_depth': int(5),
          'reg_lambda': 0.976507677160282,
          'force_col_wise': 'true',
          'verbose': -1,
          'boosting_type': 'gbdt',
          'objective': 'multiclass',
          'num_class': 39

          }

In [None]:
bst = lgbm.train(params, train, num_boost_round=100)

In [None]:
predictions = bst.predict(X_test)
submission = pd.DataFrame(predictions,
                          columns=label_encoder.classes_,
                          index=id_test)
submission.to_csv('LGBM - base model tuned.csv', index_label='Id')

val loss 2.3169234631736 kaggle 2.32228

### SinCos

In [None]:
X_train = base_df

In [None]:
df_train3 = sincos_transformer.fit_transform(X_train)
df_test3 = sincos_transformer.transform(X_test)

In [None]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_train)

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_label', categorical_transformer_label, ['PdDistrict']),
    ], remainder='passthrough')

In [None]:
X_train = preprocessor.fit_transform(df_train3)
X_test = preprocessor.transform(df_test3)

In [None]:
train = lgbm.Dataset(X_train, label=y, categorical_feature=[0], free_raw_data=False) 

In [None]:
tpe_algo = tpe.suggest
tpe_trials = Trials()

best = fmin(
    fn=objective,
    space=space,
    algo=tpe_algo,
    max_evals=10,
    trials=tpe_trials
)

In [None]:
params = {'bagging_fraction': 0.4828385421929686,
          'feature_fraction': 0.5391472756753458,
          'gbdt_num_leaves': 47.0,
          'learning_rate': 0.21951141536344673,
          'max_depth': int(6.0),
          'reg_lambda': 0.2938920884793351,
          'force_col_wise': 'true',
          'verbose': -1,
          'boosting_type': 'gbdt',
          'objective': 'multiclass',
          'num_class': 39
          }

In [None]:
bst = lgbm.train(params, train, num_boost_round=100)

In [None]:
predictions = bst.predict(X_test)
submission = pd.DataFrame(predictions,
                          columns=label_encoder.classes_,
                          index=id_test)
submission.to_csv('models/LGBM - sincos model tuned.csv', index_label='Id')

VAL LOSS 2.32 KAGGLE 2.37

### Bucketing

In [None]:
df_train2 = bucketing_transformer.fit_transform(df_train)
df_test2 = bucketing_transformer.transform(df_test)

In [None]:
df_test2.sector_x.fillna(8, inplace=True)
df_test2.sector_y.fillna(8, inplace=True)

In [None]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_train)

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_label', categorical_transformer_label, ['PdDistrict']),
    ], remainder='passthrough')

In [None]:
X_train = preprocessor.fit_transform(df_train2)
X_test = preprocessor.transform(df_test2)

In [None]:
train = lgbm.Dataset(X_train, label=y, categorical_feature=[0],free_raw_data=False) 

In [None]:
tpe_algo = tpe.suggest

tpe_trials = Trials()


best = fmin(
    fn=objective,
    space=space,
    algo=tpe_algo,
    max_evals=30,
    trials=tpe_trials
)

loss:                                                 
2.4726554765201167                                    
loss:                                                                              
2.4916876076539127                                                                 
loss:                                                                              
2.8252750786175516                                                                 
loss:                                                                              
2.412484439664079                                                                  
loss:                                                                              
2.412417123577532                                                                 
loss:                                                                             
2.4642874147822167                                                                
loss:                                                                             
2.4735694714395207                                                                
loss:                                                                             
2.484860627436979                                                                   
loss:                                                                               
2.4234691805312636                                                                  
loss:                                                                               
2.3953974528578654                                                                  
loss:                                                                                 
2.437728918624332                                                                     
loss:                                                                                 
2.4054962286828347                                                                    
loss:                                                                                 
...
2.4023219673620266                                                                  
loss:                                                                               
2.390885226293963                                                                   
100%|██████████| 30/30 [4:23:44<00:00, 527.49s/trial, best loss: 2.390885226293963] 

In [None]:
params = {'bagging_fraction': 0.529092163194387,
          'feature_fraction': 0.4410500133711229,
          'gbdt_num_leaves': 46.0,
          'learning_rate': 0.09823954487471348,
          'max_depth': int(14.0),
          'reg_lambda': 0.20317611010438422,
          'force_col_wise': 'true',
          'verbose': -1,
          'boosting_type': 'gbdt',
          'objective': 'multiclass',
          'num_class': 39

          }

In [None]:
bst = lgbm.train(params, train, num_boost_round=100)

In [None]:
predictions = bst.predict(X_test)
submission = pd.DataFrame(predictions,
                          columns=label_encoder.classes_,
                          index=id_test)
submission.to_csv('models/LGBM - sincos bucketing model tuned.csv', index_label='Id')

VAL LOSS 2.390885226293963 KAGGLE 2.40

## EVALUATION

ewaluujemy model z najlepszym wynikiem

In [14]:
params = {
    'colsample_bytree': 0.8399515080498189,
    'gamma': 1.8809687874683567,
    'learning_rate': 0.15019457619783694,
    'max_depth': 12,
    'min_child_weight': 9.0,
    'reg_lambda': 0.3917588749286913,
    'subsample': 0.8665760243726364,
    'objective': 'multi:softprob',
    'num_class': 39,
    'verbosity': 0,
    'eval_metric': 'mlogloss',
    'tree_method': 'hist',
    'n_estimators': 1000

}

In [15]:
df_train

Unnamed: 0,Dates,PdDistrict,X,Y
0,2015-05-13 23:53:00,NORTHERN,-122.425892,37.774599
1,2015-05-13 23:53:00,NORTHERN,-122.425892,37.774599
2,2015-05-13 23:33:00,NORTHERN,-122.424363,37.800414
3,2015-05-13 23:30:00,NORTHERN,-122.426995,37.800873
4,2015-05-13 23:30:00,PARK,-122.438738,37.771541
...,...,...,...,...
878044,2003-01-06 00:15:00,TARAVAL,-122.459033,37.714056
878045,2003-01-06 00:01:00,INGLESIDE,-122.447364,37.731948
878046,2003-01-06 00:01:00,SOUTHERN,-122.403390,37.780266
878047,2003-01-06 00:01:00,SOUTHERN,-122.390531,37.780607


In [16]:
df_train_sin = sincos_transformer.fit_transform(df_train)
df_test_sin = sincos_transformer.transform(df_test)

In [17]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_train)

In [18]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_label', categorical_transformer_label, ['PdDistrict']),
    ], remainder='passthrough')

In [19]:
pipeline_ = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(**params))
])

In [20]:
pipeline_.fit(df_train_sin, y)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [21]:
pipeline_.predict_proba(df_test_sin)

array([[3.5557006e-03, 1.3274600e-01, 1.1195862e-04, ..., 1.6885036e-01,
        2.6564455e-02, 1.7516308e-02],
       [3.7755324e-03, 1.2994067e-01, 9.8932673e-05, ..., 9.0842836e-02,
        6.9496982e-02, 2.1187212e-02],
       [1.0574546e-03, 3.7685674e-02, 9.3225659e-05, ..., 1.4754875e-01,
        1.3255123e-02, 3.9884122e-03],
       ...,
       [2.0255581e-03, 6.4184837e-02, 1.8616297e-03, ..., 1.4615826e-02,
        2.4931779e-02, 7.1453420e-03],
       [2.2165834e-03, 5.1004346e-02, 1.8149982e-03, ..., 1.1548169e-02,
        1.7895738e-02, 1.7141821e-02],
       [1.9769953e-03, 5.9456471e-02, 3.0137259e-03, ..., 2.7621713e-02,
        1.3696849e-02, 5.8788541e-03]], dtype=float32)

## Dalsze kroki

Ewaluujemy tylko model z najlepszym wynikiem. Obecnie mamy ograniczenie do 10 ewaluacji ze względu na ograniczenia sprzętowe. Aby przeprowadzić więcej ewaluacji i znaleźć optymalne hiperparametry, musimy zdobyć większą moc obliczeniową poprzez lepszy sprzęt lub skorzystanie z usług chmurowych.