In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import os
import gc
import random
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import seaborn as sns
import pickle
import json
import re
import time
import sys
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import warnings
warnings.filterwarnings('ignore')

In [2]:
# File-name prefixes handed to prepare_data(); each is completed there with
# a 1-based set number and '.csv' under ../src/data/processed/.
tfn1 = 'train_all_25n_'
tfn2 = 'valid_all_25n_'

# Number of CSV parts per prefix (prepare_data reads parts 1..nsets).
nsets = 3
# Stem of the per-fold pickle files written/read by fit_lgbm ('<stem><fold>.pkl').
model_file_name = 'lgb_set12_'

In [3]:
tdata = pd.read_csv('../src/data/raw/train.csv')

# Replace each descriptive column by a 0/1 "value is missing" flag
# (1 = missing). prepare_data() later counts the zeros to measure how much
# information a record carries.
tdata = tdata.assign(**{
    col: tdata[col].isna().astype(int)
    for col in ('name', 'categories', 'address', 'state', 'url', 'country')
})

# Keep only the id plus the six flags, one row per distinct combination.
tdata = (
    tdata[['id', 'name', 'categories', 'address', 'state', 'url', 'country']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [4]:
def prepare_data(df_set):
    """Load the ``nsets`` pair-feature CSVs of one split and add aggregate features.

    For every ``i`` in ``1..nsets`` the file
    ``'../src/data/processed/' + df_set + str(i) + '.csv'`` is read, its shape
    is printed, and row-wise mean / sum / std aggregates are appended. The
    columns entering each aggregate are chosen by *substring match on the
    column name at the moment the statement runs*, so statement order matters
    (see the NOTE(review) comments below).

    The module-level ``tdata`` frame (0/1 missing-value flags per ``id``) is
    joined once for ``id`` and once for ``match_id`` to derive ``info_diff``
    and ``kdist_diff_x_info_diff``; all helper columns whose name contains
    ``'_1'`` or ``'_2'`` are dropped again afterwards.

    Parameters
    ----------
    df_set : str
        File-name prefix of the split (the notebook passes ``tfn1`` / ``tfn2``).

    Returns
    -------
    list of pandas.DataFrame
        One enriched frame per input file, in file order. The caller is
        responsible for concatenating them.

    Notes
    -----
    Depends on the globals ``nsets`` and ``tdata``.
    """
    
    # Local accumulator; it only shares its name with the notebook-level train_df.
    train_df = []
    
    for i in range(1, nsets+1):
    
        t1 = pd.read_csv('../src/data/processed/'+df_set+str(i)+'.csv')
        print(t1.shape)
        # Relative gap between the two kdist columns; rows where kdist_country
        # is 0 end up as inf/NaN.
        t1['kdist_diff'] = (t1['kdist'] - t1['kdist_country'])/t1['kdist_country']
        t1['kneighbors_mean'] = t1[['kneighbors', 'kneighbors_country']].mean(axis = 1)
        
        # NOTE(review): the column lists below are re-evaluated on every line,
        # so an aggregate created earlier is picked up by a later one whenever
        # its name contains the substring. Concretely: 'sim' matches sim_mean
        # (and later sim_sum), 'jaro' matches jaro_mean / jaro_sum, 'gesh'
        # matches gesh_sum, and '_nlcs' also matches every '_nlcsk' column.
        # This looks unintended, but any model already trained on these values
        # depends on it, so change it only together with a retrain.
        t1['sim_mean'] = t1[[col for col in t1.columns if 'sim' in col]].mean(axis=1)
        t1['jaro_mean'] = t1[[col for col in t1.columns if 'jaro' in col]].mean(axis=1)
        t1['lcs_mean'] = t1[[col for col in t1.columns if '_lcs' in col]].mean(axis=1)        
        
        t1['sim_sum'] = t1[[col for col in t1.columns if 'sim' in col]].sum(axis=1)
        t1['gesh_sum'] = t1[[col for col in t1.columns if 'gesh' in col]].sum(axis=1)
        t1['leven_sum'] = t1[[col for col in t1.columns if '_leven' in col]].sum(axis=1)
        t1['jaro_sum'] = t1[[col for col in t1.columns if 'jaro' in col]].sum(axis=1)
        t1['lcs_sum'] = t1[[col for col in t1.columns if '_lcs' in col]].sum(axis=1)
        t1['nlcsk_sum'] = t1[[col for col in t1.columns if '_nlcsk' in col]].sum(axis=1)
        t1['nleven_sum'] = t1[[col for col in t1.columns if '_nleven' in col]].sum(axis=1)
        t1['nlcs_sum'] = t1[[col for col in t1.columns if '_nlcs' in col]].sum(axis=1)

        t1['sim_std'] = t1[[col for col in t1.columns if 'sim' in col]].std(axis=1)
        t1['gesh_std'] = t1[[col for col in t1.columns if 'gesh' in col]].std(axis=1)
        t1['leven_std'] = t1[[col for col in t1.columns if '_leven' in col]].std(axis=1)
        t1['jaro_std'] = t1[[col for col in t1.columns if 'jaro' in col]].std(axis=1)
        t1['lcs_std'] = t1[[col for col in t1.columns if '_lcs' in col]].std(axis=1)
        t1['nlcsk_std'] = t1[[col for col in t1.columns if '_nlcsk' in col]].std(axis=1)
        t1['nleven_std'] = t1[[col for col in t1.columns if '_nleven' in col]].std(axis=1)
        t1['nlcs_std'] = t1[[col for col in t1.columns if '_nlcs' in col]].std(axis=1)

        # Attach the missing-value flags twice: first for the row's own id,
        # then for match_id. The second merge suffixes the clashing columns
        # with _1 (own record) and _2 (matched record).
        t1 = t1.merge(tdata, on='id', how='left')
        t1 = t1.merge(tdata, left_on='match_id', right_on='id', how='left', suffixes=['_1','_2'])
        t1 = t1.drop('id_2', axis=1).rename(columns={'id_1':'id'})
        
        # tdata flags are 1 when a field is missing, so lt(1) counts the
        # fields that are present on each side of the pair.
        # NOTE(review): selection is again by substring; this assumes the CSV
        # itself has no column containing '_1' or '_2' — TODO confirm.
        t1['info_power_1'] = t1[[col for col in t1.columns if '_1' in col]].lt(1).sum(axis=1)
        t1['info_power_2'] = t1[[col for col in t1.columns if '_2' in col]].lt(1).sum(axis=1)
        t1['info_diff'] = t1['info_power_1'] - t1['info_power_2']
        
        t1['kdist_diff_x_info_diff'] = t1['kdist_diff']*t1['info_diff']
        
        # Drop the helper columns; info_power_1 / info_power_2 match the
        # substrings too, so only info_diff and the interaction term survive.
        t1 = t1.drop([col for col in t1.columns if '_1' in col], axis=1)
        t1 = t1.drop([col for col in t1.columns if '_2' in col], axis=1)
        gc.collect()
                
        train_df.append(t1)
        
        del t1
        gc.collect()
    
    return train_df

In [5]:
%%time
# Returns a list of per-file frames (one per part), not a single DataFrame.
train_df = prepare_data(tfn1)

(7692825, 96)
(5794725, 96)
(5357151, 96)
CPU times: user 7min 37s, sys: 2min 58s, total: 10min 36s
Wall time: 10min 36s


In [6]:
%%time
# Stack the per-file frames into one frame with a fresh 0..n-1 index.
# Note: train_df changes from a list to a DataFrame here, so this cell
# cannot be re-run without re-running the load cell first.
train_df = pd.concat(train_df, ignore_index=True)

CPU times: user 14.8 s, sys: 9.87 s, total: 24.7 s
Wall time: 24.7 s


In [7]:
# (rows, columns) of the concatenated training frame.
train_df.shape

(18844701, 119)

In [9]:
%%time
# Same feature pipeline for the validation prefix; returns a list of per-file frames.
valid_df = prepare_data(tfn2)

(7546775, 96)
(5798729, 96)
(5360801, 96)
CPU times: user 7min 29s, sys: 2min 59s, total: 10min 29s
Wall time: 10min 29s


In [10]:
%%time
# Stack the per-file frames into one frame with a fresh 0..n-1 index.
# Note: valid_df changes from a list to a DataFrame here, so this cell
# cannot be re-run without re-running the load cell first.
valid_df = pd.concat(valid_df, ignore_index=True)

CPU times: user 16.4 s, sys: 11.7 s, total: 28.2 s
Wall time: 28.2 s


In [11]:
# (rows, columns) of the concatenated validation frame.
valid_df.shape

(18706305, 119)

In [12]:
# Class balance of `label` in the training frame.
train_df.label.value_counts()

0    17808387
1     1036314
Name: label, dtype: int64

In [13]:
# Class balance of `label` in the validation frame.
valid_df.label.value_counts()

0    17676533
1     1029772
Name: label, dtype: int64

In [14]:
%%time
# Fold the validation rows into the training frame; cross-validation folds
# are assigned over the combined data further down.
train_df = pd.concat([train_df, valid_df], axis=0, ignore_index=True)

# The standalone validation frame is no longer needed; release its memory.
del valid_df
gc.collect()

CPU times: user 58.6 s, sys: 1min 58s, total: 2min 56s
Wall time: 2min 57s


0

In [16]:
# Columns fed to the model: the training cell passes train_df[TRAIN_FEATURES]
# to fit_lgbm, and plot_importances uses this list (in this order) as the
# labels for the model's feature_importances_. It mixes columns read from the
# processed CSVs with the aggregates added in prepare_data.
TRAIN_FEATURES = ['kdist', 'kneighbors', 'kdist_country', 'kneighbors_country', 'name_sim', 'name_gesh','name_leven', 
                'name_jaro', 'name_lcs', 'name_len_diff', 'name_nleven', 'name_nlcsk', 'name_nlcs', 'address_sim', 
                'address_gesh', 'address_leven', 'address_jaro', 'address_lcs', 'address_len_diff', 'address_nleven', 
                'address_nlcsk', 'address_nlcs', 'city_gesh', 'city_leven', 'city_jaro', 'city_lcs', 'city_len_diff', 
                'city_nleven', 'city_nlcsk', 'city_nlcs', 'state_sim', 'state_gesh', 'state_leven', 'state_jaro', 
                'state_lcs', 'state_len_diff', 'state_nleven', 'state_nlcsk', 'state_nlcs', 'zip_gesh', 'zip_leven', 
                'zip_jaro', 'zip_lcs', 'url_sim', 'url_gesh', 'url_leven', 'url_jaro', 'url_lcs', 'url_len_diff', 
                'url_nleven', 'url_nlcsk', 'url_nlcs', 'phone_gesh', 'phone_leven', 'phone_jaro', 'phone_lcs', 
                'categories_sim', 'categories_gesh', 'categories_leven', 'categories_jaro', 'categories_lcs', 
                'categories_len_diff', 'categories_nleven', 'categories_nlcsk', 'categories_nlcs', 'country_sim', 
                'country_gesh', 'country_leven', 'country_nleven', 'kdist_diff', 'kneighbors_mean', 
                'sim_sum', 'gesh_sum', 'leven_sum', 'jaro_sum','lcs_sum', 'sim_std', 'gesh_std', 'leven_std',
                'jaro_std', 'lcs_std', 'info_diff', 'nleven_sum', 'nlcsk_sum', 'nlcs_sum', 'nleven_std', 
                'nlcsk_std', 'nlcs_std', 'sim_mean','jaro_mean','lcs_mean','kdist_diff_x_info_diff',
                'name_w_ratio', 'name_partial_ratio', 'name_tokenset_ratio', 'name_tokensort_ratio', 
                'name_fuzz_power', 'categories_w_ratio', 'categories_partial_ratio', 'categories_tokenset_ratio', 
                'categories_tokensort_ratio', 'categories_fuzz_power', 'address_w_ratio', 'address_partial_ratio', 
                'address_tokenset_ratio', 'address_tokensort_ratio', 'address_fuzz_power', 
                'dlon','dlat','country_x_poi_count_mean', 'name_similarity'
                ]

In [17]:
%%time
NFOLDS = 5
kf = StratifiedKFold(n_splits = NFOLDS, shuffle=True, random_state=42)

# kf.split yields *positional* row indices, so collect the fold ids in a plain
# array and attach it as a column once. This does not rely on train_df having
# a 0..n-1 index. The array is float64 (values 0.0 .. NFOLDS-1) with NaN
# marking "not assigned yet".
# NOTE(review): folds are stratified per row, so the same `id` can land in
# several folds — confirm that row-level CV is what is wanted here.
fold_ids = np.full(len(train_df), np.nan)
# StratifiedKFold only stratifies on y and ignores `groups`, so none is passed.
# `total` gives tqdm a length, which enumerate() hides.
for i, (trn_idx, val_idx) in tqdm(enumerate(kf.split(train_df, train_df["label"])), total=NFOLDS):
    fold_ids[val_idx] = i

assert not np.isnan(fold_ids).any(), "every row must fall into exactly one validation fold"
train_df["fold"] = fold_ids

0it [00:00, ?it/s]

CPU times: user 1min 1s, sys: 57.1 s, total: 1min 58s
Wall time: 1min 58s


In [18]:
# Share of each label value in the combined (train + valid) frame.
train_df.label.value_counts()/len(train_df)

0    0.944979
1    0.055021
Name: label, dtype: float64

In [19]:
import lightgbm as lgbm

def fit_lgbm(X, y, params=None, es_rounds=20, seed=42, N_SPLITS=5, 
             n_class=None, model_dir=None, folds=None):
    """Train (or reload) one LightGBM classifier per fold and collect out-of-fold predictions.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally via ``.iloc``.
    y : pandas.Series
        Integer class labels, aligned row-for-row with ``X``.
    params : dict, optional
        Keyword arguments for ``lgbm.LGBMClassifier``; ``None`` means library defaults.
    es_rounds : int
        Early-stopping patience forwarded to ``model.fit``.
    seed : int
        Currently unused (seeding is done through ``params``); kept so existing
        call sites stay valid.
    N_SPLITS : int
        Number of folds to process; fold ids ``0 .. N_SPLITS-1`` are looked up in ``folds``.
    n_class : int, optional
        Width of the out-of-fold array. Defaults to ``max(y) + 1``.
    model_dir : str, optional
        If given, no training happens: for each fold ``{model_file_name}{i}.pkl``
        is unpickled from this directory. If ``None``, models are trained and
        pickled to ``../src/models/ScaledModels/``.
    folds : array-like
        Fold id of every row, same length as ``y``.

    Returns
    -------
    oof : numpy.ndarray, shape ``(len(y), n_class)``
        ``predict_proba`` output of each row's held-out model.
    models : list
        The fitted (or loaded) model of every fold, in fold order.

    Raises
    ------
    ValueError
        If ``folds`` is missing or its length differs from ``y``.
    """
    if folds is None:
        raise ValueError("folds must provide the fold id of every row")
    # Plain ndarray so that the boolean masks below work with .iloc
    # (a boolean Series mask is rejected by .iloc).
    folds = np.asarray(folds)
    if len(folds) != len(y):
        raise ValueError(f"folds has {len(folds)} entries but y has {len(y)}")
    if n_class is None:
        n_class = int(np.max(y)) + 1
    if params is None:
        params = {}

    models = []
    oof = np.zeros((len(y), n_class), dtype=np.float64)
    
    # Iterate over the requested number of splits, not the notebook-level NFOLDS.
    for i in tqdm(range(N_SPLITS)):
        print(f"== fold {i} ==")
        val_idx = folds==i
        X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

        if model_dir is None:
            # The training slice is only materialised when a model is actually fitted.
            trn_idx = folds!=i
            X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]

            model = lgbm.LGBMClassifier(**params)
            # NOTE(review): `early_stopping_rounds` / `verbose` are accepted by
            # LightGBM 3.x only; 4.x expects callbacks
            # (lgbm.early_stopping / lgbm.log_evaluation) instead.
            model.fit(
                X_train, y_train, 
                eval_set=[(X_valid, y_valid)],  
                early_stopping_rounds=es_rounds, 
                verbose=50)

            # Persist only freshly trained models; the context manager makes
            # sure the file is flushed and closed.
            with open(f'../src/models/ScaledModels/{model_file_name}{i}.pkl', 'wb') as f:
                pickle.dump(model, f)
        else:
            # SECURITY: unpickling executes arbitrary code — only point
            # model_dir at files you created yourself.
            with open(os.path.join(model_dir, f'{model_file_name}{i}.pkl'), 'rb') as f:
                model = pickle.load(f)
            
        oof[val_idx] = model.predict_proba(X_valid)
        models.append(model)
        print()

    # Accuracy of the arg-max class over all out-of-fold rows.
    cv = (oof.argmax(axis=-1) == y).mean()
    print(f"CV-accuracy: {cv}")

    return oof, models

def inference_lgbm(models, feat_df):
    """Average the ``predict_proba`` output of several models.

    Parameters
    ----------
    models : sequence
        Objects exposing ``predict_proba(feat_df)``.
    feat_df : array-like or DataFrame
        Features handed unchanged to every model.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the per-model probability arrays.
    """
    per_model = [fitted.predict_proba(feat_df) for fitted in models]
    # Stack along a new leading "model" axis, then average that axis away.
    return np.array(per_model).mean(axis=0)

In [None]:
# Hyper-parameters forwarded verbatim to lgbm.LGBMClassifier by fit_lgbm.
params = {
    'boosting_type': 'gbdt', 
    'objective': 'binary', 
    'tree_learner': 'feature',
    # Two metrics are logged per evaluation round (see the output below).
    'metric': ['binary_logloss', 'AUC'],
    'learning_rate': 0.3,
    'reg_alpha': 0.1,
    'reg_lambda': 0.682,
    'random_state': 2018,
    'max_depth': 9,
    'num_leaves': 127, 
    # Upper bound on boosting rounds; fit_lgbm also requests early stopping.
    'n_estimators': 3000,
    "colsample_bytree": 0.785,
    # NOTE(review): per the LightGBM docs this restricts early stopping to the
    # first listed metric — confirm binary_logloss is the intended criterion.
    "first_metric_only": True,
    "max_bin": 512
}

# n_class is derived from the largest label value; `fold` is the per-row fold
# id column written by the StratifiedKFold cell.
oof, models = fit_lgbm(train_df[TRAIN_FEATURES], train_df["label"].astype(int), 
                       params=params, n_class=int(train_df["label"].max() + 1), 
                       N_SPLITS=NFOLDS, folds=train_df["fold"].values)

  0%|          | 0/5 [00:00<?, ?it/s]

== fold 0 ==
[50]	valid_0's binary_logloss: 0.0283144	valid_0's auc: 0.996188
[100]	valid_0's binary_logloss: 0.0267797	valid_0's auc: 0.996621
[150]	valid_0's binary_logloss: 0.0258853	valid_0's auc: 0.99688
[200]	valid_0's binary_logloss: 0.0252718	valid_0's auc: 0.997043
[250]	valid_0's binary_logloss: 0.0247781	valid_0's auc: 0.997179
[300]	valid_0's binary_logloss: 0.0243312	valid_0's auc: 0.997303
[350]	valid_0's binary_logloss: 0.0239429	valid_0's auc: 0.997402
[400]	valid_0's binary_logloss: 0.0235855	valid_0's auc: 0.997486
[450]	valid_0's binary_logloss: 0.0232416	valid_0's auc: 0.997564
[500]	valid_0's binary_logloss: 0.0229268	valid_0's auc: 0.997639
[550]	valid_0's binary_logloss: 0.0226654	valid_0's auc: 0.997699
[600]	valid_0's binary_logloss: 0.022333	valid_0's auc: 0.997765
[650]	valid_0's binary_logloss: 0.0221325	valid_0's auc: 0.997806
[700]	valid_0's binary_logloss: 0.0218676	valid_0's auc: 0.99786
[750]	valid_0's binary_logloss: 0.0216085	valid_0's auc: 0.997915
[

[200]	valid_0's binary_logloss: 0.0254465	valid_0's auc: 0.99701
[250]	valid_0's binary_logloss: 0.0248903	valid_0's auc: 0.997154
[300]	valid_0's binary_logloss: 0.0244962	valid_0's auc: 0.99725
[350]	valid_0's binary_logloss: 0.0240988	valid_0's auc: 0.997363
[400]	valid_0's binary_logloss: 0.0237461	valid_0's auc: 0.997453
[450]	valid_0's binary_logloss: 0.023406	valid_0's auc: 0.997539
[500]	valid_0's binary_logloss: 0.0231145	valid_0's auc: 0.997604
[550]	valid_0's binary_logloss: 0.0228231	valid_0's auc: 0.997665
[600]	valid_0's binary_logloss: 0.0225422	valid_0's auc: 0.997717
[650]	valid_0's binary_logloss: 0.0222601	valid_0's auc: 0.997773
[700]	valid_0's binary_logloss: 0.0220047	valid_0's auc: 0.997822
[750]	valid_0's binary_logloss: 0.0217762	valid_0's auc: 0.997865
[800]	valid_0's binary_logloss: 0.0215335	valid_0's auc: 0.997918
[850]	valid_0's binary_logloss: 0.021322	valid_0's auc: 0.997954
[900]	valid_0's binary_logloss: 0.0210714	valid_0's auc: 0.998
[950]	valid_0's b

In [None]:
def plot_importances(models):
    """Draw a bar chart of the first model's feature importances.

    Only ``models[0]`` is used. Bars are labelled with the module-level
    ``TRAIN_FEATURES`` list and sorted from most to least important.

    Parameters
    ----------
    models : sequence
        Fitted models exposing ``feature_importances_``.
    """
    first_model = models[0]
    ranked = pd.DataFrame(
        {'importance': first_model.feature_importances_},
        index=TRAIN_FEATURES,
    ).sort_values(by="importance", ascending=False)

    # Figure width grows with the number of features so tick labels stay legible.
    fig, ax = plt.subplots(figsize=(len(TRAIN_FEATURES) // 4, 5))
    ax.bar(ranked.index, ranked['importance'])
    ax.grid()
    plt.xticks(rotation=90)
    ax.set_ylabel("importance")
    fig.tight_layout()
    plt.show()

# Plots importances for the list returned by fit_lgbm; only models[0] is drawn.
plot_importances(models)

In [31]:
# !kaggle datasets init -p ../src/models/BinaryModels

In [32]:
# !kaggle datasets create -p ../src/models/BinaryModels --dir-mode zip

In [None]:
# Publishes a new version of the Kaggle dataset from ../src/models/BinaryModels.
# NOTE(review): fit_lgbm writes its pickles to ../src/models/ScaledModels —
# confirm that the directory uploaded here actually holds the new models.
!kaggle datasets version -m "NewModel" -p ../src/models/BinaryModels --dir-mode zip