In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import scipy
from tqdm import tqdm_notebook as tqdm

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

# Raise the display limits so wide/long frames are shown without truncation.
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Read the train and test splits from ../input/cat-in-the-dat.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/cat-in-the-dat/train.csv',
        '../input/cat-in-the-dat/test.csv',
    )
)

In [3]:
# Stack train on top of test so the categorical encodings below are built over
# both splits at once. The test frame has no `target` column, so the two frames
# are not column-aligned; sort=False states the column-ordering behaviour
# explicitly instead of relying on the deprecated implicit sort (the source of
# the pandas FutureWarning). Later cells select columns by name, so the
# resulting column order does not affect them.
ddall = pd.concat([train_data, test_data], axis=0, sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [4]:
# Preview the first three rows of the training split.
train_data.head(3)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0


In [5]:
# Preview the first three rows of the test split.
test_data.head(3)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,300000,0,0,1,T,Y,Blue,Triangle,Axolotl,Finland,Piano,0870b0a5d,9ceb19dd6,530f8ecc3,9d117320c,3c49b42b8,2,Novice,Warm,j,P,be,5,11
1,300001,0,0,0,T,N,Red,Square,Lion,Canada,Piano,a5c276589,1ad744242,12e6161c9,46ae3059c,285771075,1,Master,Lava Hot,l,A,RP,7,5
2,300002,1,0,1,F,Y,Blue,Square,Dog,China,Piano,568550f04,1fe17a1fd,27d6df03f,b759e21f0,6f323c53f,2,Expert,Freezing,a,G,tP,1,12


In [6]:
# Number of distinct values per column in the training split.
train_data.nunique()

id        300000
bin_0          2
bin_1          2
bin_2          2
bin_3          2
bin_4          2
nom_0          3
nom_1          6
nom_2          6
nom_3          6
nom_4          4
nom_5        222
nom_6        522
nom_7       1220
nom_8       2215
nom_9      11981
ord_0          3
ord_1          5
ord_2          6
ord_3         15
ord_4         26
ord_5        192
day            7
month         12
target         2
dtype: int64

In [7]:
# Number of distinct values per column in the test split, for comparison with train.
test_data.nunique()

id       200000
bin_0         2
bin_1         2
bin_2         2
bin_3         2
bin_4         2
nom_0         3
nom_1         6
nom_2         6
nom_3         6
nom_4         4
nom_5       222
nom_6       522
nom_7      1219
nom_8      2214
nom_9     11839
ord_0         3
ord_1         5
ord_2         6
ord_3        15
ord_4        26
ord_5       192
day           7
month        12
dtype: int64

In [8]:
# ord_5 is a two-letter code: expose each letter as its own column and retire
# the combined one. This is the only step that is not generic and actually
# required inspecting the data.
ddall["ord_5a"] = ddall["ord_5"].str.get(0)
ddall["ord_5b"] = ddall["ord_5"].str.get(1)

# Columns that are left out of the feature matrix.
drop_cols = ["bin_0", "ord_5"]

In [9]:
# For the high-cardinality nominal columns, every level that occurs in only one
# of the two splits (train XOR test) is collapsed into a single "xor" bucket.
for nom_col in ["nom_5", "nom_6", "nom_7", "nom_8", "nom_9"]:
    only_in_one_split = set(train_data[nom_col].unique()).symmetric_difference(
        test_data[nom_col].unique()
    )
    if not only_in_one_split:
        continue
    ddall.loc[ddall[nom_col].isin(only_in_one_split), nom_col] = "xor"

In [10]:
# Show the columns that will be excluded from the feature matrix.
drop_cols

['bin_0', 'ord_5']

In [11]:
# Feature frame: every column except the row id, the label and the dropped columns.
non_feature_cols = ["id", "target"] + drop_cols
X = ddall[ddall.columns.difference(non_feature_cols)]

In [12]:
# One-hot encode every feature column except the six listed here, dropping the
# first level of each column, and keep the result sparse.
# NOTE(review): ord_2 and ord_3 are not excluded here although they are also
# thermometer-encoded in a later cell, while ord_5b is excluded here and is not
# thermometer-encoded there — confirm that this asymmetry is intended.
X_oh = X[X.columns.difference(["ord_1", "ord_4", "ord_5a", "ord_5b", "day", "month"])]
oh1 = pd.get_dummies(X_oh, columns=X_oh.columns, drop_first=True, sparse=True)

# Convert to a scipy COO matrix. Legacy pandas returns a SparseDataFrame that
# has its own .to_coo(); when that method is absent (newer pandas returns a
# regular DataFrame with sparse columns) use the .sparse accessor instead.
if hasattr(oh1, "to_coo"):
    ohc1 = oh1.to_coo()
else:
    ohc1 = oh1.sparse.to_coo()

In [13]:
# Shape of the one-hot block (rows, one-hot columns).
ohc1.shape

(500000, 15968)

In [14]:
from sklearn.base import TransformerMixin
from itertools import repeat
import scipy


class ThermometerEncoder(TransformerMixin):
    """
    Thermometer (cumulative) encoding of one ordered categorical column.

    The distinct values seen by ``fit`` are ranked ``0 .. k-1`` by sorting them
    with ``sort_key``. A sample of rank ``r`` is encoded with ones in output
    columns ``0 .. r-1`` and zeros elsewhere, i.e. column ``j`` is 1 exactly
    when ``r > j``.

    Assumes all values are known at fit: a value that ``fit`` did not see maps
    to NaN and is therefore encoded as an all-zero row, the same encoding as
    the lowest-ranked category.

    Parameters
    ----------
    sort_key : callable, optional
        ``key`` function handed to ``sorted`` to define the category order.
        ``None`` uses the natural ordering of the values.

    Attributes
    ----------
    value_map_ : dict or None
        Mapping from category value to its rank; ``None`` until ``fit`` is called.
    """
    def __init__(self, sort_key=None):
        self.sort_key = sort_key
        self.value_map_ = None

    def fit(self, X, y=None):
        """
        Learn the value -> rank mapping from the distinct values of ``X``.

        ``X`` must provide ``.unique()`` (e.g. a pandas Series); ``y`` is ignored.
        Returns ``self``.
        """
        ordered_values = sorted(X.unique(), key=self.sort_key)
        self.value_map_ = {val: rank for rank, val in enumerate(ordered_values)}
        return self

    def transform(self, X, y=None):
        """
        Encode ``X`` as a sparse thermometer matrix.

        ``X`` must provide ``.map()`` (e.g. a pandas Series); ``y`` is ignored.

        Returns
        -------
        scipy.sparse.coo_matrix
            int8 matrix of shape ``(len(X), k)`` where ``k`` is the number of
            fitted categories. The last column is never set (no rank exceeds
            ``k - 1``); it is kept so the output width stays ``k``.
        """
        values = X.map(self.value_map_)
        possible_values = sorted(self.value_map_.values())
        all_indices = np.arange(len(X))

        # Collect one (rows, cols) chunk per threshold as NumPy arrays instead
        # of extending Python lists element by element, which is very slow and
        # memory-hungry for hundreds of thousands of rows.
        row_chunks = []
        col_chunks = []
        for col_idx, threshold in enumerate(possible_values[:-1]):
            # Comparison stays on the mapped values so NaN (unseen) rows are
            # simply False, exactly as before.
            rows = all_indices[np.asarray(values > threshold)]
            row_chunks.append(rows)
            col_chunks.append(np.full(len(rows), col_idx, dtype=rows.dtype))

        if row_chunks:
            row_idx = np.concatenate(row_chunks)
            col_idx_all = np.concatenate(col_chunks)
        else:
            # Zero or one fitted category: there is nothing to mark.
            row_idx = np.empty(0, dtype=all_indices.dtype)
            col_idx_all = np.empty(0, dtype=all_indices.dtype)

        data = np.ones(len(row_idx), dtype="int8")
        result = scipy.sparse.coo_matrix(
            (data, (row_idx, col_idx_all)),
            shape=(len(X), len(possible_values)),
            dtype="int8",
        )

        return result

In [15]:
# Category order for each thermometer-encoded column: the two named scales are
# ranked by their position in an explicit list, the letter codes are sorted as
# strings, and day/month are sorted as integers.
ord_1_levels = ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster']
ord_2_levels = ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot']
thermo_sort_keys = [
    ("ord_1", ord_1_levels.index),
    ("ord_2", ord_2_levels.index),
    ("ord_3", str),
    ("ord_4", str),
    ("ord_5a", str),
    ("day", int),
    ("month", int),
]

# One sparse thermometer block per column, in the order listed above.
thermos = []
for col, sort_key in thermo_sort_keys:
    enc = ThermometerEncoder(sort_key=sort_key)
    thermos.append(enc.fit_transform(X[col]))

In [16]:
# Design matrix: the one-hot block followed by the thermometer blocks, stored
# as CSR so it can be sliced by row.
ohc = scipy.sparse.hstack([ohc1, *thermos]).tocsr()
display(ohc)

# ddall was built as train rows followed by test rows, so a positional split
# at the number of training rows recovers the two sets.
num_train = len(train_data)
X_train, X_test = ohc[:num_train], ohc[num_train:]
y_train = train_data["target"].values

<500000x16091 sparse matrix of type '<class 'numpy.int16'>'
	with 34244885 stored elements in Compressed Sparse Row format>

In [17]:
from sklearn.decomposition import FactorAnalysis, FastICA, NMF, SparsePCA

In [18]:
# Factorise the one-hot block (train and test rows together) into 100
# non-negative components. random_state is fixed so the embedding — and every
# score computed from it — is reproducible across kernel restarts.
nmf = NMF(n_components=100, random_state=0)
nmf_embedding = nmf.fit_transform(ohc1)

In [19]:
# Add the NMF components to the feature matrix. The matrix is rebuilt from its
# parts (one-hot block, thermometer blocks, NMF embedding) rather than from
# `ohc` itself, so re-running this cell does not stack another copy of the
# embedding onto an already-extended matrix.
ohc = scipy.sparse.hstack([ohc1] + thermos + [nmf_embedding]).tocsr()
display(ohc)

# Train rows come first in the stacked matrix; split positionally.
num_train = train_data.shape[0]
X_train = ohc[:num_train]
X_test = ohc[num_train:]
y_train = train_data["target"].values

<500000x16191 sparse matrix of type '<class 'numpy.float64'>'
	with 57320091 stored elements in Compressed Sparse Row format>

In [20]:
# Logistic regression with the L-BFGS solver; the iteration cap is raised to
# 5000 and the inverse regularisation strength C is set to 0.123456789.
clf = LogisticRegression(
    solver="lbfgs",
    max_iter=5000,
    C=0.123456789,
)

In [21]:
from sklearn.model_selection import cross_validate

# 3-fold cross-validated ROC AUC of the logistic regression on the training rows.
cv_results = cross_validate(clf, X_train, y_train, cv=3, scoring="roc_auc")
score = cv_results["test_score"].mean()
print(f"{score:.6f}")

0.802517


In [22]:
# Fit on all training rows, then keep the predicted probability of the
# positive class (column 1) for every test row.
test_proba = clf.fit(X_train, y_train).predict_proba(X_test)
pred = test_proba[:, 1]



NameError: name 'ddtest0' is not defined

In [23]:
# Pair each test id with its predicted probability and write the submission file.
submission = pd.DataFrame({"id": test_data["id"], "target": pred})
submission.to_csv("submission.csv", index=False)

In [25]:
# LightGBM settings, later unpacked into lgb.LGBMClassifier(**params).
params = dict(
    # Row / feature subsampling (both fractions are 1.0).
    bagging_freq=5,
    bagging_fraction=1.0,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=1.0,
    # Step size and tree shape.
    learning_rate=0.005,
    max_depth=-1,
    metric='binary_logloss',
    min_data_in_leaf=30,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=64,
    # Task definition and logging.
    tree_learner='serial',
    objective='binary',
    verbosity=-1,
)

In [28]:
# Build the LightGBM classifier from the parameter dict above.
# NOTE(review): n_estimators is not set here, so the estimator's repr shows the
# default of 100 trees alongside learning_rate=0.005 — confirm this is intended.
lgb_model = lgb.LGBMClassifier(**params)

In [29]:
# Display the classifier's full parameter set, including defaults not given in `params`.
lgb_model

LGBMClassifier(bagging_fraction=1.0, bagging_freq=5, boost='gbdt',
        boost_from_average='false', boosting_type='gbdt',
        class_weight=None, colsample_bytree=1.0, feature_fraction=1.0,
        importance_type='split', learning_rate=0.005, max_depth=-1,
        metric='binary_logloss', min_child_samples=20,
        min_child_weight=0.001, min_data_in_leaf=30, min_split_gain=0.0,
        min_sum_hessian_in_leaf=10.0, n_estimators=100, n_jobs=-1,
        num_leaves=64, objective='binary', random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0, tree_learner='serial',
        verbosity=-1)

In [None]:
# 3-fold cross-validated ROC AUC of the LightGBM model on the training rows.
# Print score2 (this model's result); printing `score` would silently repeat
# the logistic-regression number from the earlier cell.
score2 = cross_validate(lgb_model, X_train, y_train, cv=3, scoring="roc_auc")["test_score"].mean()
print(f"{score2:.6f}")

### Useful links on categorical encodings

1. <b> Comprehensive encoding techniques </b> - https://www.kaggle.com/shahules/an-overview-of-encoding-techniques
2. <b> 2nd-place solution </b> - https://www.kaggle.com/adaubas/2nd-place-solution-categorical-fe-callenge
3. <b> Deep learning embedding </b> - https://www.kaggle.com/abhishek/entity-embeddings-to-handle-categories
4. <b> Target encoding </b> - https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features