# Inspiration: [Kaggle Notebook](https://www.kaggle.com/code/anasnofal/1st-place-solution)

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import scipy
import sklearn
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate,cross_val_score,StratifiedKFold

In [5]:
df_train = pd.read_csv('../data/train.csv/train.csv', index_col='id')
df_test = pd.read_csv('../data/test.csv/test.csv', index_col='id')

In [6]:
binvar = ['bin_' + str(i) for i in range(1,5)]
ordvar = ['ord_' + str(i) for i in range(6)]
nomvar = ['nom_' + str(i) for i in range(10)]

dmvar  = ['day', 'month']

In [7]:
train_test = df_train.copy()
y_train = train_test['target'].copy()

train_test = pd.concat([train_test.drop('target', axis = 1), df_test])

In [8]:
#Bin vars
train_test.drop('bin_0', inplace=True, axis=1)
train_test['bin_3'] = train_test['bin_3'].map({'F':0, 'T':1})
train_test['bin_4'] = train_test['bin_4'].map({'N':0, 'Y':1})

In [9]:
#Ord vars
train_test['ord_0'] = train_test['ord_0'] - 1
ord1dict = {'Novice':0, 'Contributor':1, 'Expert':2, 'Master':3, 'Grandmaster':4}
train_test['ord_1'] = train_test['ord_1'].map(ord1dict)
ord2dict = {'Freezing':0, 'Cold':1, 'Warm':2, 'Hot':3, 'Boiling Hot':4, 'Lava Hot':5}
train_test['ord_2'] = train_test['ord_2'].map(ord2dict)

oe = OrdinalEncoder(categories='auto')
train_test[ordvar[3:]] = oe.fit_transform(train_test[ordvar[3:]])

for var, cl in zip(ordvar[3:], oe.categories_):
    print(var)
    print(cl)

#Scaling ord vars
train_test[ordvar] = StandardScaler().fit_transform(train_test[ordvar])

ord_3
['a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o']
ord_4
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R'
 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z']
ord_5
['AP' 'Ai' 'Aj' 'BA' 'BE' 'Bb' 'Bd' 'Bn' 'CL' 'CM' 'CU' 'CZ' 'Cl' 'DH'
 'DN' 'Dc' 'Dx' 'Ed' 'Eg' 'Er' 'FI' 'Fd' 'Fo' 'GD' 'GJ' 'Gb' 'Gx' 'Hj'
 'IK' 'Id' 'JX' 'Jc' 'Jf' 'Jt' 'KR' 'KZ' 'Kf' 'Kq' 'LE' 'MC' 'MO' 'MV'
 'Mf' 'Ml' 'Mx' 'NV' 'Nf' 'Nk' 'OR' 'Ob' 'Os' 'PA' 'PQ' 'PZ' 'Ps' 'QM'
 'Qb' 'Qh' 'Qo' 'RG' 'RL' 'RP' 'Rm' 'Ry' 'SB' 'Sc' 'TR' 'TZ' 'To' 'UO'
 'Uk' 'Uu' 'Vf' 'Vx' 'WE' 'Wc' 'Wv' 'XI' 'Xh' 'Xi' 'YC' 'Yb' 'Ye' 'ZR'
 'ZS' 'Zc' 'Zq' 'aF' 'aM' 'aO' 'aP' 'ac' 'av' 'bF' 'bJ' 'be' 'cA' 'cG'
 'cW' 'ck' 'cp' 'dB' 'dE' 'dN' 'dO' 'dP' 'dQ' 'dZ' 'dh' 'eG' 'eQ' 'eb'
 'eg' 'ek' 'ex' 'fO' 'fh' 'gJ' 'gM' 'hL' 'hT' 'hh' 'hp' 'iT' 'ih' 'jS'
 'jV' 'je' 'jp' 'kC' 'kE' 'kK' 'kL' 'kU' 'kW' 'ke' 'kr' 'kw' 'lF' 'lL'
 'll' 'lx' 'mb' 'mc' 'mm' 'nX' 'nh' 'oC' 'oG' 'oH' 'oK' 'od' 'on' 'pa'
 'ps' 'qA' 'qJ' 'qK' 'qP' 'qX' '

In [10]:
#Nom vars
# Definitely there is something covered in nom vars 5 - 9 but I didn't get it. Still cutting some letters works for me
train_test[nomvar[5:]].nunique()

train_test['nom_5'] = train_test['nom_5'].str[4:]
train_test['nom_6'] = train_test['nom_6'].str[3:]
train_test['nom_7'] = train_test['nom_7'].str[3:]
train_test['nom_8'] = train_test['nom_8'].str[3:]
train_test['nom_9'] = train_test['nom_9'].str[3:]
train_test[nomvar[5:]].nunique()

#One hot encoding nomvars & dmvars
enc = OneHotEncoder(categories = 'auto', dtype = 'float64', drop = 'first')
nom_matrix = enc.fit_transform(train_test[nomvar])
train_test.drop(nomvar, inplace=True, axis=1)
enc = OneHotEncoder(categories='auto', dtype = 'float64', drop = 'first')
dm_matrix = enc.fit_transform(train_test[dmvar])
train_test.drop(dmvar, inplace=True, axis=1)

In [11]:
# Note: Use C-ordered arrays or CSR matrices containing 64-bit floats for optimal performance; # any other input format will be converted (and copied).
train_test.columns

df_work_sprs =scipy.sparse.hstack([nom_matrix,
                                   scipy.sparse.coo_matrix(train_test).astype('float64'),
                                   dm_matrix]).tocsr()
display(df_work_sprs)

<500000x16288 sparse matrix of type '<class 'numpy.float64'>'
	with 9235121 stored elements in Compressed Sparse Row format>

In [12]:
X_train = df_work_sprs[:y_train.shape[0]]
X_test = df_work_sprs[y_train.shape[0]:]

In [13]:
from sklearn.model_selection import GridSearchCV

In [14]:
logModel = LogisticRegression()
param_grid = [    
    {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    # 'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter' : [100, 1000]
    }
]

clf = GridSearchCV(logModel, param_grid = param_grid, cv = 2, verbose=True, n_jobs=-1)

In [15]:
best_clf = clf.fit(X_train, y_train)

Fitting 2 folds for each of 40 candidates, totalling 80 fits


In [16]:
y_pred = best_clf.predict_proba(X_test)[:,1]

In [17]:
y_pred

array([0.32181265, 0.69756238, 0.09985319, ..., 0.32966738, 0.63461347,
       0.22172138])

In [20]:
submission = pd.DataFrame()
df_test = pd.read_csv('../data/test.csv/test.csv')

submission['id'] = df_test['id']
submission['target'] = y_pred

In [21]:
submission.head()

Unnamed: 0,id,target
0,300000,0.321813
1,300001,0.697562
2,300002,0.099853
3,300003,0.413403
4,300004,0.873041


In [22]:
submission.to_csv("../data/submission_v3.csv", index=False)