# Inspiration: [Kaggle Notebook](https://www.kaggle.com/code/anasnofal/1st-place-solution)

In [112]:
import numpy as np
import pandas as pd
import scipy
import sklearn
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate,cross_val_score,StratifiedKFold

df_train = pd.read_csv('train.csv', index_col='id')
df_test = pd.read_csv('test.csv', index_col='id')

binvar = ['bin_' + str(i) for i in range(1,5)]
ordvar = ['ord_' + str(i) for i in range(6)]
nomvar = ['nom_' + str(i) for i in range(10)]

dmvar  = ['day', 'month']

train_test = df_train.copy()
y_train = train_test['target'].copy()

train_test = pd.concat([train_test.drop('target', axis = 1), df_test])

#Bin vars
train_test.drop('bin_0', inplace=True, axis=1)
train_test['bin_3'] = train_test['bin_3'].map({'F':0, 'T':1})
train_test['bin_4'] = train_test['bin_4'].map({'N':0, 'Y':1})

#Ord vars
train_test['ord_0'] = train_test['ord_0'] - 1
ord1dict = {'Novice':0, 'Contributor':1, 'Expert':2, 'Master':3, 'Grandmaster':4}
train_test['ord_1'] = train_test['ord_1'].map(ord1dict)
ord2dict = {'Freezing':0, 'Cold':1, 'Warm':2, 'Hot':3, 'Boiling Hot':4, 'Lava Hot':5}
train_test['ord_2'] = train_test['ord_2'].map(ord2dict)

oe = OrdinalEncoder(categories='auto')
train_test[ordvar[3:]] = oe.fit_transform(train_test[ordvar[3:]])

for var, cl in zip(ordvar[3:], oe.categories_):
    print(var)
    print(cl)

#Scaling ord vars
train_test[ordvar] = StandardScaler().fit_transform(train_test[ordvar])
#Nom vars
# Definitely there is something covered in nom vars 5 - 9 but I didn't get it. Still cutting some letters works for me
train_test[nomvar[5:]].nunique()

train_test['nom_5'] = train_test['nom_5'].str[4:]
train_test['nom_6'] = train_test['nom_6'].str[3:]
train_test['nom_7'] = train_test['nom_7'].str[3:]
train_test['nom_8'] = train_test['nom_8'].str[3:]
train_test['nom_9'] = train_test['nom_9'].str[3:]
train_test[nomvar[5:]].nunique()

#One hot encoding nomvars & dmvars
enc = OneHotEncoder(categories = 'auto', dtype = 'float64', drop = 'first')
nom_matrix = enc.fit_transform(train_test[nomvar])
train_test.drop(nomvar, inplace=True, axis=1)
enc = OneHotEncoder(categories='auto', dtype = 'float64', drop = 'first')
dm_matrix = enc.fit_transform(train_test[dmvar])
train_test.drop(dmvar, inplace=True, axis=1)
# Note: Use C-ordered arrays or CSR matrices containing 64-bit floats for optimal performance; # any other input format will be converted (and copied).
train_test.columns

df_work_sprs =scipy.sparse.hstack([nom_matrix,
                                   scipy.sparse.coo_matrix(train_test).astype('float64'),
                                   dm_matrix]).tocsr()
display(df_work_sprs)

X_train = df_work_sprs[:y_train.shape[0]]
X_test = df_work_sprs[y_train.shape[0]:]

ord_3
['a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o']
ord_4
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R'
 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z']
ord_5
['AP' 'Ai' 'Aj' 'BA' 'BE' 'Bb' 'Bd' 'Bn' 'CL' 'CM' 'CU' 'CZ' 'Cl' 'DH'
 'DN' 'Dc' 'Dx' 'Ed' 'Eg' 'Er' 'FI' 'Fd' 'Fo' 'GD' 'GJ' 'Gb' 'Gx' 'Hj'
 'IK' 'Id' 'JX' 'Jc' 'Jf' 'Jt' 'KR' 'KZ' 'Kf' 'Kq' 'LE' 'MC' 'MO' 'MV'
 'Mf' 'Ml' 'Mx' 'NV' 'Nf' 'Nk' 'OR' 'Ob' 'Os' 'PA' 'PQ' 'PZ' 'Ps' 'QM'
 'Qb' 'Qh' 'Qo' 'RG' 'RL' 'RP' 'Rm' 'Ry' 'SB' 'Sc' 'TR' 'TZ' 'To' 'UO'
 'Uk' 'Uu' 'Vf' 'Vx' 'WE' 'Wc' 'Wv' 'XI' 'Xh' 'Xi' 'YC' 'Yb' 'Ye' 'ZR'
 'ZS' 'Zc' 'Zq' 'aF' 'aM' 'aO' 'aP' 'ac' 'av' 'bF' 'bJ' 'be' 'cA' 'cG'
 'cW' 'ck' 'cp' 'dB' 'dE' 'dN' 'dO' 'dP' 'dQ' 'dZ' 'dh' 'eG' 'eQ' 'eb'
 'eg' 'ek' 'ex' 'fO' 'fh' 'gJ' 'gM' 'hL' 'hT' 'hh' 'hp' 'iT' 'ih' 'jS'
 'jV' 'je' 'jp' 'kC' 'kE' 'kK' 'kL' 'kU' 'kW' 'ke' 'kr' 'kw' 'lF' 'lL'
 'll' 'lx' 'mb' 'mc' 'mm' 'nX' 'nh' 'oC' 'oG' 'oH' 'oK' 'od' 'on' 'pa'
 'ps' 'qA' 'qJ' 'qK' 'qP' 'qX' '

<500000x16288 sparse matrix of type '<class 'numpy.float64'>'
	with 9235121 stored elements in Compressed Sparse Row format>

In [117]:
y = y_train
X = X_train

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

accs = []
aucs = []

for train_index, test_index in skf.split(X, y):

    #splitting the data
    x_train_fold, x_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]

    print("Shape: ", x_train_fold.shape)

    #scaling the data
    # scaler = preprocessing.StandardScaler()
    # x_train_fold = scaler.fit_transform(x_train_fold)
    # x_test_fold = scaler.transform(x_test_fold)

    model=LogisticRegression(
        # max_iter=10000,
        # solver='lbfgs',
        # n_jobs=-1
        )

    # fit the model on the train fold
    model.fit(x_train_fold, y_train_fold)

    # predict labels for test fold
    y_pred = model.predict(x_test_fold)
    
    fpr, tpr, thresholds = metrics.roc_curve(y_test_fold, y_pred)
    curr_auc = metrics.auc(fpr, tpr)
    aucs.append(curr_auc)

    curr_acc = accuracy_score(y_test_fold, y_pred)
    accs.append(curr_acc)

    print(curr_acc)
    print(curr_auc)
    if curr_acc >= max(accs):
        print("Best Model")
        best_model = model
    print('\n')

print( "Average accuracy: ", sum(accs)/len(accs))
print( "Average accuracy: ", sum(aucs)/len(aucs))

Shape:  (270000, 16288)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7603666666666666
0.6816350534106412
Best Model


Shape:  (270000, 16288)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7640333333333333
0.6835447354521189
Best Model


Shape:  (270000, 16288)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7597666666666667
0.681599089609233


Shape:  (270000, 16288)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7629333333333334
0.6872023461628507


Shape:  (270000, 16288)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7606666666666667
0.6799309595493899


Shape:  (270000, 16288)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7609
0.6840003746264203


Shape:  (270000, 16288)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7609
0.6822581672951759


Shape:  (270000, 16288)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7642333333333333
0.684567941907097
Best Model


Shape:  (270000, 16288)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7627
0.6827625351471486


Shape:  (270000, 16288)
0.7629
0.6872641206632147


Average accuracy:  0.7619400000000001
Average accuracy:  0.6834765323823289


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [118]:
y_pred = best_model.predict_proba(X_test)[:,1]

In [119]:
y_pred

array([0.36380056, 0.63874652, 0.07776148, ..., 0.29588828, 0.62890853,
       0.23818924])

In [121]:
submission = pd.DataFrame()

submission['id'] = test_df['id']
submission['target'] = y_pred

In [122]:
submission.head()

Unnamed: 0,id,target
0,300000,0.363801
1,300001,0.638747
2,300002,0.077761
3,300003,0.478127
4,300004,0.89756


In [123]:
submission.to_csv("submission_v2.csv", index=False)