In [298]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection, linear_model, metrics, pipeline, preprocessing, impute


Імпорт даних: для зручності роботи виділимо цільовий стовпець в окремий об'єкт.

In [299]:
# Directory holding the competition CSV files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path_data = 'D:/kaggle-competetions/Tabular-Sep-2021/data'

train_df = pd.read_csv(f'{path_data}/train.csv')
X_test = pd.read_csv(f'{path_data}/test.csv')

# The target is taken to be the last column of the training table; every
# other column (including 'id') remains in the feature frame.
y_train = train_df.iloc[:, -1]
X_train = train_df.drop(columns=train_df.columns[-1])

# Quick sanity report: shapes of the three objects and the total NaN count.
print(f'Test shape: {X_test.shape}, X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'Count of null-values: {X_train.isna().sum().sum()}')

Test shape: (493474, 119), X_train shape: (957919, 119), y_train shape: (957919,)
Count of null-values: 1820782


# Simple data preparation

Додамо змінну з кількістю пропущених значень у кожному рядку.

In [301]:
# Per-row count of missing cells, stored as an extra feature. It has to be
# computed before imputation, while the NaNs are still present.
X_train['n_missing'] = X_train.isnull().sum(axis='columns')
X_test['n_missing'] = X_test.isnull().sum(axis='columns')

Використаємо **SimpleImputer**, щоб заповнити NaN-значення в нашому датасеті. Важливо, що fit_transform() повертає **ndarray**, а не pd.DataFrame, тому згодом, щоб застосовувати методи pd.DataFrame, об'єкт потрібно буде привести до цього типу.

In [303]:
# Keep the feature labels: the imputation step replaces X_train with a bare
# ndarray, and these labels are used later to rebuild labelled DataFrames.
columns_for_pred = X_train.columns
columns_for_pred

Index(['id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9',
       ...
       'f110', 'f111', 'f112', 'f113', 'f114', 'f115', 'f116', 'f117', 'f118',
       'n_missing'],
      dtype='object', length=120)

In [304]:
# Fill missing values with SimpleImputer in its default configuration.
si = impute.SimpleImputer()

# Learn the per-column fill values from the training features only.
# Note that the result is an ndarray, not a DataFrame.
X_train = si.fit_transform(X_train)

# Apply the fill values learned on the training set to the test features.
# Refitting here (fit_transform) would impute the test rows with test-set
# statistics, so train and test would be preprocessed inconsistently.
X_test = si.transform(X_test)

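Використаємо StandardScaler(), щоб масштабувати змінні: для цього навчимо його на тренувальних даних і застосуємо те саме перетворення до тренувального та тестового наборів.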

In [308]:
# Fit the scaler on the training features only, then push both sets through
# the same fitted transform and restore the column labels saved earlier.
scaler = preprocessing.StandardScaler().fit(X_train)

X_train = pd.DataFrame(scaler.transform(X_train), columns=columns_for_pred)
X_test = pd.DataFrame(scaler.transform(X_test), columns=columns_for_pred)

In [309]:
# Confirm that no missing values remain in the prepared training features.
print(f'Count of null-values: {X_train.isna().values.sum()}')

Count of null-values: 0


# Logistic regression 

In [312]:
# Cross-validated logistic regression with library defaults; only the random
# seed is pinned. Despite the variable name this is a classifier (its
# predict_proba is used further down).
simple_regressor = linear_model.LogisticRegressionCV(random_state=0)
# Show the effective configuration of the estimator.
simple_regressor.get_params()

{'Cs': 10,
 'class_weight': None,
 'cv': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1.0,
 'l1_ratios': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 0,
 'refit': True,
 'scoring': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0}

Спроба використати логістичну регресію з крос-валідацією 

In [313]:
%%time
simple_regressor.fit(X = X_train, y= y_train)

In [None]:
# Re-display the estimator's parameters after fitting; the recorded output is
# identical to the listing shown before the fit.
simple_regressor.get_params()

{'Cs': 10,
 'class_weight': None,
 'cv': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1.0,
 'l1_ratios': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 0,
 'refit': True,
 'scoring': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0}

In [None]:
# In-sample predictions: both are computed on the same data the model was
# fitted on, so the metrics derived from them are training-set scores.
# Probability of the second class (column 1), used for ROC AUC.
predicted_proba = simple_regressor.predict_proba(X_train)[:, 1]
# Hard class labels, used for the threshold-based metrics.
predicted_values = simple_regressor.predict(X_train)

In [None]:
# Training-set quality report, one "<metric name>: <value>" line per metric.
print(f'confusion_matrix: {metrics.confusion_matrix(y_train, predicted_values)}')

# These four metrics share the same (y_true, y_pred) call shape, so they are
# looked up by name on sklearn.metrics and printed in the listed order.
print(
    *(
        f'{metric_name}: {getattr(metrics, metric_name)(y_train, predicted_values)}'
        for metric_name in ('accuracy_score', 'recall_score', 'precision_score', 'f1_score')
    ),
    sep='\n',
)

# ROC AUC is computed from the predicted probabilities rather than hard labels.
print(f'roc_auc_score: {metrics.roc_auc_score(y_train, predicted_proba)}')

confusion_matrix: [[373951 106453]
 [144616 332899]]
accuracy_score: 0.7379016388650815
recall_score: 0.6971487806665759
precision_score: 0.7577045284874088
f1_score: 0.7261663905451936
roc_auc_score: 0.7992721749279486
precision_recall_curve: (array([0.49849413, 0.4984936 , 0.49849412, ..., 0.        , 0.        ,
       1.        ]), array([1.        , 0.99999791, 0.99999791, ..., 0.        , 0.        ,
       0.        ]), array([0.1678683 , 0.16813772, 0.16842743, ..., 0.9990535 , 0.99907623,
       0.99915887]))


# Make Submission! 

In [None]:
# Class-probability predictions for the prepared test features; column 1 of
# this array is what gets written to the submission file.
predictes_for_test = simple_regressor.predict_proba(X_test)

In [None]:
# X_test has been imputed and scaled by now, so the original ids are re-read
# from the raw test file and paired with the predicted probability of class 1.
submission = (
    pd.read_csv(f'{path_data}/test.csv', usecols=['id'])
    .assign(claim=predictes_for_test[:, 1])
)

# Write only the two columns, without the DataFrame index.
submission.to_csv('log_red.csv', index=False)