# Описание Датасета:
* client_id - идентификатор клиента
* education - уровень образования
* sex - пол заемщика
* age - возраст заемщика
* car - флаг наличия автомобиля
* car_type - флаг автомобиля иномарки
* decline_app_cnt - количество отказанных прошлых заявок
* good_work - флаг наличия “хорошей” работы
* bki_request_cnt - количество запросов в БКИ
* home_address - категоризация домашнего адреса
* work_address - категоризация рабочего адреса
* income - доход заемщика
* foreign_passport - наличие загранпаспорта
* sna - связь заемщика с клиентами банка
* first_time - давность наличия информации о заемщике
* score_bki - скоринговый балл по данным из БКИ
* region_rating - рейтинг региона
* app_date - дата подачи заявки
* default - флаг дефолта по кредиту 

# Import

In [553]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler
from imblearn.under_sampling import RandomUnderSampler
import warnings


# Functions

In [554]:
def education_num(value):
    """Map an education code to an ordinal rank.

    The codes 'SCH', 'GRD', 'UGR', 'PGR' and 'ACD' map to 1..5 in that
    order; any other value maps to 0.
    """
    ordered_codes = ('SCH', 'GRD', 'UGR', 'PGR', 'ACD')
    for rank, code in enumerate(ordered_codes, start=1):
        if value == code:
            return rank
    return 0

In [555]:
def cross_plot(series_1, series_2):
    """Show the contingency table of two series and draw it as a bar chart.

    The table has the values of ``series_1`` as rows and the values of
    ``series_2`` as columns; the chart title is the name of ``series_2``.
    """
    contingency = pd.crosstab(index=[series_1], columns=series_2)
    display(contingency)  # `display` comes from the notebook environment
    contingency.plot(title=series_2.name, kind='bar')

In [556]:
def time(value):
    """Parse a date string such as '01JAN2014' and return its POSIX timestamp.

    The string is parsed with the '%d%b%Y' format into a naive datetime,
    which ``timestamp()`` interprets in the local time zone.
    """
    return datetime.strptime(value, '%d%b%Y').timestamp()

In [557]:
def num_hist(series):
    """Plot the distribution of a numeric series in its own figure.

    Draws a density-normalised histogram with a KDE overlay and uses the
    series name as the title.

    ``sns.distplot`` is deprecated in seaborn; ``sns.histplot`` with
    ``kde=True`` and ``stat='density'`` is its documented replacement.
    """
    plt.figure()
    sns.histplot(series, kde=True, stat='density')
    plt.title(series.name)
    plt.show()

In [558]:
def get_boxplot(data, column):
    """Draw one horizontal boxplot per requested column.

    Parameters
    ----------
    data : DataFrame holding the columns to plot.
    column : a single column name or an iterable of column names.
    """
    # Plot exactly the columns that were passed in rather than relying on a
    # module-level list being defined.
    columns = [column] if isinstance(column, str) else list(column)
    for col in columns:
        fig, ax = plt.subplots(figsize=(14, 4))
        ax.set_title('Boxplot for ' + col)
        # Draw on the axes created above instead of the implicit current axes
        sns.boxplot(x=data[col], ax=ax)

In [559]:
def log_num(data, columns):
    """Replace each listed column with ``log(x + 1)`` of its values.

    The frame is modified in place and is also returned.
    """
    def shifted_log(x):
        return np.log(x + 1)

    for name in columns:
        data[name] = data[name].apply(shifted_log)
    return data

In [560]:
def num_heatmap(data, columns):
    """Draw an annotated heatmap of the pairwise correlations of ``columns``.

    The diagonal and the upper triangle are masked so each pair is shown
    once; the colour scale is fixed to the range 0..1.
    """
    corr_matrix = data[columns].corr()
    # Non-zero mask cells are hidden: the diagonal and everything above it
    hidden = np.triu(np.ones(corr_matrix.shape))
    with sns.axes_style("white"):
        _, axes = plt.subplots(figsize=(10, 10))
        sns.heatmap(corr_matrix, mask=hidden, annot=True, square=True,
                    vmin=0, vmax=1)

In [561]:
def increase_data(data, times=10):
    """Oversample the positive class by appending copies of its rows.

    Parameters
    ----------
    data : DataFrame with a ``default`` column.
    times : how many extra copies of the ``default == 1`` rows to append
        (10 by default).

    Returns
    -------
    A new DataFrame: the original rows followed by ``times`` copies of the
    rows where ``default == 1``. The input frame is not modified. Index
    labels of the copied rows are repeated, not renumbered.
    """
    positives = data[data.default == 1]
    # Build the result with a single concat and hand it back to the caller;
    # rebinding the local name alone would discard the work.
    return pd.concat([data] + [positives] * times)

In [562]:
def removal_condition(values):
    """Clamp a single value to the current outlier bounds.

    Reads the module-level ``lower_lim`` and ``upper_lim``, which must have
    been set beforehand. Returns the nearer bound when ``values`` lies
    outside them; otherwise returns ``values`` unchanged.
    """
    global lower_lim, upper_lim
    if values < lower_lim:
        return lower_lim
    return upper_lim if values > upper_lim else values

In [563]:
def remove_outliers(data, columns):
    """Clip each listed column to its 1.5 * IQR fences, in place.

    For every column the bounds ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` are
    computed, and values outside them are replaced by the nearer bound. Rows
    are never dropped. Missing values are ignored when computing the
    quartiles and are left untouched.

    The bounds of the last processed column are still published as the
    module-level ``lower_lim`` / ``upper_lim``, so code that reads those
    globals keeps working.

    Returns the same (modified) DataFrame.
    """
    global lower_lim
    global upper_lim
    for col in columns:
        # Series.quantile skips NaN, unlike np.quantile which would return NaN
        q1, q3 = data[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower_lim = q1 - (1.5 * iqr)
        upper_lim = q3 + (1.5 * iqr)
        # Vectorised clip instead of a per-element Python callback
        data[col] = data[col].clip(lower=lower_lim, upper=upper_lim)
    return data

# Data

In [564]:
import os
# Print the full path of every file found under the Kaggle input directory
all_paths = (
    os.path.join(root, file_name)
    for root, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for path in all_paths:
    print(path)

In [565]:
warnings.filterwarnings("ignore")

In [566]:
# Directory holding the competition files; the commented line below is a
# local alternative.
DATA_DIR = '/kaggle/input/sf-scoring/'
#DATA_DIR = './data'
# DATA_DIR already ends with '/', so the joined paths contain '//'
# (harmless on POSIX file systems).
df_train = pd.read_csv(DATA_DIR +'/train.csv')
df_test = pd.read_csv(DATA_DIR +'/test.csv')
sample_submission = pd.read_csv(DATA_DIR+'/sample_submission.csv')

In [567]:
df_train.info()

In [568]:
df_test.info()

In [569]:
# Tag each row with its origin so the two sets can be separated again later
df_train['sample'] = 1
df_test['sample'] = 0
# Placeholder target for the test rows so both frames share the same columns
df_test['default'] = 0
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent (test rows first, then train rows, with a fresh 0..n-1 index).
data = pd.concat([df_test, df_train], sort=False, ignore_index=True)

# EDA

In [570]:
data.sample(10)

In [571]:
data.nunique(dropna=False)

Как видно, в данных имеются бинарные переменные:
* sex
* car
* car_type
* good_work
* foreign_passport

In [572]:
list_binary = ['sex', 'car', 'car_type', 'good_work', 'foreign_passport']

In [573]:
# Columns that are neither binary flags nor the target / origin marker
list_other_columns = set.difference(set(data.columns), set(list_binary), {'default', 'sample'})
# For each remaining column print its name (in green), the number of distinct
# values and the distinct values themselves.
for i in list_other_columns:
    print(f'column:                \033[32m{i}\033[0m')
    print(f'Колличество значений:  {data[i].nunique()}')
    print(f'Уникальные значения:   {data[i].unique()}\n')

Из остальных переменных можно выделить

категориальные переменные:
* home_address
* decline_app_cnt
* first_time
* education
* region_rating
* sna
* work_address

числовые переменные:
* income
* score_bki
* age
* bki_request_cnt

даты:
* app_date

client_id не имеет смысла для модели

In [574]:
list_сategorical = ['home_address', 'decline_app_cnt', 'first_time', 
                    'education', 'region_rating', 'sna', 'work_address']
list_numeric = ['income', 'score_bki', 'age', 'bki_request_cnt']

In [575]:
# Test rows carry a placeholder default of 0, so only the training rows
# (sample == 1) say anything about the real class balance.
labelled = data[data['sample'] == 1]
sns.histplot(labelled.default)
print(f'Отношение значений default: {round(len(labelled[labelled.default == 0]) / len(labelled[labelled.default == 1]), 2)}')

Имеется дисбаланс классов целевой переменной default. 
Чтобы его сгладить, добавим строки с default = 1 ещё 10 раз.

In [576]:
# Append ten extra copies of the default == 1 rows to the combined frame
data_contact = data[data.default == 1]
data = pd.concat([data] + [data_contact] * 10)

In [577]:
# Test rows carry a placeholder default of 0, so only the training rows
# (sample == 1) say anything about the real class balance.
labelled = data[data['sample'] == 1]
sns.histplot(labelled.default)
print(f'Отношение значений default: {round(len(labelled[labelled.default == 0]) / len(labelled[labelled.default == 1]), 2)}')

In [578]:
for col in list_сategorical:
    cross_plot(data.default, data[col])

Из распределения данных относительно default видно, что все категориальные переменные имеют значимые отклонения для default=0 и default=1. Также видно, что переменная education имеет строковый тип.

Посмотрим на переменную app_date

In [579]:
data.app_date = data.app_date.apply(lambda x: datetime.strptime(x, '%d%b%Y'))

In [580]:
data['month'] = data.app_date.apply(lambda x: int(x.month))
data['day'] = data.app_date.apply(lambda x: int(x.day))

In [581]:
cross_plot(data.default, data.month)
cross_plot(data.default, data.day)

Как видно из распределения month и day относительно default:
* month имеет значимые расхождения
* расхождение day незначительны

Посмотрим на бинарные переменные

In [582]:
for col in list_binary:
    cross_plot(data.default, data[col])

Переведем бинарные переменные в числа

In [583]:
label_encoder = LabelEncoder()
for col in list_binary:
    data[col] = label_encoder.fit_transform(data[col])
data.head(10)

In [584]:
# Correlate numeric columns only: string and date columns cannot be
# correlated and make DataFrame.corr raise on pandas >= 2.0.
sort_corr = data.select_dtypes(include='number').corr()
# Rank features by the strength of their linear relation to the target,
# regardless of sign.
sort_corr = sort_corr.default.abs()
sort_corr.sort_values(ascending=False)

In [585]:
bin_to_num = pd.Series(f_classif(data[['car_type', 'car']], data['default'])[0], index = ['car_type', 'car'])
bin_to_num.sort_values(inplace = True)
bin_to_num.plot(kind = 'barh')

Как видно, бинарные переменные незначительно различаются относительно default. Имеются две связанные переменные: car_type и car. Для обучения предпочтительно взять car_type.

In [586]:
for col in list_numeric:
    num_hist(data[col])

In [587]:
#for col in list_numeric:
    #fig, ax = plt.subplots(figsize = (14, 4))
    #ax.set_title('Boxplot for ' + col)
    #sns.boxplot(x=data[col])
get_boxplot(data, list_numeric)

Имеются выбросы. Приведём выбросы к нижней и верхней границам (Q1 − 1.5·IQR и Q3 + 1.5·IQR), а затем прологарифмируем переменные.

In [588]:
data = remove_outliers(data, list_numeric)

In [589]:
get_boxplot(data, list_numeric)

In [590]:
data = log_num(data, ['income', 'age', 'bki_request_cnt'])

In [591]:
for col in list_numeric:
    num_hist(data[col])

In [592]:
num_heatmap(data, ['income', 'score_bki', 'age', 'bki_request_cnt', 'default'])

Значимые корреляции между score_bki и default, что вполне логично.

In [593]:
sort_corr = data[['income', 'score_bki', 'age', 'bki_request_cnt', 'default']].corr()
sort_corr = sort_corr.default.apply(lambda x: np.abs(x))
sort_corr.sort_values(ascending=False)

# Data Preprocessing

In [594]:
def date(data, column):
    """Replace a '%d%b%Y' date column with a numeric ``month`` column.

    The month of each date string is stored in ``data['month']`` and the
    original column is dropped. The frame is modified in place and returned.
    """
    def month_of(text):
        return datetime.strptime(text, '%d%b%Y').month

    data['month'] = data[column].apply(month_of)
    data.drop(columns=[column], inplace=True)
    return data
    
    
def get_dummies(data, col):
    """One-hot encode the columns listed in ``col``.

    Every encoded column also gets an indicator for missing values
    (``dummy_na=True``). Returns a new DataFrame.
    """
    encoded = pd.get_dummies(data=data, columns=col, dummy_na=True)
    return encoded


def encoder(data, columns):
    """Label-encode each listed column with scikit-learn's ``LabelEncoder``.

    Each column is fitted independently. The frame is modified in place and
    returned.
    """
    for name in columns:
        data[name] = LabelEncoder().fit_transform(data[name])
    return data


def standart(data, columns):
    """Standardise the listed columns with ``StandardScaler``.

    The scaler is fitted on the very data it transforms. The frame is
    modified in place and returned.
    """
    scaler = StandardScaler()
    raw_values = data[columns].values
    data[columns] = scaler.fit_transform(raw_values)
    return data



#def removal_condition(values):
    #global lower_lim
    #global upper_lim
    #if values < lower_lim:
        #return lower_lim
    #elif values > upper_lim:
        #return upper_lim
    #else:
        #return values
        
        
def remove_outliers(data, columns):
    """Rescale the listed columns with scikit-learn's ``RobustScaler``.

    Despite the name, no rows are removed and no values are clipped: the
    columns are replaced, in place, by the output of a scaler fitted on the
    same data. The modified frame is returned.
    """
    scaler = RobustScaler()
    data[columns] = scaler.fit_transform(data[columns])
    return data
        
#def data_balance(data):
    #data_contact = data[data.default == 1]
    #for n in range(10):
        #data = pd.concat([data, data_contact])
    #return datae


def log_num(data, columns):
    """Apply ``log(x + 1)`` to every value of the listed columns.

    Works in place on ``data`` and returns the same frame.
    """
    def log_plus_one(x):
        return np.log(x + 1)

    for column_name in columns:
        transformed = data[column_name].apply(log_plus_one)
        data[column_name] = transformed
    return data
        
        
def preprocessing(df_train, df_test, list_binary, list_сategorical, list_numeric, drop_list):
    """Build model-ready train and test frames from the raw inputs.

    The two frames are stacked (test rows first) and processed together:

    1. ``app_date`` is replaced by a ``month`` column (``date``);
    2. the columns in ``list_binary`` are label-encoded (``encoder``);
    3. ``income``, ``age`` and ``bki_request_cnt`` are log-transformed;
    4. the columns in ``list_numeric`` are passed to ``remove_outliers``;
    5. the columns in ``list_сategorical`` are one-hot encoded;
    6. the columns in ``drop_list`` are dropped.

    The stacked frame is then split back and the numeric columns are
    standardised with a scaler fitted on the training rows only.

    Parameters
    ----------
    df_train, df_test : raw train / test DataFrames (left unmodified).
    list_binary, list_сategorical, list_numeric : column names per kind.
    drop_list : columns to remove before modelling.

    Returns
    -------
    (df_train, df_test) : the processed frames. The train frame keeps the
    ``default`` target; the test frame has neither ``sample`` nor
    ``default``.
    """
    # Tag rows with their origin on copies, so the caller's frames stay intact.
    # The test rows get a placeholder target so both frames share columns.
    train_part = df_train.assign(sample=1)
    test_part = df_test.assign(sample=0, default=0)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    data = pd.concat([test_part, train_part], sort=False, ignore_index=True)

    data = date(data, 'app_date')
    data = encoder(data, list_binary)
    data = log_num(data, ['income', 'age', 'bki_request_cnt'])
    data = remove_outliers(data, list_numeric)
    data = get_dummies(data, list_сategorical)
    data = data.drop(drop_list, axis=1)

    df_train = data.query('sample == 1').drop(['sample'], axis=1)
    df_test = data.query('sample == 0').drop(['sample', 'default'], axis=1)

    # Fit the scaler on the training rows and reuse it for the test rows.
    # Scaling each set with its own mean/std would put the same raw value on
    # different scales in train and test.
    scaler = StandardScaler().fit(df_train[list_numeric].values)
    df_train[list_numeric] = scaler.transform(df_train[list_numeric].values)
    df_test[list_numeric] = scaler.transform(df_test[list_numeric].values)

    return df_train, df_test
    
#def preprocessing_test(data, list_binary, list_сategorical, list_numeric, drop_list):
    #data.education = data.education.apply(education_num)
    #data = date(data, 'app_date')
    #data = encoder(data, list_binary)
    #data = log_num(data, ['income', 'age', 'bki_request_cnt'])
    #data = remove_outliers(data, list_numeric)
    #data = standart(data, list_numeric)
    #data = get_dummies(data, list_сategorical)
    #data.drop(drop_list, axis = 1, inplace=True)
    #return data

In [595]:
# Reload the raw files: the frames loaded earlier were modified in place
# (extra 'sample' / 'default' columns were added to them).
DATA_DIR = '/kaggle/input/sf-scoring/'
#DATA_DIR = './data'
df_train = pd.read_csv(DATA_DIR +'/train.csv')
df_test = pd.read_csv(DATA_DIR +'/test.csv')
sample_submission = pd.read_csv(DATA_DIR+'/sample_submission.csv')

In [596]:
list_binary = ['sex', 'car_type', 'good_work', 'foreign_passport']
list_сategorical = ['home_address', 'decline_app_cnt', 'first_time', 
                    'education', 'region_rating', 'sna', 'work_address', 'month']
list_numeric = ['income', 'score_bki', 'age', 'bki_request_cnt']
drop_list = ['client_id', 'car']

In [597]:
df_train, df_test = preprocessing(df_train, df_test, list_binary, list_сategorical, list_numeric, drop_list)

In [598]:
df_train.info()

In [599]:
df_test.info()


# Model

In [600]:
y = df_train['default']
X = df_train.drop(['default'], axis=1)

In [601]:
# Stratify so both parts keep the class ratio of the imbalanced target, and
# fix the seed so the reported metrics are reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

In [602]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape


In [603]:
#logreg = LogisticRegression()
#logreg.fit(X_train, y_train)
#y_pred = logreg.predict(X_test)

In [604]:
#fpr, tpr, thresholds = roc_curve(y_train,logreg.predict_proba(X_train).T[1])
#roc_auc = roc_auc_score(y_train,logreg.predict_proba(X_train).T[1])   
#plt.figure(figsize=(8, 8))
#plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.4f}')
#plt.title('Receiver Operating Characteristic', fontsize=15)
#plt.xlabel('False positive rate (FPR)', fontsize=15)
#plt.ylabel('True positive rate (TPR)', fontsize=15)
#plt.legend(fontsize=15)

In [605]:
#classification_report = classification_report(y_test, y_pred)
#print(classification_report)

In [606]:
#print(accuracy_score(y_test, y_pred))
#print(precision_score(y_test, y_pred))
#print(recall_score(y_test, y_pred))
#print(f1_score(y_test, y_pred))

In [None]:
model = LogisticRegression()

iter_ = 50
epsilon_stop = 1e-3

# Only penalty/solver pairs that LogisticRegression supports are listed:
# 'l1' works with 'liblinear' and 'saga' but not with 'lbfgs'.
# "No class weighting" is spelled None; the string 'none' is not a valid
# class_weight and makes every fit of that candidate fail.
param_grid = [
    {'penalty': ['l1'],
     'solver': ['liblinear', 'saga'],
     'class_weight': [None, 'balanced'],
     'multi_class': ['auto', 'ovr'],
     'max_iter': [iter_],
     'tol': [epsilon_stop]},
    {'penalty': ['l2'],
     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
     'class_weight': [None, 'balanced'],
     'multi_class': ['auto', 'ovr'],
     'max_iter': [iter_],
     'tol': [epsilon_stop]},
    # NOTE(review): newer scikit-learn releases spell "no penalty" as None
    # instead of the string 'none' - confirm against the installed version.
    {'penalty': ['none'],
     'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
     'class_weight': [None, 'balanced'],
     'multi_class': ['auto', 'ovr'],
     'max_iter': [iter_],
     'tol': [epsilon_stop]},
]


# Exhaustive search over the grid, scored by F1 with 5-fold cross-validation
gridsearch = GridSearchCV(model, param_grid, scoring='f1', n_jobs=-1, cv=5)
gridsearch.fit(X_train, y_train)
model = gridsearch.best_estimator_


# Report the hyper-parameters of the winning estimator
best_parameters = model.get_params()
for param_name in sorted(best_parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

In [None]:
model = LogisticRegression()

In [609]:
# NOTE(review): this runs the same grid search (same param_grid, scoring and
# cv) a second time on a fresh LogisticRegression - confirm it is needed.
gridsearch = GridSearchCV(model, param_grid, scoring='f1', n_jobs=-1, cv=5)
gridsearch.fit(X_train, y_train)
model = gridsearch.best_estimator_

In [610]:
# Score the held-out split: an AUC measured on the rows the model was fitted
# on is optimistic and says little about generalisation.
proba_test = model.predict_proba(X_test)[:, 1]  # probability of default == 1
fpr, tpr, thresholds = roc_curve(y_test, proba_test)
roc_auc = roc_auc_score(y_test, proba_test)
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.4f}')
plt.title('Receiver Operating Characteristic', fontsize=15)
plt.xlabel('False positive rate (FPR)', fontsize=15)
plt.ylabel('True positive rate (TPR)', fontsize=15)
plt.legend(fontsize=15)

In [611]:
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

In [612]:
kf = KFold(n_splits=10)
kfmodel = LogisticRegression(C=1.0, class_weight='balanced', dual=False, fit_intercept=True, intercept_scaling=1,
                             l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', 
                             random_state=None, solver='sag', tol=1e-5, verbose=0, warm_start=False)

In [613]:
# Fit on each fold's training part and score its held-out part; without the
# score the loop would only refit the same model ten times.
fold_f1 = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    kfmodel.fit(X_train, y_train)
    fold_f1.append(f1_score(y_test, kfmodel.predict(X_test)))
print(f'F1 per fold: {np.round(fold_f1, 4)}, mean: {np.mean(fold_f1):.4f}')


In [614]:
# Keep X_train / X_test from the last fold: kfmodel was last fitted on that
# fold's training part, so its held-out part is the only data the model has
# not seen. A fresh random split of X would mostly contain fitted rows.
y_pred = kfmodel.predict(X_test)

In [615]:
# Score the evaluation split rather than the training rows, and compute the
# probabilities once for both the curve and the AUC.
proba_test = kfmodel.predict_proba(X_test)[:, 1]  # probability of default == 1
fpr, tpr, thresholds = roc_curve(y_test, proba_test)
roc_auc = roc_auc_score(y_test, proba_test)
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.4f}')
plt.title('Receiver Operating Characteristic', fontsize=15)
plt.xlabel('False positive rate (FPR)', fontsize=15)
plt.ylabel('True positive rate (TPR)', fontsize=15)
plt.legend(fontsize=15)

In [616]:
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

In [617]:
rus = RandomUnderSampler(random_state=42)
X_us, y_us = rus.fit_resample(X, y)

In [618]:
kfmodelrus = LogisticRegression(C=1.0, class_weight='balanced', dual=False, fit_intercept=True, intercept_scaling=1,
                             l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', 
                             random_state=None, solver='sag', tol=1e-5, verbose=0, warm_start=False)

In [619]:
# NOTE(review): the resampled set appears to be ordered by class - confirm for
# the installed imbalanced-learn. On class-ordered data unshuffled KFold
# yields folds holding a single class, so use shuffled stratified folds.
skf_us = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in skf_us.split(X_us, y_us):
    X_train, X_test = X_us.iloc[train_index], X_us.iloc[test_index]
    y_train, y_test = y_us.iloc[train_index], y_us.iloc[test_index]
    kfmodelrus.fit(X_train, y_train)

In [620]:
# Keep X_train / X_test from the last fold of the undersampled set: a fresh
# random split of X would contain rows kfmodelrus was fitted on (X_us is a
# subset of X), which inflates the metrics.
y_pred = kfmodelrus.predict(X_test)

In [621]:
# Score the evaluation split rather than the training rows, and compute the
# probabilities once for both the curve and the AUC.
proba_test = kfmodelrus.predict_proba(X_test)[:, 1]  # probability of default == 1
fpr, tpr, thresholds = roc_curve(y_test, proba_test)
roc_auc = roc_auc_score(y_test, proba_test)
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.4f}')
plt.title('Receiver Operating Characteristic', fontsize=15)
plt.xlabel('False positive rate (FPR)', fontsize=15)
plt.ylabel('True positive rate (TPR)', fontsize=15)
plt.legend(fontsize=15)

In [622]:
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
matrix_con = confusion_matrix(y_test, y_pred)
sns.heatmap(matrix_con, annot=True)

In [623]:
sgd_clf = SGDClassifier(max_iter=1000) # классификатор на основе метода стохастического градиентного спуска (Stochastic Gradient Descent SGD)
skfolds = StratifiedKFold(n_splits=10)
for train, test in skfolds.split(X, y):
    X_train, X_test, y_train, y_test = X.iloc[train], X.iloc[test], y.iloc[train], y.iloc[test]
    sgd_clf.fit(X_train, y_train)

In [624]:
df_train_contact = df_train[df_train.default == 1]
for n in range(1,11):
    df_train = pd.concat([df_train, df_train_contact])

In [625]:
y = df_train['default']
X = df_train.drop(['default'], axis=1)

In [626]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
y_pred = sgd_clf.predict(X_test)

In [627]:
#fpr, tpr, thresholds = roc_curve(y_train,sgd_clf.predict(X_train).T[1])
#roc_auc = roc_auc_score(y_train,sgd_clf.predict(X_train).T[1])   
#plt.figure(figsize=(8, 8))
#plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.4f}')
#plt.title('Receiver Operating Characteristic', fontsize=15)
#plt.xlabel('False positive rate (FPR)', fontsize=15)
#plt.ylabel('True positive rate (TPR)', fontsize=15)
#plt.legend(fontsize=15)

In [628]:
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
matrix_con = confusion_matrix(y_test, y_pred)
sns.heatmap(matrix_con, annot=True)

Лучшие результаты показала модель "model", обученная через GridSearchCV; разбиение на большее количество частей приводит к ухудшению показателей. SGDClassifier в итоге выдаёт одни нули, что нас не устраивает. Примем модель "model" для Submission.

In [629]:
df_test.shape, X.shape

# Submission

In [630]:
predict_submission = model.predict(df_test)

In [631]:
sample_submission['default'] = predict_submission
sample_submission.to_csv('submission.csv', index=False)
sample_submission.head(10)

In [632]:
!kaggle competitions submit -c sf-scoring -f ssubmission.csv -m "Message"