In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as mlt
import seaborn
import sklearn.model_selection
import sklearn.metrics
import sklearn.tree
import sklearn.ensemble
from imblearn.under_sampling import TomekLinks
import xgboost as xgb
import catboost as catb

In [2]:
def fillna(X, columns, data):
    """Fill missing values in the given columns with per-column constants.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to patch. It is modified in place and also returned.
    columns : iterable of str
        Column names to process.
    data : mapping
        Maps every name in ``columns`` to its replacement value. The value is
        looked up even when the column has no missing entries, so a missing
        key raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object with the gaps filled.
    """
    for name in columns:
        missing_rows = X[name].isna()
        X.loc[missing_rows, name] = data[name]

    return X

In [3]:
def dummies(X, feature):
    """One-hot encode a single column and drop the original.

    The indicator columns are named ``"<feature>_<value>"`` and are appended
    after the existing columns. Only values that occur in ``X[feature]``
    produce a column, so two frames with different value sets end up with
    different column sets.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.
    feature : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        New frame with ``feature`` replaced by its indicator columns.
    """
    indicators = pd.get_dummies(X[feature], prefix=feature)
    widened = pd.concat([X, indicators], axis=1)

    return widened.drop(columns=[feature])

In [4]:
def replace(X, feature, data):
    """Recode the values of one column using a lookup mapping.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to update. The column is overwritten in place and the same
        object is returned.
    feature : str
        Name of the column to recode.
    data : mapping
        ``{old_value: new_value}`` pairs passed to ``Series.replace``;
        values not present in the mapping are left unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object with ``feature`` recoded.
    """
    recoded = X[feature].replace(data)
    X[feature] = recoded

    return X

In [5]:
def preproc(X):
    """Describe how the columns of ``X`` are to be preprocessed.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame that must contain every categorical column listed below;
        otherwise ``list.remove`` raises ``ValueError``.

    Returns
    -------
    dict
        ``"feature_cat"``: the fixed list of columns treated as categorical.
        ``"feature_float"``: every other column of ``X``, in original order.
        ``"replace_years_in_current_job"``: label -> integer code mapping.
        ``"replace_term"``: label -> integer code mapping.
    """
    categorical = [
        "Home Ownership",
        "Years in current job",
        "Tax Liens",
        "Number of Open Accounts",
        "Number of Credit Problems",
        "Bankruptcies",
        "Purpose",
        "Term",
    ]

    # Everything that is not explicitly categorical is treated as numeric.
    numeric = list(X.columns)
    for name in categorical:
        numeric.remove(name)

    # "< 1 year" -> 0, "1 year" -> 1, "2 years".."10 years" -> 2..10,
    # and the open-ended "10+ years" bucket -> 11.
    years_codes = {"< 1 year": 0, "1 year": 1}
    years_codes.update({f"{n} years": n for n in range(2, 11)})
    years_codes["10+ years"] = 11

    term_codes = {"Short Term": 0, "Long Term": 1}

    return {
        "feature_cat": categorical,
        "feature_float": numeric,
        "replace_years_in_current_job": years_codes,
        "replace_term": term_codes,
    }

In [6]:
def get_medians(X, columns):
    """Compute the median of each requested column.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to take the statistics from.
    columns : iterable of str
        Names of the columns to summarise. Earlier versions ignored this
        argument and re-derived the list from ``X``; it is now honoured.

    Returns
    -------
    dict
        ``{column_name: median}`` for every name in ``columns``
        (``Series.median`` skips missing values by default).
    """
    return {feature: X[feature].median() for feature in columns}

In [7]:
def get_modes(X, columns):
    """Compute the most frequent value of each requested column.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to take the statistics from.
    columns : iterable of str
        Names of the columns to summarise. Earlier versions ignored this
        argument and re-derived the list from ``X``; it is now honoured.

    Returns
    -------
    dict
        ``{column_name: mode}`` for every name in ``columns``. When several
        values tie, the first one returned by ``Series.mode`` is used.

    Raises
    ------
    IndexError
        If a column has no mode (``Series.mode`` returned an empty result).
    """
    return {feature: X[feature].mode().values[0] for feature in columns}

In [8]:
def fit(X):
    """Learn the imputation values used later by ``transform``.

    Parameters
    ----------
    X : pandas.DataFrame
        Training frame the statistics are computed from.

    Returns
    -------
    dict
        ``"medians"``: result of ``get_medians`` for the non-categorical
        columns; ``"modes"``: result of ``get_modes`` for the categorical
        columns. The column grouping comes from ``preproc(X)``.
    """
    layout = preproc(X)

    medians = get_medians(X, layout["feature_float"])
    modes = get_modes(X, layout["feature_cat"])

    return {"medians": medians, "modes": modes}

In [9]:
def transform(X, params):
    """Apply the fitted preprocessing to a feature frame.

    Steps, in order: fill gaps in the non-categorical columns with the fitted
    medians, fill gaps in the categorical columns with the fitted modes,
    one-hot encode "Home Ownership" and "Purpose", and recode
    "Years in current job" and "Term" to integers.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature frame. It is left untouched; all work happens on a copy.
    params : dict
        Output of ``fit``: ``{"medians": {...}, "modes": {...}}``.

    Returns
    -------
    pandas.DataFrame
        The transformed frame. Its one-hot columns depend on the values
        present in ``X``, so frames transformed separately may need their
        columns aligned before being fed to the same model.
    """
    features = preproc(X)

    # fillna/replace write into the frame they receive. Working on a copy
    # keeps the caller's data intact and avoids SettingWithCopyWarning when X
    # is a slice of another frame (e.g. a train/test split).
    X = X.copy()

    X = fillna(X, features["feature_float"], params["medians"])
    X = fillna(X, features["feature_cat"], params["modes"])

    X = dummies(X, "Home Ownership")
    X = dummies(X, "Purpose")

    X = replace(X, "Years in current job", features["replace_years_in_current_job"])
    X = replace(X, "Term", features["replace_term"])

    return X

### Считывание данных

In [10]:
# Load the labelled training data from the working directory.
df = pd.read_csv("train.csv")

In [11]:
# Separate the target column from the feature columns.
y = df["Credit Default"]
X = df.drop(columns=["Credit Default"])

In [12]:
# Hold out 30% of the rows for evaluation; stratify so both parts keep the
# same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=50,
)

### Преобразование

In [13]:
# Fit the imputation statistics on the training part only, then apply the
# same transformation to both parts.
params = fit(X_train)
X_train = transform(X_train, params)
X_test = transform(X_test, params)

# One-hot columns are derived from the values present in each frame, so the
# hold-out part can lack some "Purpose_*" columns. Align it to the training
# layout by name (missing indicators become 0) instead of inserting columns
# at hard-coded positions, which breaks as soon as the split changes.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

### Балансировка классов через oversampling (дублирование объектов класса 1)

In [14]:
# Random oversampling of the positive class: draw as many class-1 rows as
# there are class-1 rows (i.e. a shuffled copy of all of them) and add them
# to the training data, doubling the number of positives.
# NOTE: this cell rebinds X_train / y_train, so re-running it duplicates the
# positives again.
df_balance_train = pd.concat([X_train, y_train], axis=1)
sample_train = df_balance_train[df_balance_train["Credit Default"] == 1].sample(
    df_balance_train["Credit Default"].value_counts()[1],
    random_state=50,
)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames row-wise.
df_balance_train = pd.concat([df_balance_train, sample_train], ignore_index=True)
X_train = df_balance_train.drop(labels=["Credit Default"], axis=1)
y_train = df_balance_train["Credit Default"]

### Вычисление

In [15]:
# Train a shallow CatBoost classifier on the balanced training data and
# report precision/recall/F1 on both the training and the hold-out part.
# The positive class is additionally weighted by the class-0 / class-1 ratio.
# NOTE(review): fit() receives no eval_set, so early_stopping_rounds
# presumably has nothing to monitor - confirm against the CatBoost docs.
model = catb.CatBoostClassifier(
    iterations=300,
    max_depth=3,
    reg_lambda=0.5,
    class_weights=[1, y_train.value_counts()[0] / y_train.value_counts()[1]],
    eval_metric='F1',
    early_stopping_rounds=20,
    random_state=50,
    silent=True,
)
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print(sklearn.metrics.classification_report(y_train, y_pred_train))
print(sklearn.metrics.classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.76      0.77      0.76      3771
           1       0.70      0.68      0.69      2958

    accuracy                           0.73      6729
   macro avg       0.73      0.73      0.73      6729
weighted avg       0.73      0.73      0.73      6729

              precision    recall  f1-score   support

           0       0.84      0.74      0.78      1616
           1       0.49      0.64      0.55       634

    accuracy                           0.71      2250
   macro avg       0.66      0.69      0.67      2250
weighted avg       0.74      0.71      0.72      2250



In [16]:
# Load the data to predict on for the submission.
test = pd.read_csv("test.csv")

In [17]:
# Apply the preprocessing fitted on the training split.
# NOTE: rebinding `test` makes this cell non-re-runnable: the transformed
# frame no longer has the raw "Home Ownership" / "Purpose" columns that
# transform() expects.
test = transform(test, params)

In [18]:
# Align the submission features with the training layout by column name:
# indicator columns absent from this frame are added as all-zero columns and
# the order is made identical to X_train. Unlike a positional insert, this
# does not depend on which categories happen to be present and can be re-run.
test = test.reindex(columns=X_train.columns, fill_value=0)

In [19]:
# Refit the classifier with the same hyper-parameters and training data as
# in the evaluation cell, then predict labels for the submission features.
model = catb.CatBoostClassifier(
    iterations=300,
    max_depth=3,
    reg_lambda=0.5,
    class_weights=[1, y_train.value_counts()[0] / y_train.value_counts()[1]],
    eval_metric='F1',
    early_stopping_rounds=20,
    random_state=50,
    silent=True,
)
model.fit(X_train, y_train)

result = model.predict(test)

In [20]:
# Load the submission template.
sample = pd.read_csv("sample_submission.csv")

In [21]:
# Write the predicted labels into the template's target column.
# NOTE(review): assumes the template rows are in the same order as test.csv - confirm.
sample["Credit Default"] = result

In [22]:
# Save the submission file without the DataFrame index column.
sample.to_csv("results.csv", index=False)