In [None]:
# Default Kaggle starter cell. The environment is defined by the kaggle/python
# Docker image: https://github.com/kaggle/docker-python

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

import os

# Print the full path of every file found under the read-only input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Up to 20GB written to the current directory (/kaggle/working/) is preserved as output
# when a version is created with "Save & Run All"; files written to /kaggle/temp/ are
# not saved outside of the current session.

In [None]:
!pip install lofo-importance
!pip install shap

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os 
# pd.set_option("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None)
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import RidgeClassifier
from catboost import CatBoostClassifier

import optuna
import shap
from lofo import LOFOImportance, Dataset, plot_importance

import warnings
warnings.filterwarnings("ignore")

### 1. Verilerin Yüklenmesi

In [None]:
# Read every competition CSV once and keep the frames in a dict keyed "<file stem>_df".
base_path = "/kaggle/input/home-credit-default-risk/"

filenames = [
    "application_train.csv",
    "application_test.csv",
    "POS_CASH_balance.csv",
    "bureau.csv",
    "bureau_balance.csv",
    "previous_application.csv",
    "credit_card_balance.csv",
    "installments_payments.csv"
]

dataframes = {}
for filename in filenames:
    stem = filename[:-4]  # strip the ".csv" suffix
    dataframes[f"{stem}_df"] = pd.read_csv(base_path + filename)

# Short module-level aliases used by the rest of the notebook.
train_df = dataframes["application_train_df"]
test_df = dataframes["application_test_df"]
bureau_df = dataframes["bureau_df"]
bureau_balance_df = dataframes["bureau_balance_df"]
previous_application_df = dataframes["previous_application_df"]
pos_cash_balance_df = dataframes["POS_CASH_balance_df"]
credit_card_balance_df = dataframes["credit_card_balance_df"]
installments_payments_df = dataframes["installments_payments_df"]


In [None]:
# Show all columns when frames are rendered (global pandas display option).
pd.set_option('display.max_columns', None)

# Column dictionary shipped with the dataset; the file is read as ISO-8859-1.
description_df = pd.read_csv(
    '/kaggle/input/home-credit-default-risk/HomeCredit_columns_description.csv',
    encoding='ISO-8859-1',
)
description_df

**Train**

In [None]:
# Split the training columns into categorical (object dtype) and everything else.
# train_df_num stays a plain list because a later cell removes 'TARGET' from it.
train_df_cat = train_df.select_dtypes(include=["object"]).columns
train_df_num = [x for x in train_df if x not in train_df_cat]

print(f'Train data shape: {train_df.shape}')
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
train_df.info()
print(f'\nCategoric features count: {len(train_df_cat)}')
print(f'Numeric features count: {len(train_df_num)}')

print('\nTrain Samples')
display(train_df.head())


In [None]:
# Per-column profile of the training data: dtype, non-null count, cardinality,
# missingness and the describe() statistics, shown as a colour-graded table.
n_rows = len(train_df)
unique_counts = train_df.nunique()
null_counts = train_df.isnull().sum()

desc = pd.DataFrame({
    'type': train_df.dtypes,
    'count': train_df.count(),
    'nunique': unique_counts,
    '%unique': unique_counts / n_rows * 100,
    'null': null_counts,
    '%null': null_counts / n_rows * 100,
})
# describe() brings its own 'count'; drop it so the column is not duplicated.
numeric_stats = train_df.describe().T.drop('count', axis=1)
desc = pd.concat([desc, numeric_stats], axis=1)
desc.sort_values(by=['type', 'null']).style.background_gradient(axis=0)


**Test**

In [None]:
# Split the test columns into categorical (object dtype) and everything else.
test_df_cat = test_df.select_dtypes(include=["object"]).columns
test_df_num = [x for x in test_df if x not in test_df_cat]

print(f'Test data shape: {test_df.shape}')
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
test_df.info()
print(f'\nCategoric features count: {len(test_df_cat)}')
print(f'Numeric features count: {len(test_df_num)}')

print('\nTest Samples')
display(test_df.head())

**Extra Datasets**

In [None]:
def load_data(path, name):
    """Read a CSV file, print a short summary of it and return the frame.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed to ``pd.read_csv``.
    name : str
        Label used in the printed summary.

    Returns
    -------
    pandas.DataFrame
        The loaded table, unmodified.
    """
    df = pd.read_csv(path)
    print(f"{name}: shape is {df.shape}")
    # DataFrame.info() prints its report itself and returns None, so wrapping it
    # in print() would emit an extra "None" line.
    df.info()

    cat_features = df.select_dtypes(include=['object']).columns
    num_features = df.select_dtypes(exclude=['object']).columns

    print(f'\nCategoric features count: {len(cat_features)}')
    print(f'Numeric features count: {len(num_features)}')
    print(f'\n{name} Samples')
    display(df.head())
    return df

In [None]:
# Run each supplementary table through load_data (which prints its summary) and
# collect the returned frames by table name.
DATA_DIR = '/kaggle/input/home-credit-default-risk/'
ds_names = ("bureau","bureau_balance","credit_card_balance","installments_payments",
            "previous_application","POS_CASH_balance")

datasets = {}
for ds_name in ds_names:
    csv_path = os.path.join(DATA_DIR, f'{ds_name}.csv')
    datasets[ds_name] = load_data(csv_path, ds_name)

### **2.EDA**

* Bu aşamada eldeki ham verileri kullanılabilir hale getirmek ve gerekli teknikleri belirlemek için veriler analiz edildi.
* Hedef değişken dağılımına bakıldı.
* Numerik ve kategorik değerlerin dağılımına bakıldı.
* Korelasyon şeması ve outliers grafiklerine bakıldı


In [None]:
# How are the values of the target variable distributed?
temp = train_df["TARGET"].value_counts()
df = pd.DataFrame({'labels': temp.index, 'values': temp.values})

fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title('Target')
sns.set_color_codes("pastel")
sns.barplot(x='labels', y="values", data=df, ax=ax)
locs, labels = plt.xticks()
plt.show()

**We have a classification problem with imbalanced classes.**

**Numerik Özellikler**;

In [None]:
# Compare train vs. test distributions for the first 50 numeric features.
# TARGET exists only in train, so it is excluded. The membership guard keeps the
# cell re-runnable: a bare list.remove() raises ValueError on the second run.
if 'TARGET' in train_df_num:
    train_df_num.remove('TARGET')

fig, ax = plt.subplots(10, 5, figsize=(15, 15))
ax = ax.flatten()

for i, col in enumerate(train_df_num[:50]):
    sns.kdeplot(train_df[col], ax=ax[i], color='r')
    sns.kdeplot(test_df[col], ax=ax[i], color='g')
    ax[i].set_title(f'{col}')
    ax[i].set_xlabel(None)

# Hide any grid cells that did not receive a feature.
for j in range(len(train_df_num[:50]), len(ax)):
    ax[j].axis('off')

fig.suptitle('Numerik Özellik Dağılımları\n', fontsize=24, fontweight='bold')
fig.legend(['Train', 'Test'])
plt.tight_layout(h_pad=0.1, w_pad=0.5)
plt.show()

In [None]:
# Train vs. test density plots for the numeric features at positions 50..106.
second_batch = train_df_num[50:107]

fig, ax = plt.subplots(10, 6, figsize=(15, 15))
ax = ax.flatten()

for i in range(len(second_batch)):
    col = second_batch[i]
    panel = ax[i]
    sns.kdeplot(train_df[col], ax=panel, color='r')
    sns.kdeplot(test_df[col], ax=panel, color='g')
    panel.set_title(f'{col}')
    panel.set_xlabel(None)

# Switch off the grid cells that did not receive a feature.
for unused_panel in ax[len(second_batch):]:
    unused_panel.axis('off')

fig.suptitle('Numerik Özellik Dağılımları\n', fontsize=24, fontweight='bold')
fig.legend(['Train', 'Test'])
plt.tight_layout(h_pad=0.1, w_pad=0.5)
plt.show()

**Kategorik Özellikler**

In [None]:
# One count plot per object-typed training column, laid out on a 4x4 grid.
object_columns = train_df.select_dtypes(include='object').columns

fig, axes = plt.subplots(4, 4, figsize=(20,20))
axes = axes.flatten()

for i in range(len(object_columns)):
    col = object_columns[i]
    panel = axes[i]
    sns.countplot(x=col, data=train_df, ax=panel, linewidth=1.5,orient="h")
    panel.set_title(f"{col} Dağılımı", fontsize=10)
    # Category names are long; rotate them so they do not overlap.
    panel.tick_params(axis='x', rotation=90)

fig.suptitle("Kategorik Özellik Dağılımları", fontsize=20, fontweight='bold')
plt.subplots_adjust(wspace=0.5, hspace=0.5)
plt.show()

In [None]:
# Count of applicants per family size, split by TARGET.
family_fig, family_ax = plt.subplots(figsize=(10,8))
family_ax.set_title("Başvuru Sahibinin Aile Üyelerinin Dağılımı", fontweight='bold', fontsize=16)
sns.countplot(x='CNT_FAM_MEMBERS', hue='TARGET', data=train_df, ax=family_ax)
plt.show()

In [None]:
# Number of applications per CODE_GENDER value.
sns.set_style('whitegrid')
gender_fig, gender_ax = plt.subplots(figsize = (10,10))
gender_ax.set_title("En çok borç alan kim?", fontweight = 'bold', fontsize = 16)
sns.countplot(x='CODE_GENDER', data=train_df, ax=gender_ax)

In [None]:
# Applications per occupation type, split by TARGET.
fig, ax = plt.subplots(figsize=(15,7))
sns.countplot(x='OCCUPATION_TYPE', hue='TARGET', data=train_df, ax=ax)
ax.tick_params(axis='x', rotation=70)
ax.set_xlabel("Occupation Type")
ax.set_title('Hangi meslekteki kişiler zamanında geri ödeme yapar ve  şirket için daha iyi müşterilerdir?')

**Outliers**

In [None]:
# Box plots for the first 50 numeric training features (14x5 subplot grid).
plt.rcParams['axes.facecolor'] = 'white'
fig = plt.figure(figsize=[32,50])
fig.suptitle('BOXPLOT OF ALL COLUMNS', fontsize=18, fontweight='bold')
fig.subplots_adjust(top=0.97, hspace=0.5, wspace=0.4)

for position, col in enumerate(train_df_num[:50], start=1):
    panel = fig.add_subplot(14, 5, position)
    ax = sns.boxplot(data=train_df, x=col, palette="husl", ax=panel)
    ax.set_title(f'{col}')
    ax.set_xlabel(f'{col}')
    ax.grid(False)
plt.show()

In [None]:
# Box plots for the numeric training features at positions 50..106 (14x5 subplot grid).
plt.rcParams['axes.facecolor'] = 'white'
fig = plt.figure(figsize=[32,50])
fig.suptitle('BOXPLOT OF ALL COLUMNS', fontsize=18, fontweight='bold')
fig.subplots_adjust(top=0.97, hspace=0.5, wspace=0.4)

for position, col in enumerate(train_df_num[50:107], start=1):
    panel = fig.add_subplot(14, 5, position)
    ax = sns.boxplot(data=train_df, x=col, palette="husl", ax=panel)
    ax.set_title(f'{col}')
    ax.set_xlabel(f'{col}')
    ax.grid(False)
plt.show()

In [None]:
def missing_values_summary(data):
    """Summarise the columns of ``data`` that contain missing values.

    Returns a DataFrame indexed by column name with the number of nulls
    ('Eksik Veri') and their share of all rows in percent ('Eksik Veri Yüzdesi').
    Columns without nulls are left out; rows are ordered by descending percentage.
    """
    null_counts = data.isnull().sum()
    summary = pd.DataFrame({
        'Eksik Veri': null_counts,
        'Eksik Veri Yüzdesi': 100 * null_counts / len(data)
    })
    has_missing = summary['Eksik Veri'] > 0
    return summary[has_missing].sort_values(by='Eksik Veri Yüzdesi', ascending=False)

In [None]:
# Show the ten most and the ten least affected columns of the training table.
train_missing = missing_values_summary(train_df)
display(train_missing.head(10))
display(train_missing.tail(10))

### **3.Preprocessing**

In [None]:
# Derive ratio features. Every feature is computed from the frame it is stored in;
# the original cell built test_df['CALC_INCOME_PER_PERSON'] from train_df columns,
# which assigned training applicants' values to test rows by index alignment.
for frame in (train_df, test_df):
    # Income per family member
    frame['CALC_INCOME_PER_PERSON'] = frame['AMT_INCOME_TOTAL'] / frame['CNT_FAM_MEMBERS']
    # Days employed relative to days since birth
    frame['CALC_PERC_DAYS_EMPLOYED'] = frame['DAYS_EMPLOYED'] / frame['DAYS_BIRTH']
    # Income relative to the credit amount
    frame['CALC_PERC_INCOME_CREDIT'] = frame['AMT_INCOME_TOTAL'] / frame['AMT_CREDIT']
    # Annuity relative to income
    frame['ANNUITY_INCOME_PERCENT'] = frame['AMT_ANNUITY'] / frame['AMT_INCOME_TOTAL']
    # Annuity relative to the credit amount
    frame['CREDIT_TERM'] = frame['AMT_ANNUITY'] / frame['AMT_CREDIT']

# Remove the flag columns the author considers unnecessary. errors='ignore' keeps the
# cell re-runnable after the columns are already gone.
unused_flag_columns = ['FLAG_MOBIL', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_4',
                       'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_12']
train_df.drop(columns=unused_flag_columns, errors='ignore', inplace=True)
test_df.drop(columns=unused_flag_columns, errors='ignore', inplace=True)

In [None]:
# Separate predictors from the label; test_df is used as-is for the test predictors.
X = train_df.drop('TARGET', axis=1)
y = train_df['TARGET']
X_test = test_df

# SK_ID_CURR is the join key for the supplementary tables merged later, so it must
# keep its raw values: it is left out of imputation/scaling and re-attached below.
ID_COL = 'SK_ID_CURR'
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.drop(ID_COL)
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    # Always return a dense array so it can be wrapped in a DataFrame directly.
    sparse_threshold=0,
)

# Fit on the training data only.
X_final = preprocessor.fit_transform(X)
processed_features = numerical_features.tolist() + preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features).tolist()
X_final = pd.DataFrame(X_final, columns=processed_features)
X_final.insert(0, ID_COL, X[ID_COL].to_numpy())

# Apply the already-fitted preprocessor to the test data. Refitting here would use
# test-set means/scales/categories and produce a different column layout than train.
X_test_final = pd.DataFrame(preprocessor.transform(X_test), columns=processed_features)
X_test_final.insert(0, ID_COL, X_test[ID_COL].to_numpy())

In [None]:
# Remove columns that exist in the processed train matrix but not in the processed
# test matrix. The difference is computed instead of hard-coded, so the cell can be
# re-run and does not depend on which categories happen to be train-only.
train_only_columns = X_final.columns.difference(X_test_final.columns)
X_final = X_final.drop(columns=train_only_columns)

In [None]:
# Stack the processed train rows on top of the processed test rows. No ignore_index is
# passed, so each part keeps its own row labels.
train_test_df = pd.concat([X_final, X_test_final])

In [None]:
# (rows, columns) of the stacked train+test frame.
train_test_df.shape

In [None]:
def preprocess_and_aggregate(df, group_col):
    """Impute, scale and one-hot encode a supplementary table, then aggregate it per key.

    Numeric feature columns are mean-imputed and standardised; non-numeric feature
    columns are imputed with their most frequent value and one-hot encoded. The
    resulting columns are summarised per ``group_col`` value with median, mean, std,
    min and max.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one or more rows per ``group_col`` value. It is not modified.
    group_col : str
        Name of the grouping column. It is excluded from imputation, scaling and
        encoding so that the returned index carries the raw key values and can be
        merged against other tables.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``group_col``; columns are named ``<feature>_<statistic>``.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.drop(group_col, errors='ignore')
    categorical_cols = df.select_dtypes(exclude=[np.number]).columns.drop(group_col, errors='ignore')

    # Build the processed table from new pieces instead of writing back into `df`.
    parts = [df[[group_col]]]

    if len(numeric_cols) > 0:
        numeric_values = SimpleImputer(strategy='mean').fit_transform(df[numeric_cols])
        numeric_values = StandardScaler().fit_transform(numeric_values)
        parts.append(pd.DataFrame(numeric_values, columns=numeric_cols, index=df.index))

    if len(categorical_cols) > 0:
        categorical_values = SimpleImputer(strategy='most_frequent').fit_transform(df[categorical_cols])
        onehot_encoder = OneHotEncoder(handle_unknown='ignore')
        encoded_categorical = onehot_encoder.fit_transform(categorical_values)
        # The dense-output keyword was renamed between scikit-learn releases
        # (sparse -> sparse_output); converting explicitly avoids depending on either.
        if hasattr(encoded_categorical, 'toarray'):
            encoded_categorical = encoded_categorical.toarray()
        parts.append(pd.DataFrame(
            encoded_categorical,
            columns=onehot_encoder.get_feature_names_out(categorical_cols),
            # Reuse df's index so the axis=1 concat aligns row by row.
            index=df.index,
        ))

    processed = pd.concat(parts, axis=1)

    aggregation_funcs = {
        col: ['median', 'mean', 'std', 'min', 'max'] for col in processed.columns if col != group_col
    }
    aggregated_df = processed.groupby(group_col).agg(aggregation_funcs)

    # Flatten the two-level (feature, statistic) column index into single names.
    aggregated_df.columns = ['_'.join(col).strip() for col in aggregated_df.columns.values]

    return aggregated_df

    

**Bureau**

In [None]:
# Preview the first rows of the bureau table.
bureau_df.head()

In [None]:
# Bureau columns with the highest share of missing values.
display(missing_values_summary(bureau_df).head())


In [None]:
# Preprocess the bureau table and aggregate it per SK_ID_CURR.
bureau_df_final =preprocess_and_aggregate(bureau_df, 'SK_ID_CURR')


In [None]:
# # Özellik seçimi için korelasyon değerlerinden yararlanıldı
# y_df = pd.DataFrame(y)
# bureau_df_final['TARGET'] = y_df['TARGET'] 

In [None]:
# corr_matrix = bureau_df_final.corr()
# plt.figure(figsize=(8,6))
# plt.title('Correlation Heatmap of Iris Dataset')
# a = sns.heatmap(corr_matrix, square=True, annot=True, fmt='.2f', linecolor='black', cmap='coolwarm', cbar_kws={"shrink": .8}, annot_kws={"size": 4})

# # Rotate labels and adjust font sizes
# a.set_xticklabels(a.get_xticklabels(), rotation=45, ha='right', fontsize=5)  
# a.set_yticklabels(a.get_yticklabels(), rotation=45, ha='right', fontsize=5)  
# plt.tight_layout()
# plt.show()   

In [None]:
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# to_drop = [column for column in upper.columns if any(upper[column] > 0.75)]
# print(to_drop)
# bureau_df_final = bureau_df_final.drop(bureau_df_final.columns[to_drop], axis=1)
# bureau_df_final = bureau_df_final.drop(columns = ['TARGET'], axis=1)

In [None]:
# X_final = X_final.merge(right=bureau_df_final.reset_index(), how='left', on='SK_ID_CURR')
# X_final.shape
# Left-join the bureau aggregates onto the stacked train+test frame by SK_ID_CURR;
# reset_index() turns the aggregate's index back into a column for the join.
train_test_df = train_test_df.merge(right=bureau_df_final.reset_index(), how='left', on='SK_ID_CURR')
train_test_df.shape

**Pos_cash**

In [None]:
# Preview the first rows of the POS/cash balance table.
pos_cash_balance_df.head()

In [None]:
# POS/cash balance columns with the highest share of missing values.
display(missing_values_summary(pos_cash_balance_df).head())

In [None]:
# Preprocess the POS/cash balance table and aggregate it per SK_ID_CURR.
pos_cash_balance_df_final =preprocess_and_aggregate(pos_cash_balance_df, 'SK_ID_CURR')

In [None]:
# pos_cash_balance_df_final['TARGET']  = y_df['TARGET']
# corr_matrix = pos_cash_balance_df_final.corr()
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
# print(to_drop)

# pos_cash_balance_df_final = pos_cash_balance_df_final.drop(columns=to_drop)
# pos_cash_balance_df_final = pos_cash_balance_df_final.drop(columns = ['TARGET'], axis=1)

In [None]:
# Left-join the POS/cash aggregates onto the stacked frame by SK_ID_CURR.
train_test_df = train_test_df.merge(right=pos_cash_balance_df_final.reset_index(), how='left', on='SK_ID_CURR')
train_test_df.shape

**credit card balance**

In [None]:
# Preview only the first rows, as is done for the other supplementary tables.
credit_card_balance_df.head()

In [None]:
# All credit-card balance columns that contain missing values.
display(missing_values_summary(credit_card_balance_df))

In [None]:
numeric_cols = credit_card_balance_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = credit_card_balance_df.select_dtypes(include=['object']).columns.tolist()

# SK_ID_CURR / SK_ID_PREV are identifiers used for grouping and merging. They are kept
# out of imputation and scaling so their raw values survive; numeric_cols itself is
# left complete because the aggregation cell selects columns with it.
cc_scaled_cols = [col for col in numeric_cols if col not in ('SK_ID_CURR', 'SK_ID_PREV')]

numeric_imputer = SimpleImputer(strategy='mean')
credit_card_balance_df[cc_scaled_cols] = numeric_imputer.fit_transform(credit_card_balance_df[cc_scaled_cols])
scaler = StandardScaler()
credit_card_balance_df[cc_scaled_cols] = scaler.fit_transform(credit_card_balance_df[cc_scaled_cols])




In [None]:
# Number of credit-card balance rows per customer ...
nb_prevs = credit_card_balance_df.groupby('SK_ID_CURR')[['SK_ID_PREV']].count()
# ... which overwrites SK_ID_PREV on every row of that customer.
credit_card_balance_df['SK_ID_PREV'] = credit_card_balance_df['SK_ID_CURR'].map(nb_prevs['SK_ID_PREV'])

# Per-customer mean of the numeric columns, prefixed with 'cc_bal_', with SK_ID_CURR
# restored as a regular column.
avg_cc_bal = (
    credit_card_balance_df
    .groupby('SK_ID_CURR')[numeric_cols]
    .mean()
    .add_prefix('cc_bal_')
    .reset_index()
)


In [None]:
# avg_cc_bal already carries SK_ID_CURR as a column; calling reset_index() on it again
# would only add a meaningless 'index' column to the merged features.
train_test_df = train_test_df.merge(right=avg_cc_bal, how='left', on='SK_ID_CURR')
train_test_df.shape

**installment_payments**

In [None]:
# Installment-payment columns with the highest share of missing values.
display(missing_values_summary(installments_payments_df).head())

In [None]:
numeric_cols = installments_payments_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = installments_payments_df.select_dtypes(include=['object']).columns.tolist()

# SK_ID_CURR / SK_ID_PREV are identifiers; keep them out of imputation and scaling so
# SK_ID_CURR stays usable as a grouping and merge key.
inst_scaled_cols = [col for col in numeric_cols if col not in ('SK_ID_CURR', 'SK_ID_PREV')]

numeric_imputer = SimpleImputer(strategy='mean')
installments_payments_df[inst_scaled_cols] = numeric_imputer.fit_transform(installments_payments_df[inst_scaled_cols])
scaler = StandardScaler()
installments_payments_df[inst_scaled_cols] = scaler.fit_transform(installments_payments_df[inst_scaled_cols])

# NOTE(review): the only feature taken from this table is the per-customer median of
# the SK_ID_PREV identifier, which looks unintended - confirm which columns were
# meant to be aggregated.
installments_payments_df_final = installments_payments_df[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').median()

In [None]:
# installments_payments_df_final['TARGET']  = y_df['TARGET']
# corr_matrix = installments_payments_df_final.corr()
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
# print(to_drop)

# installments_payments_df_final = installments_payments_df_final.drop(columns=to_drop)
# installments_payments_df_final = installments_payments_df_final.drop(columns = ['TARGET'], axis=1)

In [None]:
# Left-join the installment aggregates onto the stacked frame by SK_ID_CURR.
train_test_df = train_test_df.merge(right=installments_payments_df_final.reset_index(), how='left', on='SK_ID_CURR')
train_test_df.shape

**previous_application**

In [None]:
# Preview the first rows of the previous-application table.
previous_application_df.head()

In [None]:
# Previous-application columns with the highest share of missing values.
display(missing_values_summary(previous_application_df).head())

In [None]:
# Discard the two interest-rate columns before preprocessing.
rate_columns = ['RATE_INTEREST_PRIVILEGED', 'RATE_INTEREST_PRIMARY']
previous_application_df = previous_application_df.drop(columns=rate_columns)

In [None]:
numeric_cols = previous_application_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = previous_application_df.select_dtypes(include=['object']).columns.tolist()

# SK_ID_CURR / SK_ID_PREV are identifiers; keep them out of imputation and scaling so
# SK_ID_CURR stays usable as a grouping and merge key.
prev_scaled_cols = [col for col in numeric_cols if col not in ('SK_ID_CURR', 'SK_ID_PREV')]

numeric_imputer = SimpleImputer(strategy='mean')
previous_application_df[prev_scaled_cols] = numeric_imputer.fit_transform(previous_application_df[prev_scaled_cols])
scaler = StandardScaler()
previous_application_df[prev_scaled_cols] = scaler.fit_transform(previous_application_df[prev_scaled_cols])

categorical_imputer = SimpleImputer(strategy='most_frequent')
previous_application_df[categorical_cols] = categorical_imputer.fit_transform(previous_application_df[categorical_cols])

# The dense-output keyword was renamed between scikit-learn releases
# (sparse -> sparse_output); converting explicitly avoids depending on either.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
encoded_categorical = onehot_encoder.fit_transform(previous_application_df[categorical_cols])
if hasattr(encoded_categorical, 'toarray'):
    encoded_categorical = encoded_categorical.toarray()
encoded_categorical_df = pd.DataFrame(
    encoded_categorical,
    columns=onehot_encoder.get_feature_names_out(categorical_cols),
    # Reuse the source index so the axis=1 concat below aligns row by row.
    index=previous_application_df.index,
)

previous_application_final = pd.concat([previous_application_df[numeric_cols], encoded_categorical_df], axis=1)

In [None]:
# Number of previous-application rows per customer ...
prev_apps_count = previous_application_final.groupby('SK_ID_CURR')[['SK_ID_PREV']].count()
# ... which overwrites SK_ID_PREV on every row of that customer.
previous_application_final['SK_ID_PREV'] = previous_application_final['SK_ID_CURR'].map(prev_apps_count['SK_ID_PREV'])

# Per-customer mean of every remaining column, prefixed with 'p_'.
prev_apps_avg = previous_application_final.groupby('SK_ID_CURR').mean().add_prefix('p_')


In [None]:
# prev_apps_avg['TARGET']  = y_df['TARGET']
# corr_matrix = prev_apps_avg.corr()
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
# print(to_drop)

# prev_apps_avg = prev_apps_avg.drop(columns=to_drop)
# prev_apps_avg = prev_apps_avg.drop(columns = ['TARGET'], axis=1)

In [None]:
# Left-join the previous-application averages onto the stacked frame by SK_ID_CURR.
train_test_df = train_test_df.merge(right=prev_apps_avg.reset_index(), how='left', on='SK_ID_CURR')
train_test_df.shape

In [None]:
# Split the stacked frame back by position: the first len(train_df) rows are the
# training part, the remainder is the test part.
n_train_rows = len(train_df)
X_train = train_test_df.iloc[:n_train_rows]
test = train_test_df.iloc[n_train_rows:]

print(X_train.shape)
print(test.shape)

In [None]:
# Keep only features whose variance exceeds 0.01 (VarianceThreshold is imported in the
# notebook's import cell).
# NOTE(review): the selection runs on X_final, so the columns merged into
# train_test_df / X_train are not part of it - confirm that this is intended.
sel = VarianceThreshold(threshold=0.01)
sel.fit(X_final)
mask = sel.get_support()
print(len(mask))
selected_columns = X_final.columns[mask]
# .copy() makes this an independent frame; a TARGET column is added to it later, and
# assigning into a slice of X_final triggers pandas' chained-assignment behaviour.
X_final_selected = X_final[selected_columns].copy()


In [None]:
# Display the variance-filtered feature frame.
X_final_selected


In [None]:
# X_final_selected = X_final_selected.drop(columns = ['TARGET'], axis = 1)


### **4.Build Classifiers**

**Stratified K-Fold Cross-Validation** 

Dengesiz sınıf dağılımları olan veri setlerinde yararlı olabilir. Verimiz hem büyük hem de dengesiz sınıflar barındırıyor.


In [None]:
# 5-fold stratified CV with shuffling and a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Baseline CatBoost configuration.
catboost_params = dict(
    iterations=100,
    depth=6,
    scale_pos_weight=5,
    bootstrap_type='Bernoulli',
    learning_rate=0.1,
    eval_metric='AUC',
    od_type='Iter',
    random_strength=1,
    early_stopping_rounds=50,
    subsample=0.8,
    verbose=0,
)
catboost = CatBoostClassifier(**catboost_params)

# Score the same estimator on the same folds under two metrics.
catboost_scores = cross_val_score(catboost, X_final_selected, y, cv=skf, scoring='accuracy')
print("CatBoostClassifier Accuracy: ", np.mean(catboost_scores))

catboost_scores_roc = cross_val_score(catboost, X_final_selected, y, cv=skf, scoring='roc_auc')
print("CatBoostClassifier ROC: ", np.mean(catboost_scores_roc))

### 5.LOFO - Feature Selection

In [None]:
# LOFO expects features and target in one frame, so append the label column.
y_df = y.to_frame(name='TARGET')
X_final_selected['TARGET'] = y_df['TARGET']

In [None]:
features = [str(col) for col in X_final_selected.columns if col != "TARGET"]

# Work on a 10% sample. The seed makes the sample, and therefore the importances
# computed from it, reproducible between runs.
sample_df = X_final_selected.sample(frac=0.1, random_state=42).sort_values("TARGET")

# define the validation scheme
cv = StratifiedKFold(n_splits=3)

# define the binary target and the features
dataset = Dataset(df=sample_df, target="TARGET", features = features)

# score each leave-one-feature-out run with ROC AUC, using the CatBoost model
# defined earlier as the estimator
lofo_imp = LOFOImportance(dataset, cv=cv, model = catboost, scoring="roc_auc")

# get the mean and standard deviation of the importances in pandas format
importance_df = lofo_imp.get_importance()


In [None]:
# Plot the LOFO importances, then remove the y-axis ticks from the current axes.
plot_importance(importance_df, figsize=(12, 20))
plt.gca().yaxis.set_ticks([])

In [None]:
# Select the best features for the model: collect the features whose mean LOFO
# importance is negative and print how many there are.
# X_final = X_final.drop('TARGET', axis = 1)
neg_feature = importance_df[importance_df["importance_mean"] < 0]["feature"].tolist()
print(len(neg_feature))

In [None]:
# Drop the negative-importance features; the TARGET column is still present here.
X_final_new = X_final_selected.drop(columns = neg_feature,axis = 1)

In [None]:
# Final model matrix: remove the negative-importance features and the label together.
# Dropping only TARGET from X_final_selected here would rebuild the frame with every
# negative-importance feature still in it, discarding the LOFO selection.
X_final_new = X_final_selected.drop(columns=neg_feature + ['TARGET'])

### 6.Hiperparametre Optimizasyonu ve SHAP

In [None]:
# Hold out 20% of the rows. The same hold-out serves as CatBoost's eval_set and as the
# set on which each trial is scored.
X_train, X_valid, y_train, y_valid = train_test_split(X_final_new, y, test_size=0.2, random_state=42)


def objective(trial):
    """Fit one CatBoost model with trial-sampled hyperparameters.

    Returns the ROC AUC of the model's positive-class probabilities on the
    validation split, which the study maximises.
    """
    # suggest_loguniform / suggest_uniform are deprecated in Optuna;
    # suggest_float (with log=True where needed) samples the same ranges.
    param = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 2, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.2, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'random_strength': trial.suggest_float('random_strength', 1e-3, 10, log=True),
        'bootstrap_type': 'Bernoulli',
        'eval_metric': 'AUC',
        'od_type': 'Iter',
        'early_stopping_rounds': 50,
        'verbose': 0,
    }

    model = CatBoostClassifier(**param)
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=0)

    y_pred = model.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, y_pred)

    return auc


# A seeded sampler makes the sequence of suggested parameters reproducible.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

best_params = study.best_params
print("En iyi parametreler: ", best_params)

In [None]:
# Fixed hyperparameters for the final model. This assignment replaces the
# best_params value produced by the Optuna cell.
best_params = dict(
    iterations=817,
    depth=5,
    l2_leaf_reg=1.6188174269842595,
    learning_rate=0.07525332174980123,
    scale_pos_weight=2.2276843669235515,
    subsample=0.8572821079427205,
    random_strength=0.004554358987876041,
    bootstrap_type='Bernoulli',
    eval_metric='AUC',
    od_type='Iter',
    early_stopping_rounds=50,
    verbose=0,
)
best_model = CatBoostClassifier(**best_params)
best_model.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=0)

# Validation ROC AUC from the positive-class probabilities.
y_pred = best_model.predict_proba(X_valid)[:, 1]
final_roc = roc_auc_score(y_valid, y_pred)
print("En iyi ROC Skoru: ", final_roc)


In [None]:
# best_model = CatBoostClassifier(**best_params,verbose = 0)
# best_model.fit(X_final_new, y)
# Compute SHAP values of best_model for every row of X_final_new.
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_final_new)

# Interpret the SHAP values: load the JS visualisation code, then draw the bar summary.
shap.initjs()
shap.summary_plot(shap_values, X_final_new, plot_type='bar')


In [None]:
# Force plot for the first row of X_final_new, using that row's SHAP values.
shap.force_plot(explainer.expected_value, shap_values[0,:], X_final_new.iloc[0,:])