In [1]:
from importlib.metadata import PackageNotFoundError, version

# Report the installed version of each library this notebook relies on.
# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API.
ls_libs = ["numpy", "pandas", "scipy", "scikit-learn"]
for n_lib in ls_libs:
    try:
        val_ver = version(n_lib)
    except PackageNotFoundError:
        # Keep listing the remaining libraries instead of aborting on the first gap.
        val_ver = "not installed"
    print(f"{n_lib}: {val_ver}")

numpy: 2.2.3
pandas: 2.2.3
scipy: 1.15.2
scikit-learn: 1.6.1


In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.tree import DecisionTreeRegressor
from statsmodels.api import OLS
from statsmodels.api import add_constant

from sklearn.metrics import mean_squared_error

### 전처리

In [3]:
# Load the raw data set and preview its first two rows.
df = pd.read_csv(filepath_or_buffer="set_05_data.csv")
df.head(n=2)

Unnamed: 0,cust_id,balance,balance_freq,purchases,oneoff_p,installments_p,cash_advance,p_freq,oneoff_p_freq,p_installments_freq,cash_advance_freq,cash_advance_trx,p_trx,credit_limit,payments,minimum_payments,prc_full_payment,tenure
0,10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0.0,2.0,1000.0,201.802084,139.509787,0.0,12.0
1,10002,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4.0,0.0,7000.0,4103.032597,1072.340217,0.222222,12.0


In [4]:
# Number of missing values in each column, before imputation.
df.isna().sum(axis=0)

cust_id                 0
balance                 0
balance_freq            0
purchases               0
oneoff_p                0
installments_p          0
cash_advance            0
p_freq                  0
oneoff_p_freq           0
p_installments_freq     0
cash_advance_freq       0
cash_advance_trx        0
p_trx                   0
credit_limit            0
payments                0
minimum_payments       74
prc_full_payment        0
tenure                  0
dtype: int64

In [5]:
# Replace missing minimum_payments values with the mean of the observed ones.
val_min_pay_mean = df["minimum_payments"].mean()
df["minimum_payments"] = df["minimum_payments"].fillna(val_min_pay_mean)

In [6]:
# Re-count missing values per column after the imputation step.
df.isna().sum(axis=0)

cust_id                0
balance                0
balance_freq           0
purchases              0
oneoff_p               0
installments_p         0
cash_advance           0
p_freq                 0
oneoff_p_freq          0
p_installments_freq    0
cash_advance_freq      0
cash_advance_trx       0
p_trx                  0
credit_limit           0
payments               0
minimum_payments       0
prc_full_payment       0
tenure                 0
dtype: int64

In [7]:
# Independent snapshot of the preprocessed frame; each question starts from it.
df_base = df.copy(deep=True)

### Q1. 

In [8]:
# Keep only rows whose purchases value is non-zero, and the three columns used in Q1.
has_purchase = df_base["purchases"].ne(0)
df_q1 = df_base.loc[has_purchase, ["tenure", "balance", "credit_limit"]]

In [9]:
# Preview the first two rows of the Q1 subset.
df_q1.head(n=2)

Unnamed: 0,tenure,balance,credit_limit
0,12.0,40.900749,1000.0
2,12.0,2495.148862,7500.0


In [10]:
# Correlation matrix of balance and credit_limit, computed separately per tenure.
df_q1_corr = (
    df_q1
    .groupby("tenure")[["balance", "credit_limit"]]
    .corr()
)

In [11]:
# Flatten the (tenure, variable) index; the unnamed inner level becomes "level_1".
df_q1_corr2 = df_q1_corr.reset_index()
# Rows labelled "balance" hold the balance-vs-credit_limit correlation per tenure.
is_balance_row = df_q1_corr2["level_1"] == "balance"
df_q1_corr2.loc[is_balance_row, "credit_limit"].round(2).max()

np.float64(0.97)

### Q2.

In [12]:
# Step 1: drop the customer id and standardize the remaining columns.
df_q2 = df_base.drop(columns=["cust_id"])
scaler_q2 = StandardScaler()
arr_q2_nor = scaler_q2.fit_transform(df_q2)

In [13]:
# Trial run: silhouette score for the first candidate k only.
ls_k = [2, 3, 4, 5]
k = ls_k[0]

model_kmeans = KMeans(n_clusters=k, random_state=1234).fit(arr_q2_nor)
silhouette_score(arr_q2_nor, model_kmeans.labels_)

np.float64(0.21544722357637516)

In [14]:
# Step 3: silhouette score for every candidate number of clusters.
ls_k = [2, 3, 4, 5]
ls_sil = []
for k in ls_k:
    model_kmeans = KMeans(n_clusters=k, random_state=1234)
    labels_k = model_kmeans.fit_predict(arr_q2_nor)
    val_sil = silhouette_score(arr_q2_nor, labels_k)
    ls_sil.append(val_sil)

In [15]:
# Index the scores by k so idxmax returns the k itself
# (a Series is not strictly required for this).
ser_sil = pd.Series(data=ls_sil, index=ls_k)
# Step 4: the k with the highest silhouette score.
val_k_best = ser_sil.idxmax()
val_k_best

np.int64(2)

In [16]:
# Step 5: refit K-means with the selected number of clusters.
model_best = KMeans(n_clusters=val_k_best, random_state=1234).fit(arr_q2_nor)
model_best

In [17]:
# Step 6: attach the cluster labels and average oneoff_p within each cluster.
df_q2 = df_q2.assign(cluster=model_best.labels_)
ser_g = df_q2.groupby("cluster")["oneoff_p"].mean()
ser_g

cluster
0     272.263897
1    2156.472313
Name: oneoff_p, dtype: float64

In [18]:
# Largest cluster mean of oneoff_p, rounded to two decimals.
ser_g.max().round(2)

np.float64(2156.47)

### Q3. 

In [19]:
# Keep the columns whose name contains "oneoff_p" or "freq".
mask_q3_cols = df_base.columns.str.contains("oneoff_p|freq")
df_q3 = df_base.loc[:, mask_q3_cols]
df_q3.head(n=2)

Unnamed: 0,balance_freq,oneoff_p,p_freq,oneoff_p_freq,p_installments_freq,cash_advance_freq
0,0.818182,0.0,0.166667,0.0,0.083333,0.0
1,0.909091,0.0,0.0,0.0,0.0,0.25


In [20]:
# Rows whose cust_id is divisible by 4 form the test set; the rest are for training.
is_test = (df_base["cust_id"] % 4).eq(0)
df_train = df_q3.loc[~is_test]
df_test = df_q3.loc[is_test]
len(df_train), len(df_test)

(752, 248)

In [21]:
# Decision tree regression of oneoff_p on the remaining Q3 columns.
X_train_dt = df_train.drop(columns="oneoff_p")
X_test_dt = df_test.drop(columns="oneoff_p")

model_dt = DecisionTreeRegressor(max_depth=5, random_state=1234)
model_dt.fit(X_train_dt, df_train["oneoff_p"])
pred_dt = model_dt.predict(X_test_dt)

# RMSE on the test set: square root of the mean squared error.
val_R_dt = mean_squared_error(df_test["oneoff_p"], pred_dt) ** 0.5
val_R_dt

3042.987475423554

In [22]:
# Linear regression (OLS) of oneoff_p on the remaining Q3 columns plus an intercept.
# has_constant="add" forces the "const" column to be added to both splits.
# The default ("skip") leaves a frame untouched when it already contains a
# constant-valued column, which would silently drop the intercept and leave
# the train and test design matrices with different columns.
df_train_lr = add_constant(df_train, has_constant="add")
df_test_lr = add_constant(df_test, has_constant="add")

model_lr = OLS(endog=df_train_lr["oneoff_p"],
               exog=df_train_lr.drop(columns="oneoff_p")).fit()
pred_lr = model_lr.predict(df_test_lr.drop(columns="oneoff_p"))

# Negative predictions are replaced by zero before scoring.
pred_lr2 = np.where(pred_lr < 0, 0, pred_lr)
pred_lr2[:4]

array([327.50702985,   0.        , 803.88151079,   0.        ])

In [23]:
# RMSE of the zero-floored OLS predictions on the test set.
val_mse_lr = mean_squared_error(df_test["oneoff_p"], pred_lr2)
val_R_lr = val_mse_lr ** 0.5
val_R_lr

2126.138317305272

In [24]:
# Decision-tree RMSE minus OLS RMSE, rounded to two decimals.
val_R_diff = val_R_dt - val_R_lr
round(val_R_diff, 2)

916.85