In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every Python warning for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')

# Apply seaborn's default theme before the matplotlib overrides below.
# NOTE(review): the theme sets its own rc values, so the order presumably
# matters for the font setting — confirm before reordering.
sns.set()

# Matplotlib defaults shared by every figure in this notebook.
plt.rcParams.update({
    # Font used for text rendering; alternative kept from the original
    # notebook: 'AppleGothic'.
    'font.family': 'Malgun Gothic',
    # Default figure size (width, height) in inches.
    'figure.figsize': (12, 6),
    'font.size': 14,
    # Draw minus signs with the ASCII hyphen instead of the Unicode minus.
    'axes.unicode_minus': False,
})

# 결측치 시각화를 위한 라이브러리
import missingno

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

# 학습 모델 저장을 위한 라이브러리
import pickle

In [2]:
import os
import glob
import pandas as pd

# 1. Directory holding the parquet parts of each table, keyed by table name.
table_paths = {
    "1.회원정보": "data/train/1.회원정보",
    "2.신용정보": "data/train/2.신용정보"
}

# 2. Load every table: all parquet parts in a directory are concatenated
#    into one DataFrame with a fresh 0..n-1 index.
merged_data = {}
missing_tables = []

for name, path in table_paths.items():
    # Sort the matches so the parts are always concatenated in filename order
    # (glob itself gives no ordering guarantee).
    files = sorted(glob.glob(os.path.join(path, "*.parquet")))
    if not files:
        print(f"{name} ⚠️ 파일 없음")
        missing_tables.append(name)
        continue
    df = pd.concat([pd.read_parquet(f) for f in files], ignore_index=True)
    merged_data[name] = df
    print(f"{name} ✅ 불러오기 완료: {df.shape}")

# Both tables are required below. Stop here with an explicit message instead
# of letting the dictionary lookups fail later with a bare KeyError.
if missing_tables:
    missing_detail = ", ".join(
        f"{name} ({table_paths[name]})" for name in missing_tables
    )
    raise FileNotFoundError(f"No parquet files found for: {missing_detail}")

# 3. Pick the two tables and attach the Segment label to the credit table.
member_df = merged_data["1.회원정보"]
result_df = merged_data["2.신용정보"]

# Keep a single (ID, Segment) row per ID — the first occurrence wins — so the
# left merge below cannot multiply rows of the credit table.
segment_df = member_df[['ID', 'Segment']].drop_duplicates(subset='ID')

# Left merge on ID: every credit row is kept; IDs without a match get NaN.
# NOTE(review): the credit table carries a 기준년월 column with several months
# per ID. If Segment can differ between months for the same ID, this should
# merge on ['기준년월', 'ID'] without the de-duplication above — confirm
# against the member table.
result_df_with_segment = pd.merge(result_df, segment_df, on='ID', how='left')

# 4. Sanity checks: the row count should equal the credit table's, and the
#    number of missing Segment values shows how many IDs found no match.
print(f"병합 후 행 수: {len(result_df_with_segment)}")
print(f"Segment 컬럼 결측치 수: {result_df_with_segment['Segment'].isnull().sum()}")

# 5. Save the merged table as CSV without the index column.
#    'utf-8-sig' writes a UTF-8 byte-order mark at the start of the file
#    (presumably so spreadsheet tools detect the encoding of the Korean
#    headers — TODO confirm).
save_path = "data/신용정보_with_segment.csv"
result_df_with_segment.to_csv(save_path, index=False, encoding='utf-8-sig')
print(f"📁 CSV 저장 완료: {save_path}")

1.회원정보 ✅ 불러오기 완료: (2400000, 78)
2.신용정보 ✅ 불러오기 완료: (2400000, 42)
병합 후 행 수: 2400000
Segment 컬럼 결측치 수: 0
📁 CSV 저장 완료: data/신용정보_with_segment.csv


In [3]:
# Reload the credit table with its Segment column from the CSV export and
# leave the frame as the cell's last expression so the notebook displays it.
credit_df_with_segment = pd.read_csv(
    filepath_or_buffer="data/신용정보_with_segment.csv",
)
credit_df_with_segment

Unnamed: 0,기준년월,ID,최초한도금액,카드이용한도금액,CA한도금액,일시상환론한도금액,월상환론한도금액,CA이자율_할인전,CL이자율_할인전,RV일시불이자율_할인전,...,한도심사요청건수,한도요청거절건수,한도심사요청후경과월,한도심사거절후경과월,시장단기연체여부_R6M,시장단기연체여부_R3M,시장연체상환여부_R6M,시장연체상환여부_R3M,rv최초시작후경과일,Segment
0,201807,TRAIN_000000,0,19354,7270,0,0,22.995207,18.254978,17.264967,...,0회,0,3,3,0,0,0,0,99999999,D
1,201807,TRAIN_000001,0,9996,5718,41996,90611,14.793821,14.834873,10.622446,...,0회,0,3,3,0,0,0,0,322,E
2,201807,TRAIN_000002,0,88193,35207,0,0,22.014276,17.875321,17.155829,...,0회,0,3,3,0,0,0,0,2378,C
3,201807,TRAIN_000003,0,19062,6531,0,0,22.998014,22.999453,19.293674,...,0회,0,3,3,0,0,0,0,99999999,D
4,201807,TRAIN_000004,0,177222,47149,48000,155020,14.661948,10.897410,10.654587,...,0회,0,3,3,0,0,0,0,99999999,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,201812,TRAIN_399995,0,20070,10167,0,54445,15.243670,11.900516,10.545876,...,0회,0,3,3,0,0,0,0,99999999,E
2399996,201812,TRAIN_399996,0,84217,31159,0,156800,14.843464,15.652063,11.102726,...,0회,0,3,3,0,0,0,0,99999999,D
2399997,201812,TRAIN_399997,0,52612,19429,0,0,17.038599,17.966213,13.278475,...,0회,0,3,3,0,0,0,0,99999999,C
2399998,201812,TRAIN_399998,0,10002,4228,90004,180906,15.182880,11.901089,10.594124,...,0회,0,3,3,0,0,0,0,99999999,E


## 사용할 컬럼
