In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress every warning so cell outputs stay uncluttered.
import warnings
warnings.simplefilter('ignore')

# Apply the seaborn theme first: it overwrites several rcParams, so the
# notebook-specific overrides below have to come after this call.
sns.set()

# Matplotlib defaults for this notebook.
# The font family is hard-coded to 'Malgun Gothic'; the original cell kept
# 'AppleGothic' as a commented-out alternative (presumably for macOS - confirm).
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'figure.figsize': (12, 6),
    'font.size': 14,
    'axes.unicode_minus': False,
})

# 데이터 전처리 알고리즘
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# 학습용과 검증용으로 나누는 함수
from sklearn.model_selection import train_test_split

# 교차 검증
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 모델의 최적의 하이퍼 파라미터를 찾기 위한 도구
from sklearn.model_selection import GridSearchCV

# 머신러닝 알고리즘 - 분류
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 머신러닝 알고리즘 - 회귀
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor

from scipy.stats import f_oneway
from scipy.stats import shapiro
from scipy.stats import levene
from scipy.stats import kstest, norm
import scipy.stats as stats
from scipy.stats import spearmanr
from sklearn.preprocessing import PowerTransformer
from scipy.stats import kruskal
from scipy.stats import chi2_contingency

# 차원 축소
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# 군집
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.cluster import estimate_bandwidth

# 학습 모델 저장을 위한 라이브러리
import pickle

### 가설7 : 신용등급이 낮은 그룹일수록 보유 카드 수 대비 사용 카드 수 비율이 낮다.


In [2]:
# Load the six monthly parquet snapshots (201807 .. 201812).
# The frames keep the names df1..df6 because the merge cell refers to them.
df1, df2, df3, df4, df5, df6 = (
    pd.read_parquet(f'open/train/1.회원정보/{yyyymm}_train_회원정보.parquet')
    for yyyymm in ('201807', '201808', '201809', '201810', '201811', '201812')
)

In [3]:
# Stack the six monthly frames row-wise into a single frame.
# ignore_index=True discards the per-file indexes and renumbers rows from 0.
df = pd.concat((df1, df2, df3, df4, df5, df6), axis=0, ignore_index=True)
# To sanity-check the merge, inspect df.head() / df.tail() in a separate cell.

In [5]:
# Derived feature:
#   신용카드사용률 (credit-card usage rate) = 이용카드수_신용 / 이용가능카드수_신용
# Rows whose denominator is 0 are removed first so the ratio never divides by zero.
# The filtered rows are copied explicitly: assigning a new column onto a
# boolean-indexed slice is the chained-assignment pattern that pandas flags with
# SettingWithCopyWarning (hidden in this notebook by the global warnings filter).
has_available_card = df['이용가능카드수_신용'] != 0
df = df.loc[has_available_card].copy()
df['신용카드사용률'] = df['이용카드수_신용'] / df['이용가능카드수_신용']

In [7]:
# Keep only the columns used for hypothesis 7 and discard any row that has a
# missing value in one of them.
df_g7 = (
    df.loc[:, ['Segment', '이용가능카드수_신용', '이용카드수_신용', '신용카드사용률']]
    .dropna(axis=0, how='any')
)
df_g7

Unnamed: 0,Segment,이용가능카드수_신용,이용카드수_신용,신용카드사용률
0,D,1,1,1.0
1,E,1,1,1.0
2,C,1,1,1.0
3,D,2,1,0.5
4,E,1,0,0.0
...,...,...,...,...
2399995,E,1,0,0.0
2399996,D,1,1,1.0
2399997,C,1,1,1.0
2399998,E,1,0,0.0


In [10]:
# Encode the Segment label as an ordinal score (A=5, B=4, C=3, D=2, E=1).
segment_map = {'A': 5, 'B': 4, 'C': 3, 'D': 2, 'E': 1}
df_g7['Segment_score'] = df_g7['Segment'].map(segment_map)

# Series.map() produces NaN for any label that is missing from segment_map, and
# spearmanr() with its default nan_policy would then return NaN for the whole
# test. Fail loudly with the offending labels instead of reporting NaN.
unmapped = df_g7['Segment_score'].isna()
if unmapped.any():
    raise ValueError(
        f"Unmapped Segment labels: {df_g7.loc[unmapped, 'Segment'].unique().tolist()}"
    )

# Spearman rank correlation between the ordinal score and the usage rate.
# spearmanr is already imported in the notebook's first (imports) cell.
corr, p = spearmanr(df_g7['Segment_score'], df_g7['신용카드사용률'])
# Scientific notation for the p-value: a fixed '.4f' shows every p < 0.00005
# as '0.0000', which hides how small it actually is.
print(f'Spearman 상관계수: {corr:.4f}, p-value: {p:.3e}')

Spearman 상관계수: 0.0771, p-value: 0.0000
