In [1]:
# prompt: 구글드라이브 연동해줘

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 라이브러리 선언

In [2]:
## api 호출 라이브러
import requests
## 데이터 조작 라이브러리
import pandas as pd
import numpy as np
## 머신러닝 라이브러리
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
##전처리 라이브러리
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
## 시각화 라이브러
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [3]:
!pip install -U scikit-learn
!pip install -U imbalanced-learn
!pip install -U xgboost



## 데이터 불러오기

In [4]:
# 원하는 경로로 이동
os.chdir('/content/drive/MyDrive/Colab Notebooks/data_analysis_2025/MLProject')

# 현재 경로 확인
print(os.getcwd())

/content/drive/MyDrive/Colab Notebooks/data_analysis_2025/MLProject


In [5]:
df = pd.read_csv('Churn_Modelling.csv')

In [6]:
df.columns = df.columns.str.upper()
df

Unnamed: 0,ROWNUMBER,CUSTOMERID,SURNAME,CREDITSCORE,GEOGRAPHY,GENDER,AGE,TENURE,BALANCE,NUMOFPRODUCTS,HASCRCARD,ISACTIVEMEMBER,ESTIMATEDSALARY,EXITED
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.00,1,1.0,1.0,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15619304,Onio,502,France,Female,42.0,8,159660.80,3,1.0,0.0,113931.57,1
3,4,15701354,Boni,699,France,Female,39.0,1,0.00,2,0.0,0.0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9997,9998,15584532,Liu,709,France,Female,36.0,7,0.00,1,0.0,1.0,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42.0,3,75075.31,2,1.0,0.0,92888.52,1
9999,9999,15682355,Sabbatini,772,Germany,Male,42.0,3,75075.31,2,1.0,0.0,92888.52,1
10000,10000,15628319,Walker,792,France,Female,28.0,4,130142.79,1,1.0,0.0,38190.78,0


## 타입통합/특성 숫자컬럼 추가

In [7]:
strColumnList = [ "CUSTOMERID", "SURNAME", "GEOGRAPHY", "GENDER"]
intColumnList = ["ROWNUMBER", "CREDITSCORE", "AGE", "TENURE", "NUMOFPRODUCTS","HASCRCARD", "ISACTIVEMEMBER", "EXITED"]
floatColumnList = ["BALANCE", "ESTIMATEDSALARY"]

In [8]:
for i in range(0,len(strColumnList)):
    targetColumn = strColumnList[i]
    df[targetColumn] = df[targetColumn].fillna('0')
    df[targetColumn] = df[targetColumn].astype(str)
for i in range(0,len(intColumnList)):
    targetColumn = intColumnList[i]
    df[targetColumn] = df[targetColumn].fillna('0')
    df[targetColumn] = df[targetColumn].astype(int)
for i in range(0,len(floatColumnList)):
    targetColumn = floatColumnList[i]
    df[targetColumn] = df[targetColumn].fillna('0')
    df[targetColumn] = df[targetColumn].astype(float)

In [9]:
# HasCrCard, IsActiveMember, Age, Geography는 결측치가 존재

In [10]:
def get_age_group(age):
    if age < 20:
        return 10
    elif age < 30:
        return 20
    elif age < 40:
        return 30
    elif age < 50:
        return 50
    else:
        return 60

In [12]:
featuresData = df.loc[:]

In [13]:
featuresData["AGECTGRY"] = featuresData["AGE"].apply(get_age_group)

In [15]:
genderMap = {"Male":0,"Female":1}
featuresData["GENDER_LE"] = featuresData.GENDER.map( genderMap )

In [17]:
featuresData.head(3)

Unnamed: 0,ROWNUMBER,CUSTOMERID,SURNAME,CREDITSCORE,GEOGRAPHY,GENDER,AGE,TENURE,BALANCE,NUMOFPRODUCTS,HASCRCARD,ISACTIVEMEMBER,ESTIMATEDSALARY,EXITED,AGECTGRY,GENDER_LE
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,50,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,50,1
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,50,1


## minmaxscaler 만들기

In [18]:
scrScaler = MinMaxScaler()
balScaler = MinMaxScaler()
esrScaler = MinMaxScaler()

featuresData["CREDITSCORE_SC"] = balScaler.fit_transform(featuresData.loc[:, ["CREDITSCORE"]] )
featuresData["BALANCE_SC"] = balScaler.fit_transform(featuresData.loc[:, ["BALANCE"]] )
featuresData["ESTIMATEDSALARY_SC"] = balScaler.fit_transform(featuresData.loc[:, ["ESTIMATEDSALARY"]] )

In [21]:
orgColumns = ["AGE", "BALANCE","CREDITSCORE", "GENDER", "ESTIMATEDSALARY", "ROWNUMBER", "CUSTOMERID", "SURNAME", "GEOGRAPHY"]
featuresData.drop(columns=orgColumns, inplace=True)
featuresData.head(2)

Unnamed: 0,TENURE,NUMOFPRODUCTS,HASCRCARD,ISACTIVEMEMBER,EXITED,AGECTGRY,GENDER_LE,CREDITSCORE_SC,BALANCE_SC,ESTIMATEDSALARY_SC
0,2,1,1,1,1,50,1,0.538,0.0,0.506735
1,1,1,0,1,0,50,1,0.516,0.334031,0.562709


## 특성선정 및 데이터분리

In [24]:
corrDf = featuresData.corr(numeric_only=True)
stdCorr = 0.1
label = ["EXITED"]
features =list(corrDf.loc[(abs(corrDf.EXITED) >= stdCorr) & (corrDf.EXITED != 1)].index)
print(f"label : {label} 선정된 features : {features}")
corrDf

label : ['EXITED'] 선정된 features : ['ISACTIVEMEMBER', 'AGECTGRY', 'GENDER_LE', 'BALANCE_SC']


Unnamed: 0,TENURE,NUMOFPRODUCTS,HASCRCARD,ISACTIVEMEMBER,EXITED,AGECTGRY,GENDER_LE,CREDITSCORE_SC,BALANCE_SC,ESTIMATEDSALARY_SC
TENURE,1.0,0.013418,0.02274,-0.02818,-0.014117,-0.008377,-0.014706,0.000703,-0.012282,0.00783
NUMOFPRODUCTS,0.013418,1.0,0.003376,0.009459,-0.047601,-0.0323,0.021681,0.012203,-0.304226,0.01429
HASCRCARD,0.02274,0.003376,1.0,-0.011899,-0.006929,-0.019318,-0.005993,-0.005734,-0.014974,-0.009929
ISACTIVEMEMBER,-0.02818,0.009459,-0.011899,1.0,-0.156129,0.047848,-0.022376,0.025673,-0.010379,-0.011208
EXITED,-0.014117,-0.047601,-0.006929,-0.156129,1.0,0.327963,0.106244,-0.026909,0.118456,0.012123
AGECTGRY,-0.008377,-0.0323,-0.019318,0.047848,0.327963,1.0,0.029802,-0.012159,0.036727,0.007192
GENDER_LE,-0.014706,0.021681,-0.005993,-0.022376,0.106244,0.029802,1.0,0.002902,-0.011989,0.008005
CREDITSCORE_SC,0.000703,0.012203,-0.005734,0.025673,-0.026909,-0.012159,0.002902,1.0,0.00639,-0.001557
BALANCE_SC,-0.012282,-0.304226,-0.014974,-0.010379,0.118456,0.036727,-0.011989,0.00639,1.0,0.012704
ESTIMATEDSALARY_SC,0.00783,0.01429,-0.009929,-0.011208,0.012123,0.007192,0.008005,-0.001557,0.012704,1.0


In [25]:
featuresData.shape

(10002, 10)

In [26]:
trainData, testData = train_test_split(featuresData, test_size=0.2, random_state=10)

In [27]:
trainDataFeatures = trainData.loc[:, features]
trainDataLabels = trainData.loc[:, label]
trainDataAll = trainData.loc[:]
testDataFeatures = testData.loc[:, features]
testDataLabels = testData.loc[:, label]
testDataAll = testData.loc[:]

In [28]:
print(trainDataFeatures.shape)
print(trainDataLabels.shape)
print(testDataFeatures.shape)
print(testDataLabels.shape)
print(testDataAll.shape)

(8001, 4)
(8001, 1)
(2001, 4)
(2001, 1)
(2001, 10)


## 모델 적용

In [29]:
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings(action="ignore")

In [32]:
knnModel = KNeighborsClassifier(n_neighbors=3)
scvModel = SVC(C=1.0)
dtModel = DecisionTreeClassifier(random_state=10)

fittedKnnModel = knnModel.fit(trainDataFeatures, trainDataLabels)
fittedScvModel = scvModel.fit(trainDataFeatures, trainDataLabels)
fittedDtModel = dtModel.fit(trainDataFeatures, trainDataLabels)

## 예측

In [33]:
predictKnnValue = fittedKnnModel.predict(testDataFeatures)
predictSvcValue = fittedScvModel.predict(testDataFeatures)
predictDtValue = fittedDtModel.predict(testDataFeatures)

In [34]:
testDataAll["predict_knn"] = predictKnnValue
testDataAll["predict_svc"] = predictSvcValue
testDataAll["predict_dt"] = predictDtValue

In [39]:
testDataAll

Unnamed: 0,TENURE,NUMOFPRODUCTS,HASCRCARD,ISACTIVEMEMBER,EXITED,AGECTGRY,GENDER_LE,CREDITSCORE_SC,BALANCE_SC,ESTIMATEDSALARY_SC,predict_knn,predict_svc,predict_dt
937,5,1,0,1,0,50,1,1.000,0.512580,0.855505,0,0,0
9356,7,1,1,0,1,50,0,0.046,0.000000,0.388911,0,0,0
2293,0,1,1,0,1,50,1,0.678,0.230310,0.989057,0,0,1
192,1,2,0,0,0,50,0,0.258,0.000000,0.574979,0,0,0
6850,1,1,1,1,0,50,0,0.402,0.200060,0.116951,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5136,10,2,1,1,0,60,1,0.734,0.392041,0.108103,0,0,0
987,0,1,1,0,1,30,0,0.438,0.605183,0.511490,0,0,0
7117,5,1,1,1,1,30,0,0.428,0.484108,0.102112,0,0,0
2092,3,3,0,1,1,30,0,0.610,1.000000,0.405251,0,0,0


## 데이터 정리

In [37]:
from sklearn.metrics import accuracy_score

In [40]:
knnAccuray = accuracy_score( y_true=testDataAll.EXITED, y_pred = testDataAll.predict_knn)
svcAccuray = accuracy_score( y_true=testDataAll.EXITED, y_pred = testDataAll.predict_svc)
dtAccuray = accuracy_score( y_true=testDataAll.EXITED, y_pred = testDataAll.predict_dt)

In [41]:
accDf = pd.DataFrame( [[knnAccuray, svcAccuray, dtAccuray]] , columns=["knn_accuracy","svc_accuracy","dt_accuracy"])

## 예측 결과 저장

In [42]:
import joblib

In [43]:
modelDump = {}

In [44]:
modelDump["model"] = fittedKnnModel
modelDump["features"] = features
modelDump["label"] = label
modelDump["preprocessing"] = [balScaler, scrScaler,  esrScaler]

In [45]:
modelDump

{'model': KNeighborsClassifier(n_neighbors=3),
 'features': ['ISACTIVEMEMBER', 'AGECTGRY', 'GENDER_LE', 'BALANCE_SC'],
 'label': ['EXITED'],
 'preprocessing': [MinMaxScaler(), MinMaxScaler(), MinMaxScaler()]}

In [47]:
joblib.dump(modelDump, "kncore.dump")

['kncore.dump']

In [48]:
joblib.load("dtcore.dump")

{'model': KNeighborsClassifier(n_neighbors=3),
 'features': ['ISACTIVEMEMBER', 'AGECTGRY', 'GENDER_LE', 'BALANCE_SC'],
 'label': ['EXITED'],
 'preprocessing': [MinMaxScaler(), MinMaxScaler(), MinMaxScaler()]}