In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from time import time
from imblearn.over_sampling import RandomOverSampler

In [86]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/numpy numerical warnings raised by later cells.
warnings.simplefilter('ignore')
# Use a font family that contains Hangul glyphs so Korean labels render in plots.
plt.rcParams['font.family'] = 'malgun gothic'

In [87]:
# Load the training set.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
df = pd.read_csv(
    filepath_or_buffer='C:/Users/whileduck/Desktop/code/dataset/train.csv',
)
# Bare last expression: render the frame as the cell output.
df

Unnamed: 0,ID,가입일,음성사서함이용,주간통화시간,주간통화횟수,주간통화요금,저녁통화시간,저녁통화횟수,저녁통화요금,밤통화시간,밤통화횟수,밤통화요금,상담전화건수,전화해지여부
0,TRAIN_00000,329,0,99.2,93,27.3,268.8,68,28.92,262.9,328,32.89,2,0
1,TRAIN_00001,2,80,323.9,323,83.7,269.4,326,32.09,322.8,209,32.32,2,0
2,TRAIN_00002,93,28,282.4,323,34.2,207.0,322,32.82,280.8,328,8.28,0,0
3,TRAIN_00003,223,1,221.4,223,25.1,233.0,61,23.90,203.8,234,9.36,0,0
4,TRAIN_00004,222,0,96.3,222,28.7,223.9,69,28.08,263.1,223,2.80,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30195,TRAIN_30195,263,80,289.6,201,21.8,280.5,323,29.88,208.0,66,9.28,2,0
30196,TRAIN_30196,283,81,210.7,280,90.5,284.1,202,32.80,287.8,203,6.28,2,0
30197,TRAIN_30197,24,0,222.4,33,22.1,233.9,32,22.22,293.6,95,4.22,2,0
30198,TRAIN_30198,63,1,262.4,202,29.6,280.6,282,28.88,280.9,207,20.88,2,1


# 파생변수 생성 

* 주간·저녁·밤 값을 합산한 하루 단위 변수와 사용량 대비 요금 파생변수를 만든 후, 각 변수별로 이진 분할의 지니 불순도(Gini impurity)를 구해보자 

In [90]:
# Totals over the three time-of-day bands (day + evening + night).
# NOTE(review): the column names say "daily average" but the values are plain sums — confirm naming.
df['하루평균통화횟수'] = df['주간통화횟수'].add(df['저녁통화횟수']).add(df['밤통화횟수'])
df['하루평균통화시간'] = df['주간통화시간'].add(df['저녁통화시간']).add(df['밤통화시간']).round(2)
df['하루평균통화요금'] = df['주간통화요금'].add(df['저녁통화요금']).add(df['밤통화요금']).round(2)

# Charge per call and charge per unit of call time.
# NOTE(review): a row whose total count/time is 0 yields inf or NaN here — confirm none exist.
df['통화횟수대비요금'] = df['하루평균통화요금'].div(df['하루평균통화횟수']).round(2)
df['통화시간대비요금'] = df['하루평균통화요금'].div(df['하루평균통화시간']).round(2)

In [91]:
# Keep only the modelling columns, in a fixed order: identifier first, then the
# raw features, then the derived features, and the target last. Later cells
# rely on this layout when they slice columns with [1:-1].
df = df[
    ['ID', '가입일', '음성사서함이용']
    + ['주간통화시간', '주간통화횟수', '주간통화요금']
    + ['저녁통화시간', '저녁통화횟수', '저녁통화요금']
    + ['밤통화시간', '밤통화횟수', '밤통화요금']
    + ['상담전화건수']
    + ['하루평균통화횟수', '하루평균통화시간', '하루평균통화요금', '통화횟수대비요금', '통화시간대비요금']
    + ['전화해지여부']
]

In [92]:
def Gini(data, value, target, min_leaf_node, weighted=False):
    """Find the threshold on one column that gives the lowest-impurity binary split.

    Every distinct value ``t`` of ``data[value]`` is tried as a threshold that
    separates the rows into ``value >= t`` ("left") and ``value < t``
    ("right"). For each side the Gini impurity ``1 - p_pos**2 - p_neg**2`` is
    computed, where a row counts as positive when ``data[target] == 1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the feature and the target column.
    value : str
        Name of the feature column to split on.
    target : str
        Name of the target column; rows equal to 1 are the positive class.
    min_leaf_node : float
        Minimum share of rows each side must hold, compared with a strict
        ``>``. Thresholds that leave a smaller side are not considered.
    weighted : bool, default False
        If False, the score of a split is ``left_gini + right_gini`` (the
        behaviour this function has always had). If True, each side's
        impurity is weighted by its share of the rows, which is the split
        criterion used by CART decision trees.

    Returns
    -------
    tuple
        ``(threshold, score)`` of the best eligible split. Ties are resolved
        in favour of the smallest threshold.

    Raises
    ------
    ValueError
        If no threshold leaves more than ``min_leaf_node`` of the rows on both
        sides (for example when the column is constant).
    """
    # Rows with a missing feature value fall on neither side of any split,
    # so they are left out of all counts.
    present = data[value].notna().to_numpy()
    feature = data[value].to_numpy()[present]
    is_positive = (data[target] == 1).to_numpy()[present]

    # Sort once; every candidate split is then a prefix/suffix of the sorted
    # rows and its class counts come from a cumulative sum, instead of
    # re-filtering the whole frame for every distinct value.
    order = np.argsort(feature, kind='stable')
    feature = feature[order]
    is_positive = is_positive[order]

    # first_pos[i] is the number of rows strictly below thresholds[i].
    thresholds, first_pos = np.unique(feature, return_index=True)
    positives_before = np.concatenate(([0], np.cumsum(is_positive)))

    total = feature.size
    right_total = first_pos                       # rows with value <  threshold
    right_posi = positives_before[first_pos]
    left_total = total - right_total              # rows with value >= threshold
    left_posi = positives_before[-1] - right_posi

    # Decide eligibility before computing impurities so that an empty side
    # (e.g. the smallest threshold) never produces a 0/0 division.
    eligible = (
        (left_total > 0)
        & (right_total > 0)
        & (left_total / total > min_leaf_node)
        & (right_total / total > min_leaf_node)
    )
    if not eligible.any():
        raise ValueError(
            f"no split on column {value!r} leaves more than "
            f"{min_leaf_node} of the rows on both sides"
        )

    left_total, left_posi = left_total[eligible], left_posi[eligible]
    right_total, right_posi = right_total[eligible], right_posi[eligible]

    left_gini = 1 - (left_posi / left_total) ** 2 - ((left_total - left_posi) / left_total) ** 2
    right_gini = 1 - (right_posi / right_total) ** 2 - ((right_total - right_posi) / right_total) ** 2

    if weighted:
        scores = (left_total * left_gini + right_total * right_gini) / (left_total + right_total)
    else:
        scores = left_gini + right_gini

    # Thresholds are in ascending order, so argmin keeps the smallest one on ties.
    best = np.argmin(scores)
    return thresholds[eligible][best], scores[best]

In [93]:
# Feature columns: everything between the leading ID and the trailing target.
cols = df.columns.tolist()[1:-1]

# Best split (threshold, impurity) per feature; each side of a split must hold
# more than 20% of the rows.
split_results = [Gini(df, name, '전화해지여부', 0.2) for name in cols]

col_list = list(cols)
uniques_list = [threshold for threshold, impurity in split_results]
ginis_list = [impurity for threshold, impurity in split_results]

# One row per feature, lowest impurity first.
gini_df = (
    pd.DataFrame({'컬럼': col_list, '기준': uniques_list, '지니': ginis_list})
    .sort_values(by='지니')
    .reset_index(drop=True)
)

gini_df

Unnamed: 0,컬럼,기준,지니
0,하루평균통화요금,108.66,0.338017
1,통화시간대비요금,0.12,0.33872
2,주간통화요금,51.0,0.340921
3,음성사서함이용,25.0,0.343709
4,주간통화시간,213.0,0.346026
5,하루평균통화시간,668.3,0.363884
6,밤통화요금,5.27,0.369946
7,통화횟수대비요금,0.24,0.370028
8,상담전화건수,1.0,0.375209
9,가입일,53.0,0.379058


# 데이터 스케일링

In [78]:
# Load the test set and add the same derived features that were built for the training frame.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
test = pd.read_csv(
    filepath_or_buffer='C:/Users/whileduck/Desktop/code/dataset/test.csv',
)

# Totals over the three time-of-day bands (day + evening + night).
test['하루평균통화횟수'] = test['주간통화횟수'].add(test['저녁통화횟수']).add(test['밤통화횟수'])
test['하루평균통화시간'] = test['주간통화시간'].add(test['저녁통화시간']).add(test['밤통화시간']).round(2)
test['하루평균통화요금'] = test['주간통화요금'].add(test['저녁통화요금']).add(test['밤통화요금']).round(2)

# Charge per call and charge per unit of call time.
# NOTE(review): a row whose total count/time is 0 yields inf or NaN here — confirm none exist.
test['통화횟수대비요금'] = test['하루평균통화요금'].div(test['하루평균통화횟수']).round(2)
test['통화시간대비요금'] = test['하루평균통화요금'].div(test['하루평균통화시간']).round(2)

test

* 거리 기반 분류 알고리즘도 사용할 것이기 때문에 스케일링해 주자 

In [79]:
def min_max(df):
    """Rescale values linearly so the minimum maps to 0 and the maximum to 1.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Numeric data. A Series is scaled as a whole; a DataFrame is scaled
        column by column.

    Returns
    -------
    Same type as ``df``
        ``(df - min) / (max - min)``. Missing values are ignored when the
        minimum and maximum are taken and stay missing in the result. A
        constant input (``max == min``) is mapped to 0 instead of producing
        a 0/0 division.
    """
    # Use the pandas reductions rather than the built-in min()/max(): the
    # built-ins iterate element by element and give order-dependent results
    # as soon as a NaN is present.
    lowest = df.min()
    span = df.max() - lowest

    # Guard against a zero range so constant data scales to 0, not NaN/inf.
    if isinstance(span, pd.Series):
        span = span.where(span != 0, 1)
    elif span == 0:
        span = 1

    return (df - lowest) / span

In [80]:
# Feature columns: train drops the leading ID and the trailing target,
# test drops only the leading ID.
train_cols = df.columns.tolist()[1:-1]
test_cols = test.columns.tolist()[1:]

# Min-max scale each feature using the TRAINING minimum and range for both
# frames. Scaling the test set with its own min/max would map the same raw
# value to different scaled values in train and test, so a model fitted on the
# train features would see shifted inputs at prediction time. Test values can
# therefore fall slightly outside [0, 1]; that is expected.
for col in train_cols:
    col_min = df[col].min()
    col_span = df[col].max() - col_min
    if col_span == 0:
        # Constant training column: avoid 0/0, scale everything to 0.
        col_span = 1
    if col in test_cols:
        test[col] = (test[col] - col_min) / col_span
    # Overwrite the train column last so the statistics above come from raw values.
    df[col] = (df[col] - col_min) / col_span

# Columns that exist only in the test frame have no training statistics;
# fall back to scaling them on their own range.
for col in test_cols:
    if col not in train_cols:
        test[col] = min_max(test[col])

In [None]:
# The identifier carries no predictive signal; drop it before saving.
df = df.drop(columns=['ID'])
test = test.drop(columns=['ID'])

# Write the processed frames to the working directory, without the row index.
df.to_csv('daycon_train.csv', index=False)
test.to_csv('daycon_test.csv', index=False)