In [None]:
import pandas as pd
import numpy as np
import os
import random

import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import matplotlib.patches as mpatches

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import log_loss, roc_curve, auc, RocCurveDisplay,  silhouette_score, classification_report, accuracy_score,confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_predict, train_test_split
from sklearn.linear_model import LogisticRegression
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Binomial
from statsmodels.genmod.families.links import logit
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.preprocessing import LabelEncoder

from sklearn.cluster import KMeans

from mpl_toolkits.mplot3d import Axes3D

import warnings
# Silence every warning for the rest of the session (this also hides pandas/seaborn deprecation notices).
warnings.filterwarnings("ignore")
# Switch the working directory so the relative output paths used below (figure folders, Excel export) resolve there.
# NOTE(review): absolute, user-specific path — os.chdir raises if this directory does not exist on the machine.
os.chdir("/Users/yj.noh/Documents/GitHub/prj_on_boarding")
print(os.getcwd())
# Font family used by matplotlib for all figures; presumably chosen so the Korean labels render — confirm.
plt.rcParams['font.family'] = 'AppleGothic'

# 1. data load

In [None]:
# Read the onboarding CSV (decoded as cp949) into `df`.
# NOTE(review): absolute, user-specific path outside the project directory — consider a configurable data dir.
df = pd.read_csv("/Users/yj.noh/Desktop/new_on_boarding_data.csv", encoding = "cp949")
print(df.shape) # author's note: 16,134 — presumably the row count at time of writing
print(df.isna().sum())  # missing-value count per column

### 친구추천 아닌 사람만! 

In [None]:
# Earlier filter variants, kept for reference:
#data_filter = df[((df['day_cnt'].notnull()) & (df['delivery_method'] == 'BIKE') & (df['is_recom'] == 0))]
#data_filter = df[((df['day_cnt'].notnull()) & (df['delivery_method'] == 'BIKE'))]

# Keep rows that have a non-null active_days value and is_recom == 0.
# This rebinds `df`, so the unfiltered frame is no longer reachable after this cell.
df =  df[((df['active_days'].notnull()) & (df['is_recom'] == 0))]
print(df.isna().sum())
print(df.shape)  # author's note: 10,245 — presumably the row count after filtering

In [None]:
# Class balance of the target column (`outcome` is used as y further below).
print(df['outcome'].value_counts()) 

# 2. 그래프 그리기

In [None]:
# Columns plotted as categorical variables (count plots) and as numeric variables (distribution plots).
category_vars = ['birth', 'delivery_method', 'insurance_type', 'is_recom', 'gender']
num_vars = ['active_days', 'avg_daily_delivery', 'avg_distance', 'avg_fee', 'avg_distance_1_to_3', 'avg_fee_1_to_3', 'join_period', 'from_join_to_first_able', 'from_first_able_to_start']

## 2-1. numeric

In [None]:
# palette = plasma, coolwarm, magma, BuGn, Dark2 
def plot_numeric (data, numeric_vars, outcome):
    """Plot and save the distribution of each numeric variable, split by outcome.

    For every column in ``numeric_vars`` a 1x3 figure (boxplot, violin plot,
    density plot) is drawn, saved to ``graphs_all/<column>_distributions.png``
    and shown.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the numeric columns and the outcome column.
        It is NOT modified; relabelling happens on a copy.
    numeric_vars : iterable of str
        Names of the numeric columns to plot.
    outcome : str
        Name of the outcome column; values 0/1 are displayed as '미이탈'/'이탈'.
    """
    os.makedirs('graphs_all/', exist_ok = True)

    palette = 'coolwarm'

    # Relabel the outcome on a copy so the caller's frame keeps its original
    # codes (the previous version overwrote the caller's column in place).
    mapping = {0 : '미이탈', 1 : '이탈'}
    plot_data = data.assign(**{outcome: data[outcome].replace(mapping)})

    for num_var in numeric_vars:
        fig, axs = plt.subplots(1,3, figsize=(12,4))
        fig.suptitle(f'{num_var} 분포', fontsize = 12)

        # Boxplot
        sns.boxplot(ax=axs[0], x= outcome, y=num_var, data= plot_data, palette = palette)
        axs[0].set_title('Boxplot')

        # Violin plot
        sns.violinplot(ax=axs[1], x = outcome, y = num_var, data = plot_data, palette = palette)
        axs[1].set_title('Violinplot')

        # Density plot; common_norm=False normalises each outcome group separately
        sns.kdeplot(ax=axs[2], data = plot_data, x = num_var, hue = outcome, fill = True, common_norm = False, palette = palette, alpha=.5, linewidth=0)
        axs[2].set_title('Density plot')

        plt.tight_layout()
        fig.subplots_adjust(top=0.8) # leave room for the figure title

        fig.savefig(f'graphs_all/{num_var}_distributions.png')
        plt.show()
        plt.close(fig)  # release the figure so the loop does not accumulate open figures
        
        

In [None]:
# Draw and save the numeric-variable distribution figures for the filtered data.
plot_numeric(df, num_vars, 'outcome')

In [None]:
# plt.figure(figsize=(8,6))
# sns.boxplot(data=data_filter, x='outcome', y='avg_cnt')
# plt.title("outcome 별 일평균수행처리건수")
# plt.savefig("prj_on_boarding/boxplot1.png")
# plt.show()

# plt.figure(figsize=(8,6))
# sns.boxplot(data=data_filter, x='outcome', y='day_cnt')
# plt.title("outcome 별 수행일수")
# plt.savefig("prj_on_boarding/boxplot2.png")
# plt.show()

# # day_cnt
# plt.figure(figsize=(8,6))
# sns.violinplot(x='outcome', y='day_cnt', data=data_filtered_both)
# plt.title('Distribution of day_cnt by outcome')
# plt.show()

# # avg_cnt
# plt.figure(figsize=(8,6))
# sns.violinplot(x='outcome', y='avg_cnt', data=data_filtered_both)
# plt.title('Distribution of avg_cnt by outcome')
# plt.show()


## 2-2. category 변수

In [None]:
def plot_category(data, category_vars, outcome) : 
    """Plot and save a count plot of each categorical variable, split by outcome.

    Each figure is saved to ``graphs_all/<column>_distributions.png`` and shown.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the categorical columns and the outcome column.
        It is NOT modified; relabelling happens on a copy.
    category_vars : iterable of str
        Names of the categorical columns to plot.
    outcome : str
        Name of the outcome column; values 0/1 are displayed as '미이탈'/'이탈'.
    """
    # Relabel the outcome on a copy so the caller's frame keeps its original
    # codes (the previous version overwrote the caller's column in place).
    mapping  = {0 : '미이탈', 1 : '이탈'}
    plot_data = data.assign(**{outcome: data[outcome].replace(mapping)})

    palette = 'coolwarm'

    # Create the output directory
    os.makedirs('graphs_all', exist_ok = True)

    for cat_var in category_vars : 
        fig, ax = plt.subplots(figsize = (12,4))
        sns.countplot(x=cat_var, hue=outcome, data=plot_data, palette=palette, ax=ax)
        ax.set_title(f'이탈여부에 따른 {cat_var} 분포')
        ax.set_ylabel('개수')
        ax.legend(title=outcome, loc='upper right')
        ax.tick_params(axis='x', labelsize=8)  # smaller x tick labels

        # Annotate every bar with its count
        for p in ax.patches:
            height = p.get_height()
            if np.isnan(height):
                # int() raises on NaN, so bars without a finite height are skipped
                continue
            ax.annotate(f'{int(height)}',  # bar height, i.e. the count
                        (p.get_x() + p.get_width() / 2., height),  # anchor at the top centre of the bar
                        ha='center',
                        va='center',
                        xytext=(0, 10),  # shift the label 10 points upward
                        textcoords='offset points',
                        fontsize=10)

        fig.tight_layout()
        fig.savefig(f'graphs_all/{cat_var}_distributions.png')
        plt.show()
        plt.close(fig)  # release the figure so the loop does not accumulate open figures
        
       

In [None]:
# Draw and save the categorical-variable count plots for the filtered data.
plot_category(df, category_vars, 'outcome')

In [None]:
# Row count per delivery method (author's list of values: bike, p_bicycles, car, g_bicycles, walk, throttle_bicycles, kickboard).
print(df['delivery_method'].value_counts()) # bike, p_bicycles, car, g_bicycles, walk, throttle_bicycles, kickboard 

## 3. 단일 배달수단만 대상으로 진행 (제목은 BIKE, 현재 코드의 필터는 WALK)

In [None]:
# Subset used for the rest of the analysis.
# NOTE(review): the variable is named data_bike but the filter selects 'WALK' —
# confirm which delivery method is intended.
# .copy() makes this an independent frame, so the column assignment performed
# on it afterwards does not operate on a slice of `df`.
data_bike = df[(df['delivery_method'] == 'WALK')].copy()

print(data_bike.shape)
print(data_bike['delivery_method'].value_counts()) 

# Alternative subset (bicycle / kickboard methods), kept for reference:
#data_bike = df[df['delivery_method'].isin(['PAS_BICYCLES', 'GENERAL_BICYCLES', 'THROTTLE_BICYCLES', 'KICKBOARD'])]

# 3-1.numeric - outlier 제거

In [None]:
# Restore the churn labels to numeric codes ('미이탈' -> 0, '이탈' -> 1).
# Values that are not one of the two mapping keys are left unchanged by replace().
mapping  = { '미이탈' : 0 ,  '이탈' : 1}
data_bike['outcome'] = data_bike['outcome'].replace(mapping)

# outlier 제거  - outcome 별로 나눠서 
def remove_outliers(df, column, group_col='outcome', k=1.5):
    """Drop rows whose ``column`` value is an IQR outlier within its group.

    Quartiles are computed separately for every value of ``group_col``; a row
    is kept when its value lies in ``[Q1 - k*IQR, Q3 + k*IQR]`` of its own group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Numeric column to screen for outliers.
    group_col : str, default 'outcome'
        Column whose values define the groups.
    k : float, default 1.5
        Multiplier applied to the IQR when computing the bounds.

    Returns
    -------
    pandas.DataFrame
        A copy containing only the kept rows, original index preserved.
        Rows with a missing value in ``column`` or ``group_col`` are dropped
        because the comparisons against the bounds evaluate to False.
    """
    grouped = df.groupby(group_col)[column]
    Q1 = grouped.quantile(0.25)
    Q3 = grouped.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - k * IQR
    upper_bound = Q3 + k * IQR

    # Look up each row's group bounds in one vectorised step instead of a
    # Python-level row-wise apply; unknown/NaN groups map to NaN bounds.
    row_lower = df[group_col].map(lower_bound)
    row_upper = df[group_col].map(upper_bound)

    keep = (df[column] >= row_lower) & (df[column] <= row_upper)
    return df[keep].copy()


In [None]:
# Remove outliers on both 'active_days' and 'avg_daily_delivery' (per outcome group).
# The second call runs on the output of the first so that BOTH filters end up
# applied; previously it restarted from data_bike, which left data_filtered_avg
# unused and filtered only 'avg_daily_delivery'.
data_filtered_avg = remove_outliers(data_bike, 'active_days')
data_filtered_both = remove_outliers(data_filtered_avg, 'avg_daily_delivery')

print(data_filtered_both.shape)  
print(data_filtered_both['outcome'].value_counts())

# Author's recorded counts from before the filters were chained:
# 3,393 / 2,255, 1,138 -> 1,782/ 1203, 579 — re-check after this change.
data_filtered_both.to_excel('data_filtered_both.xlsx', index=False, engine='openpyxl')

# 3-2 그래프 확인

In [None]:
# palette = plasma, coolwarm, magma, BuGn, Dark2 
def plot_numeric (data, numeric_vars, outcome):
    """Draw box / violin / density panels for every numeric column and save them.

    One 1x3 figure per entry of ``numeric_vars`` is written to
    ``graph_walk/<column>_distributions_bike.png`` and displayed. The outcome
    column is plotted with whatever values it holds; ``data`` is not altered.
    This definition replaces the earlier ``plot_numeric`` in the notebook.
    """
    color_scheme = 'coolwarm'
    os.makedirs('graph_walk', exist_ok = True)

    for column in numeric_vars:
        figure, (box_ax, violin_ax, density_ax) = plt.subplots(1, 3, figsize=(12, 4))
        figure.suptitle(f'{column} 분포', fontsize = 12)

        # Arguments common to all three seaborn calls
        shared = dict(data=data, palette=color_scheme)

        sns.boxplot(x=outcome, y=column, ax=box_ax, **shared)
        box_ax.set_title('Boxplot')

        sns.violinplot(x=outcome, y=column, ax=violin_ax, **shared)
        violin_ax.set_title('Violinplot')

        # One filled density curve per outcome value, each normalised on its own
        sns.kdeplot(x=column, hue=outcome, ax=density_ax, fill=True,
                    common_norm=False, alpha=.5, linewidth=0, **shared)
        density_ax.set_title('Density plot')

        plt.tight_layout()
        figure.subplots_adjust(top=0.8)  # keep space for the figure title

        figure.savefig(f'graph_walk/{column}_distributions_bike.png')
        plt.show()
        plt.close(figure)
    
def plot_category(data, category_vars, outcome) : 
    """Plot and save a count plot of each categorical variable, split by outcome.

    Each figure is saved to ``graph_walk/<column>_distributions_bike.png`` and
    shown. This definition replaces the earlier ``plot_category`` in the notebook.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the categorical columns and the outcome column.
        It is NOT modified; relabelling happens on a copy.
    category_vars : iterable of str
        Names of the categorical columns to plot.
    outcome : str
        Name of the outcome column; values 0/1 are displayed as '미이탈'/'이탈'.
    """
    # Relabel the outcome on a copy so the caller's frame keeps its original
    # codes (the previous version overwrote the caller's column in place).
    mapping  = {0 : '미이탈', 1 : '이탈'}
    plot_data = data.assign(**{outcome: data[outcome].replace(mapping)})

    palette = 'coolwarm'

    # Make sure the output directory exists even when this function is called
    # without plot_numeric having run first.
    os.makedirs('graph_walk', exist_ok = True)

    for cat_var in category_vars : 
        fig, ax = plt.subplots(figsize = (12,4))
        # hue follows the `outcome` argument (it was hard-coded to 'outcome' before)
        sns.countplot(x=cat_var, hue=outcome, data=plot_data, palette=palette, ax=ax)
        ax.set_title(f'이탈여부에 따른 {cat_var} 분포')
        ax.set_ylabel('개수')
        ax.legend(title=outcome, loc='upper right')
        ax.tick_params(axis='x', labelsize=8)  # smaller x tick labels

        # Annotate every bar with its count
        for p in ax.patches:
            height = p.get_height()
            if np.isnan(height):
                # int() raises on NaN, so bars without a finite height are skipped
                continue
            ax.annotate(f'{int(height)}',  # bar height, i.e. the count
                        (p.get_x() + p.get_width() / 2., height),  # anchor at the top centre of the bar
                        ha='center',
                        va='center',
                        xytext=(0, 10),  # shift the label 10 points upward
                        textcoords='offset points',
                        fontsize=10)

        fig.tight_layout()
        fig.savefig(f'graph_walk/{cat_var}_distributions_bike.png')
        plt.show()
        plt.close(fig)  # release the figure so the loop does not accumulate open figures


In [None]:
# Re-draw the numeric and categorical figures on the outlier-filtered subset.
plot_numeric(data_filtered_both, num_vars, 'outcome')
plot_category(data_filtered_both, category_vars, 'outcome')

In [None]:
#data_filtered_both = df

# Restore the churn labels to numeric codes ('미이탈' -> 0, '이탈' -> 1);
# values that are already 0/1 are left unchanged by replace().
mapping  = { '미이탈' : 0 ,  '이탈' : 1}
data_filtered_both['outcome'] = data_filtered_both['outcome'].replace(mapping)

# Split the filtered rows by outcome. Despite the X_test_* names these are not
# test-set rows: they are all of data_filtered_both, partitioned by outcome.
X_test_0 = data_filtered_both[data_filtered_both['outcome'] == 0]
X_test_1 = data_filtered_both[data_filtered_both['outcome'] == 1]

plt.figure(figsize=(12, 6))

# Bivariate KDE per outcome group.
# cbar=True adds a colour bar for each density; thresh hides the lowest-density region.
# fill=True replaces the deprecated shade=True and matches the other kdeplot calls in this notebook.
sns.kdeplot(data=X_test_0, x='active_days', y='avg_daily_delivery', cmap='Blues', fill=True, alpha=0.5, cbar=True, thresh=0.05)
sns.kdeplot(data=X_test_1, x='active_days', y='avg_daily_delivery', cmap='Oranges', fill=True, alpha=0.5, cbar=True, thresh=0.05)

plt.xlabel('수행일자')
plt.ylabel('일평균수행건수')
plt.title('Density plot of active days and avg_daily delivery by churn')

# Manual legend: the colormap-based densities get no automatic legend entries here
blue_patch = mpatches.Patch(color='blue', label='미이탈')
orange_patch = mpatches.Patch(color='orange', label='이탈')
plt.legend(handles=[blue_patch, orange_patch])

# Create the output directory (disabled)
#os.makedirs('graphs', exist_ok=True)  

# Save the figure to a file (disabled)
#plt.savefig("graphs_bike/density_plot.png")

# Display the figure
plt.show()

# 4. train, test set split 

In [None]:
# train_set, test_set = train_test_split(data_filtered_both, test_size = 0.25, 
#                                        stratify = data_filtered_both['outcome'], random_state=1234)
# X_train = train_set[['birth', 'delivery_method', 'insurance_type', 'is_recom', 'gender','active_days', 'avg_daily_delivery', 'avg_distance', 'avg_fee', 'avg_distance_1_to_3', 'avg_fee_1_to_3', 'join_period', 'from_join_to_first_able', 'from_first_able_to_start', 'working_period']]
# y_train = train_set['outcome']
# X_test =  test_set[['birth', 'delivery_method', 'insurance_type', 'is_recom', 'gender','active_days', 'avg_daily_delivery', 'avg_distance', 'avg_fee', 'avg_distance_1_to_3', 'avg_fee_1_to_3', 'join_period', 'from_join_to_first_able', 'from_first_able_to_start', 'working_period']]
# y_test = test_set['outcome']

# print(X_train.shape, X_test.shape)  
# print(y_train.value_counts())
# print(y_test.value_counts())

In [None]:
# Stratified 75/25 split on the outcome column.
train_set, test_set = train_test_split(
    data_filtered_both,
    test_size=0.25,
    stratify=data_filtered_both['outcome'],
    random_state=1234,
)

# One LabelEncoder per categorical column, fitted on the full filtered frame.
le_delivery_method = LabelEncoder()
le_insurance_type = LabelEncoder()
le_gender = LabelEncoder()

categorical_encoders = {
    'delivery_method': le_delivery_method,
    'insurance_type': le_insurance_type,
    'gender': le_gender,
}
categorical_cols = list(categorical_encoders)

for cat_col, cat_encoder in categorical_encoders.items():
    data_filtered_both[f'{cat_col}_encoded'] = cat_encoder.fit_transform(data_filtered_both[cat_col])

# The split was taken before the encoded columns existed, so copy them over by
# index label and then drop the raw categorical columns from each subset.
for subset in (train_set, test_set):
    for cat_col in categorical_cols:
        subset[f'{cat_col}_encoded'] = data_filtered_both.loc[subset.index, f'{cat_col}_encoded']

for subset in (train_set, test_set):
    subset.drop(categorical_cols, axis=1, inplace=True)

# Model inputs, in the column order used for training.
feature_cols = [
    'birth', 'delivery_method_encoded', 'insurance_type_encoded', 'is_recom', 'gender_encoded',
    'active_days', 'avg_daily_delivery', 'avg_distance', 'avg_fee',
    'avg_distance_1_to_3', 'avg_fee_1_to_3', 'join_period',
    'from_join_to_first_able', 'from_first_able_to_start',
]

X_train = train_set[feature_cols]
y_train = train_set['outcome']

X_test = test_set[feature_cols]
y_test = test_set['outcome']

# Original category -> integer code, for reading the encoded features.
delivery_method_mapping = dict(zip(le_delivery_method.classes_, le_delivery_method.transform(le_delivery_method.classes_)))
insurance_type_mapping = dict(zip(le_insurance_type.classes_, le_insurance_type.transform(le_insurance_type.classes_)))
gender_mapping = dict(zip(le_gender.classes_, le_gender.transform(le_gender.classes_)))

print("delivery_method:", delivery_method_mapping)
print("insurance_type:", insurance_type_mapping)
print("gender:", gender_mapping)


# 5. Decision Tree

In [None]:
depth_range = list(range(1, 11))  # candidate max_depth values 1..10
cv_scores = []
# random.seed only seeds Python's `random` module; scikit-learn does not use it,
# so the estimators below also receive an explicit random_state.
random.seed(2234)

for depth in depth_range:
    clf = DecisionTreeClassifier(max_depth=depth, random_state=2234)
    scores = cross_val_score(clf, X_train, y_train, cv=10)  # 10-fold cross-validation
    cv_scores.append(scores.mean())

# Pick the depth with the highest mean CV score (the smallest depth wins ties,
# because list.index returns the first maximum).
optimal_depth = depth_range[cv_scores.index(max(cv_scores))]
print(f"Optimal max_depth is {optimal_depth}")

# Refit on the full training set with the selected depth
clf = DecisionTreeClassifier(max_depth=optimal_depth, random_state=2234)
clf = clf.fit(X_train, y_train)

# Accuracy on the training and test sets
print(f"Training Accuracy: {clf.score(X_train, y_train):.4f}")
print(f"Test Accuracy: {clf.score(X_test, y_test):.4f}")

# Display names: the training columns with the '_encoded' suffix removed
feature_names = [col.replace('_encoded', '') for col in X_train.columns]

# Visualise the fitted tree on the axes created here
fig, ax = plt.subplots(figsize=(24, 8))
tree.plot_tree(clf, filled=True, feature_names=feature_names, class_names=['0', '1'], rounded=True, fontsize=10, ax=ax)
#plt.savefig("graphs_bike/dt tree 1.png")
plt.show()

# Mean CV score as a function of max_depth
plt.figure(figsize=(10, 6))
plt.plot(depth_range, cv_scores, marker='o', linestyle='-')
plt.xlabel("max_depth")
plt.ylabel("Mean CV Score")
plt.title("Mean CV Score vs. max_depth")
plt.grid(True)
#plt.savefig("graphs_bike/cv_dt tree.png")
plt.show()

## 특성 중요도 출력 

In [None]:
# Rank the fitted tree's features from most to least important and print them.
importances = clf.feature_importances_
sorted_indices = np.argsort(importances)[::-1]  # positions in descending importance

print("Feature importances:")
ranked_features = [(feature_names[pos], importances[pos]) for pos in sorted_indices]
for feature_label, weight in ranked_features:
    print(f"{feature_label}: {weight:.4f}")


In [None]:
# model 학습 
dep_n = 2
random.seed(24534)
clf = DecisionTreeClassifier(max_depth = dep_n)  
clf = clf.fit(X_train, y_train)

# 모델의 정확도 확인
print(f"Training Accuracy: {clf.score(X_train, y_train):.4f}")
print(f"Test Accuracy: {clf.score(X_test, y_test):.4f}")

# Decision Tree 시각화
fig, ax = plt.subplots(figsize=(14, 6))
tree.plot_tree(clf, filled=True, feature_names=['active_days', 'avg_daily_delivery'], class_names=['0', '1'], rounded=True, fontsize=10)
#plt.savefig("graphs_is_not_recom_bike/DT tree.png")
plt.show()


# 6. 연관규칙

In [None]:
# NOTE(review): these imports sit mid-notebook rather than in the top import cell.
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Convert the dataset into a one-hot encoded item table.
# NOTE(review): `data` is not assigned in any cell visible in this notebook section —
# unless it is defined elsewhere, this line raises NameError on a fresh kernel; confirm.
te = TransactionEncoder()
te_ary = te.fit(data).transform(data)
# This rebinds `df`, replacing the onboarding DataFrame loaded at the top of the notebook.
df = pd.DataFrame(te_ary, columns=te.columns_)

# Find frequent itemsets with the Apriori algorithm (min_support=0.01).
frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)

# Extract association rules using confidence as the metric (threshold 0.1).
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1)

# Keep only the rules related to churn: those whose consequents contain 'outcome'.
# NOTE(review): this matches only if the transactions contain an item literally named 'outcome' — confirm the item encoding.
churn_rules = rules[rules['consequents'].apply(lambda x: 'outcome' in x)]
