# 라이브러리 로드

In [23]:
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import random
import warnings
warnings.filterwarnings('ignore')

# 모델 생성
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

import joblib

# 노트북 안에 그래프를 그리기 위해
%matplotlib inline

# 그래프에서 마이너스 폰트 깨지는 문제에 대한 대처
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Malgun Gothic'


# 모델 불러오기

In [24]:
# 저장한 모델 불러와 변수에 담기
loaded_model = joblib.load('./SGD_model.pkl')
loaded_model

In [8]:
def churn_prediction(df):
    
    # 데이터 원본 min, max값 - 데이터에서 직접 가져올수도 있을듯
    age_max = 80
    age_min = 19
    number_of_dependents_max = 9
    number_of_dependents_min = 0
    satisfaction_score_max = 5
    satisfaction_score_min = 1
    tech_services_max = 4
    tech_services_min = 0
    streaming_services_max = 2
    streaming_services_min = 0
    combined_product_max = 4
    combined_product_min = 1
    tenure_in_months_max = 72
    tenure_in_months_min = 1
    monthly_charge_max = 118.75
    monthly_charge_min = 18.25
    total_revenue_max = 11979.34
    total_revenue_min = 21.36

    # 스케일링 변환 함수
    def transform_and_scale(value, min_val, max_val):
        if value > max_val:
            return (value - max_val) / (max_val - min_val)
        elif value < min_val:
            return (value - min_val) / (max_val - min_val)
        else:
            return (value - min_val) / (max_val - min_val)
    
    # 더미 변수 생성
    def create_membership_dummy(Membership):
        membership_dummy = [0, 0, 0, 0, 0, 0]  # None, Offer A, Offer B, Offer C, Offer D, Offer E에 대한 더미 변수

        if Membership == "None":
            membership_dummy[0] = 1
        elif Membership == "Offer A":
            membership_dummy[1] = 1
        elif Membership == "Offer B":
            membership_dummy[2] = 1
        elif Membership == "Offer C":
            membership_dummy[3] = 1
        elif Membership == "Offer D":
            membership_dummy[4] = 1
        elif Membership == "Offer E":
            membership_dummy[5] = 1

        return membership_dummy
    
    # 더미 변수 생성
    def create_contract_dummy(Contract):
        contract_dummy = [0, 0, 0]  # Month-to-Month, One Year, Two Year에 대한 더미 변수

        if Contract == "Month-to-Month":
            contract_dummy[0] = 1
        elif Contract == "One Year":
            contract_dummy[1] = 1
        elif Contract == "Two Year":
            contract_dummy[2] = 1

        return contract_dummy
    
    # 데이터 스케일링
    def scale_data(row):
        scaled_data = []
        input_values = row[['Age', 'Number of Dependents', 'Satisfaction Score', 'Tech services',
                            'Streaming services', 'Combined Product', 'Tenure in Months',
                            'Monthly Charge', 'Total Revenue']]
        min_values = [age_min, number_of_dependents_min, satisfaction_score_min, tech_services_min,
                      streaming_services_min, combined_product_min, tenure_in_months_min,
                      monthly_charge_min, total_revenue_min]
        max_values = [age_max, number_of_dependents_max, satisfaction_score_max, tech_services_max,
                      streaming_services_max, combined_product_max, tenure_in_months_max,
                      monthly_charge_max, total_revenue_max]

        for i in range(len(input_values)):
            scaled_data.append(transform_and_scale(input_values[i], min_values[i], max_values[i]))

        return scaled_data

    # 예측 및 결과 저장
    def predict_churn(row):
        scaled_data = row['Scaled Data']
        membership_dummy = row['Membership Dummy']
        contract_dummy = row['Contract Dummy']
        model_input = scaled_data + membership_dummy + contract_dummy
        churn_probability = loaded_model.predict_proba([model_input])[0][1] * 100  # 클래스 1의 확률을 선택
        return churn_probability
    
    train_df = df.copy()
    train_df['Membership Dummy'] = train_df['Membership'].apply(create_membership_dummy)
    train_df['Contract Dummy'] = train_df['Contract'].apply(create_contract_dummy)

    train_df['Scaled Data'] = train_df.apply(scale_data, axis=1)

    # 모델 로드
    loaded_model = joblib.load('./SGD_model.pkl')

    train_df['Churn Probability'] = train_df.apply(predict_churn, axis=1)

    result_df = train_df[['Customer ID', 'Churn Probability']]

#     # JSON 형식으로 변환
#     json_df = result_df.to_json(orient='records')

#     return json_df  # JSON 형식 반환

    return result_df

# 데이터 로드

In [26]:
df = pd.read_excel("Churn_final.xlsx")
df.head()

Unnamed: 0,Customer ID,Age,Number of Dependents,Membership,Satisfaction Score,Tech services,Streaming services,Combined Product,Contract,Tenure in Months,Monthly Charge,Total Revenue,Churn Value
0,8779-QRDMV,78,0,,2,1,1,2,Month-to-Month,1,39.65,59.65,1
1,7495-OOKFY,74,1,Offer E,5,1,0,1,Month-to-Month,8,80.65,1024.1,1
2,1658-BYGOY,71,3,Offer D,3,0,2,1,Month-to-Month,18,95.45,1910.88,1
3,4598-XLKNJ,78,1,Offer C,3,2,2,1,Month-to-Month,25,98.5,2995.07,1
4,4846-WHAFZ,80,1,Offer C,1,0,0,1,Month-to-Month,37,76.5,3102.36,1


In [11]:
result = churn_prediction(df)
result

Unnamed: 0,Customer ID,Churn Probability
0,8779-QRDMV,65.700984
1,7495-OOKFY,52.182243
2,1658-BYGOY,40.846450
3,4598-XLKNJ,56.178603
4,4846-WHAFZ,62.192832
...,...,...
7038,2569-WGERO,0.000000
7039,6840-RESVB,0.000000
7040,2234-XADUH,0.000000
7041,4801-JZAZL,26.726435


In [15]:
final = pd.concat([result, df.iloc[:,-4:-1]], axis=1)
final

Unnamed: 0,Customer ID,Churn Probability,Tenure in Months,Monthly Charge,Total Revenue
0,8779-QRDMV,65.700984,1,39.65,59.65
1,7495-OOKFY,52.182243,8,80.65,1024.10
2,1658-BYGOY,40.846450,18,95.45,1910.88
3,4598-XLKNJ,56.178603,25,98.50,2995.07
4,4846-WHAFZ,62.192832,37,76.50,3102.36
...,...,...,...,...,...
7038,2569-WGERO,0.000000,72,21.15,3039.53
7039,6840-RESVB,0.000000,24,84.80,2807.47
7040,2234-XADUH,0.000000,72,103.20,9453.04
7041,4801-JZAZL,26.726435,11,29.60,319.21


In [16]:
# 고위험 고객 그룹 : 이탈율 60% 이상
high_prob = final[final['Churn Probability'] >= 60]
high_prob

Unnamed: 0,Customer ID,Churn Probability,Tenure in Months,Monthly Charge,Total Revenue
0,8779-QRDMV,65.700984,1,39.65,59.65
4,4846-WHAFZ,62.192832,37,76.50,3102.36
5,4412-YLTKF,60.760640,27,78.05,2235.41
6,0390-DCFDQ,77.814355,1,70.45,85.73
10,0094-OIFMO,75.368663,11,95.00,1367.79
...,...,...,...,...,...
6946,7952-OBOYL,67.765819,3,89.85,319.75
6957,9108-EJFJP,64.125792,1,53.55,69.61
6960,2215-ZAFGX,77.709002,9,85.50,895.20
7004,5568-DMXZS,68.024949,8,65.45,853.41


In [18]:
# 고위험 고객 그룹의 이탈을 막았을 경우 추후 1년 간 유지할 수 있는 매출(요금제 변동 없다고 가정)
# 고위험 그룹 Monthly Charge * 12

final['Save'] = final["Monthly Charge"] * 12
final

Unnamed: 0,Customer ID,Churn Probability,Tenure in Months,Monthly Charge,Total Revenue,Save
0,8779-QRDMV,65.700984,1,39.65,59.65,475.8
1,7495-OOKFY,52.182243,8,80.65,1024.10,967.8
2,1658-BYGOY,40.846450,18,95.45,1910.88,1145.4
3,4598-XLKNJ,56.178603,25,98.50,2995.07,1182.0
4,4846-WHAFZ,62.192832,37,76.50,3102.36,918.0
...,...,...,...,...,...,...
7038,2569-WGERO,0.000000,72,21.15,3039.53,253.8
7039,6840-RESVB,0.000000,24,84.80,2807.47,1017.6
7040,2234-XADUH,0.000000,72,103.20,9453.04,1238.4
7041,4801-JZAZL,26.726435,11,29.60,319.21,355.2


In [28]:
feature_importances = loaded_model.coef_[0]
feature_importances

array([ 0.25338593, -1.72349128, -0.61616885, -0.32622475,  0.14485616,
       -0.25808713, -0.55096867,  0.553857  ,  0.        ,  0.        ,
        0.21158883, -0.05426157, -0.0598876 , -0.15508016,  0.21463461,
        0.52916213,  0.        , -0.34187111])

In [31]:
import matplotlib.pyplot as plt

# 피처 중요도를 내림차순으로 정렬합니다.
sorted_idx = feature_importances.argsort()[::-1]

# 중요도 순서대로 피처 이름과 중요도를 추출합니다.
sorted_features = X_train.columns[sorted_idx]
sorted_importances = feature_importances[sorted_idx]

# 중요도가 높은 상위 N개의 피처를 선택합니다.
top_n = 10  # 상위 N개 피처를 선택하도록 수정 가능
top_features = sorted_features[:top_n]
top_importances = sorted_importances[:top_n]

# 중요도에 따른 가로 막대 그래프를 그립니다.
plt.figure(figsize=(10, 6))
plt.barh(range(top_n), top_importances, align='center')
plt.yticks(range(top_n), top_features)
plt.xlabel('Feature Importance')
plt.title('Top {} Feature Importances'.format(top_n))
plt.show()


NameError: name 'X_train' is not defined