# 라이브러리 로드

In [12]:
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import warnings
warnings.filterwarnings('ignore')

# 모델 생성
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report


# 가상고객 데이터 생성
import random
pd.options.display.float_format = '{:.5f}'.format


%matplotlib inline

# 모델 생성

In [25]:
def churn_prediction(df, test_df=None):
    """Train the churn classifier on ``df`` and return one churn probability.

    The function one-hot encodes the ``Membership`` and ``Contract`` columns,
    splits the data 80/20, min-max scales the int64/float64 feature columns
    (fitted on the training split only), and fits an ``SGDClassifier``.

    Args:
        df: Labelled training data. Must contain ``Membership``, ``Contract``
            and the target column ``Churn Value``.
        test_df: Optional unlabelled customer data with the same raw feature
            columns as ``df``. When given, its first row is scored. When
            omitted, the first row of the held-out test split is scored,
            which is what the single-argument version of this function did.

    Returns:
        The value in column 1 of ``predict_proba`` for the scored row, i.e.
        the probability of the second entry of ``best_model.classes_``.
        NOTE(review): this is the churn probability only if ``Churn Value``
        is a binary 0/1 label -- confirm against the data.

    Raises:
        ValueError: If ``test_df`` is given but has no rows, or lacks a
            numeric feature column the model was trained on.
    """
    categorical_columns = ['Membership', 'Contract']

    # One-hot encode the categorical columns.
    encoding_df = pd.get_dummies(df, columns=categorical_columns)

    # Separate the target from the features, then split train / test.
    y_target = encoding_df['Churn Value']
    X_data = encoding_df.drop(columns=['Churn Value'])

    X_train, X_test, y_train, _ = train_test_split(
        X_data, y_target, test_size=0.2, random_state=156)
    # Explicit copies: the scaled values are assigned back into these frames.
    X_train, X_test = X_train.copy(), X_test.copy()

    # Only int64/float64 columns are scaled; the dummy columns are left as-is.
    numeric_columns = [
        column for column in X_data.columns
        if X_data[column].dtype in ('float64', 'int64')
    ]

    # Fit the scaler on the training split only, then reuse it everywhere else.
    scaler = MinMaxScaler()
    if numeric_columns:
        X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
        X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

    # Train the selected model.
    best_model = SGDClassifier(random_state=42, alpha=0.001, loss='modified_huber',
                               max_iter=100, penalty='l1', tol=1e-05)
    best_model.fit(X_train, y_train)

    if test_df is None:
        customers = X_test
    else:
        if test_df.empty:
            raise ValueError("test_df must contain at least one customer row")

        missing_numeric = [c for c in numeric_columns if c not in test_df.columns]
        if missing_numeric:
            raise ValueError(
                f"test_df is missing numeric feature columns: {missing_numeric}")

        present_categoricals = [c for c in categorical_columns if c in test_df.columns]
        customers = pd.get_dummies(test_df, columns=present_categoricals)
        # Match the training feature set exactly: categories never seen in
        # training are dropped, and dummy columns absent from test_df are
        # added as all-zero columns, in the training column order.
        customers = customers.reindex(columns=X_train.columns, fill_value=0)
        if numeric_columns:
            customers[numeric_columns] = scaler.transform(customers[numeric_columns])

    # Score a single customer (the first row) and return the class-1 column.
    predicted_proba = best_model.predict_proba(customers.iloc[0:1])
    probability_of_class_1 = predicted_proba[0][1]

    return probability_of_class_1

# 학습에 이용할 데이터 로드

In [26]:
# Read the training data from the Excel workbook in the working directory
# and preview its first rows.
df = pd.read_excel("Churn.xlsx")
df.head()

Unnamed: 0,Age,Number of Dependents,Membership,Satisfaction Score,Tech services,Streaming services,Combined Product,Contract,Tenure in Months,Monthly Charge,Total Revenue,Churn Value
0,78,0,,3,1,1,2,Month-to-Month,1,39.65,59.65,1
1,74,1,Offer E,3,1,0,1,Month-to-Month,8,80.65,1024.1,1
2,71,3,Offer D,2,0,2,1,Month-to-Month,18,95.45,1910.88,1
3,78,1,Offer C,2,2,2,1,Month-to-Month,25,98.5,2995.07,1
4,80,1,Offer C,2,0,0,1,Month-to-Month,37,76.5,3102.36,1


# 가상고객데이터 생성

In [27]:
# Build 100 synthetic customers with one DataFrame constructor call.
# The dict keeps insertion order, so the random draws run in the same
# sequence as the columns appear below.
test_df = pd.DataFrame({
    'Age': np.random.randint(19, 81, size=100),
    'Number of Dependents': np.random.randint(0, 6, size=100),
    'Membership': np.random.choice(
        ['None', 'Offer A', 'Offer B', 'Offer C', 'Offer D', 'Offer E'], size=100),
    'Satisfaction Score': np.random.randint(1, 6, size=100),
    'Tech services': np.random.randint(0, 5, size=100),
    'Streaming services': np.random.randint(0, 3, size=100),
    'Combined Product': np.random.randint(1, 5, size=100),
    'Contract': np.random.choice(
        ['Month-to-Month', 'One Year', 'Two Year'], size=100),
    'Tenure in Months': np.random.randint(1, 73, size=100),
    'Monthly Charge': np.random.randint(18, 119, size=100),
})

# 'Total Revenue': tenure x monthly charge, minus a random integer offset
# drawn from [-50, 150], floored at zero.
random_values = np.random.randint(-50, 151, size=100)
revenues = test_df['Tenure in Months'] * test_df['Monthly Charge'] - random_values
test_df['Total Revenue'] = revenues.clip(lower=0)

test_df

Unnamed: 0,Age,Number of Dependents,Membership,Satisfaction Score,Tech services,Streaming services,Combined Product,Contract,Tenure in Months,Monthly Charge,Total Revenue
0,70,5,Offer D,5,2,0,4,Month-to-Month,65,103,6729
1,46,2,Offer C,4,0,2,3,Month-to-Month,22,112,2426
2,66,1,,4,2,1,2,Two Year,15,70,960
3,26,4,Offer A,3,4,0,3,Month-to-Month,59,64,3637
4,36,3,,3,4,1,4,Two Year,58,87,4980
...,...,...,...,...,...,...,...,...,...,...,...
95,19,1,Offer E,2,4,1,2,One Year,32,72,2193
96,61,3,Offer C,5,3,2,4,Two Year,29,95,2707
97,54,5,Offer C,2,1,2,2,Month-to-Month,71,90,6342
98,59,1,Offer E,5,4,2,3,Month-to-Month,49,100,4782


# 함수 호출

In [None]:
# Load the data and call the function

# Data used for training


# Virtual customer data

test_df
result = churn_prediction(df, test_df)

# Report the returned probability, and the same value as an integer score
# out of 100 (truncated by int()).
print("가상 고객이 이탈할 확률:", result)
print("가상 고객 이탈 점수:", int(result * 100), "점")
