In [8]:
# Basic module
import numpy as np
import pandas as pd

# Graphic module
import matplotlib.pyplot as plt
import seaborn as sns

# Chi-Square module
from scipy.stats import chi2, chi2_contingency, pointbiserialr  # Not exist in basic anaconda. Install need(pip install scipy or conda install scipy)
from scipy.stats.contingency import association

## Data loading

### Data load

In [9]:
labeled_data = pd.read_csv("./result_data/labeling_data.csv", header=0, index_col=0, encoding='utf-8')

### Data check

> churn : 해지 여부에 대한 데이터로, 범주-명목형 데이터

Categorical과 Numerical Data 두가지가 혼재되어 있음

- Categorical - Categorical : Chi-Square (카이 제곱 검정)
- Categorical - Numerical(Continuous) : t 검정, Point biserial correlation

## Correlationship Test

### Chi-Square :: Categorical - Categorical

In [10]:
# Joined 상태 Stayed로 변경
labeled_data_cp = labeled_data.copy()

labeled_data_cp['Customer Status'][labeled_data_cp['Customer Status'] == 'Joined'] = 'Stayed'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_data_cp['Customer Status'][labeled_data_cp['Customer Status'] == 'Joined'] = 'Stayed'


In [11]:
categorical_column_list = []
for i in range(len(labeled_data_cp.columns)):
    if labeled_data_cp[labeled_data_cp.columns[i]].dtype == 'object':
        if len(labeled_data_cp[labeled_data_cp.columns[i]].unique()) <= 100 and len(labeled_data_cp[labeled_data_cp.columns[i]].unique()) != 1:
        # if len(labeled_data_cp[labeled_data_cp.columns[i]].unique()) != len(labeled_data_cp):
            categorical_column_list.append(labeled_data_cp.columns[i])

In [28]:
chi_result = pd.DataFrame(columns=['Column', 'Chi-SQ', 'P-value(0.05)', 'H0/H1', 'Corr'])  # 컬럼명, 카이제곱값, p값, 독립/상관 여부, 상관계수
# 카이제곱값이 클수록 예측결과와 실제 결과가 다름을 의미하며, 유의미한 관련성을 가짐
# alpha는 0.05로 작성

for col_name in categorical_column_list:
    ct_result = pd.crosstab(labeled_data_cp[str(col_name)], labeled_data_cp['Customer Status'])

    chi, p, dof, expected = chi2_contingency(ct_result)

    # 유의수준 0.05, 5%이내
    if p < 0.05:
        R_h0 = '상관'
    else:
        R_h0 = '독립'
    
    if len(ct_result) > 2:  # crosstab 결과로, 어느 축이든 변수가 2개 이상이면 Cramer-V 로 상관 계수를 구함
        corr_value = association(ct_result, method='cramer')
    else:  # x/y 축 모두 변수가 2개이면, phi corr로 상관 계수를 구함
        corr_value = labeled_data_cp[(str(col_name)+'_label')].corr(labeled_data_cp['Churn Label_label'])
    
    chi_result.loc[len(chi_result)] = [col_name, chi, p, R_h0, corr_value]


  value = phi2 / min(n_cols - 1, n_rows - 1)
  value = phi2 / min(n_cols - 1, n_rows - 1)


In [30]:
chi_result.sort_values('Corr', ascending=False).reset_index(drop=True)

Unnamed: 0,Column,Chi-SQ,P-value(0.05),H0/H1,Corr
0,Churn Label,7037.871379,0.0,상관,1.0
1,Contract,1445.293243,0.0,상관,0.453001
2,Internet Type,653.832054,2.15035e-141,상관,0.304687
3,Offer,481.725489,7.020357e-102,상관,0.26153
4,Internet Service,364.519799,2.9203e-81,상관,0.22789
5,Payment Method,337.831161,4.374168e-74,상관,0.219014
6,Paperless Billing,258.277649,4.0733549999999996e-58,상관,0.191825
7,Unlimited Data,194.549217,3.231434e-44,상관,0.166545
8,Senior Citizen,159.4263,1.5100669999999998e-36,상관,0.150889
9,Streaming TV,27.862522,1.302484e-07,상관,0.063228


### Point biserial correlation :: Categorical - Numeric

In [14]:
Numeric_column_list = []
for i in range(len(labeled_data_cp.columns)):
    if labeled_data_cp[labeled_data_cp.columns[i]].dtype == 'float64' or labeled_data_cp[labeled_data_cp.columns[i]].dtype == 'int64':
        # if len(labeled_data_cp[labeled_data_cp.columns[i]].unique()) <= 100 and len(labeled_data_cp[labeled_data_cp.columns[i]].unique()) != 1:
        # if len(labeled_data_cp[labeled_data_cp.columns[i]].unique()) != len(labeled_data_cp):
            Numeric_column_list.append(labeled_data_cp.columns[i])

In [15]:
new_ncl = Numeric_column_list.copy()
labeled_column = []

for col_name in new_ncl:
    if '_label' in col_name:
        labeled_column.append(col_name)  # *_label column 제거 전 리스트화

for target_col in labeled_column:
    if target_col in new_ncl:
        new_ncl.remove(target_col)

# new_ncl

In [16]:
ttest_result = pd.DataFrame(columns=['Column', 'PB-correlation', 'P-value(0.05)', 'H0/H1'])  # 컬럼명, 상관계수, p값, 독립/상관

for col_name in new_ncl:
    ttest, p = pointbiserialr(labeled_data_cp['Churn Label_label'], labeled_data_cp[col_name])

    if p < 0.05:
        R_h0 = '상관'
    else:
        R_h0 = '독립'

    ttest_result.loc[len(ttest_result)] = [col_name, ttest, p, R_h0]



In [24]:
ttest_result.sort_values('PB-correlation', ascending=False).reset_index(drop=True)

Unnamed: 0,Column,PB-correlation,P-value(0.05),H0/H1
0,Churn Value,1.0,0.0,상관
1,Churn Score,0.660772,0.0,상관
2,Monthly Charge,0.193356,2.706646e-60,상관
3,Age,0.11576,1.9204970000000001e-22,상관
4,Avg Monthly GB Download,0.048868,4.081545e-05,상관
5,Longitude,0.024052,0.04354771,상관
6,Avg Monthly Long Distance Charges,0.00812,0.4956803,독립
7,Total Extra Data Charges,0.007139,0.5491321,독립
8,Zip Code,-0.016289,0.1716809,독립
9,Total Refunds,-0.033709,0.004665343,상관


## Result
- Categorical Columns correlation, Chi-Square Test

|Rank| Column | Chi-SQ                 | P-value(0.05) | H0/H1     |
|--------|------------------------|---------------|-----------|----|
| 0      | Customer Status        | 7037.871379   | 0.00E+00  | 상관 |
| 1      | Contract               | 1445.293243   | 0.00E+00  | 상관 |
| 2      | Churn Label            | 7037.871379   | 0.00E+00  | 상관 |
| 3      | Internet Type          | 653.832054    | 2.15E-141 | 상관 |
| 4      | Offer                  | 481.725489    | 7.02E-102 | 상관 |
| 5      | Dependents             | 433.734379    | 2.50E-96  | 상관 |
| 6      | Internet Service       | 364.519799    | 2.92E-81  | 상관 |
| 7      | Payment Method         | 337.831161    | 4.37E-74  | 상관 |
| 8      | Paperless Billing      | 258.277649    | 4.07E-58  | 상관 |
| 9      | Online Security        | 205.633104    | 1.23E-46  | 상관 |
| 10     | Unlimited Data         | 194.549217    | 3.23E-44  | 상관 |
| 11     | Premium Tech Support   | 190.166842    | 2.92E-43  | 상관 |
| 12     | Senior Citizen         | 159.4263      | 1.51E-36  | 상관 |
| 13     | Married                | 158.733382    | 2.14E-36  | 상관 |
| 14     | Referred a Friend      | 155.940342    | 8.72E-36  | 상관 |
| 15     | Online Backup          | 47.260854     | 6.21E-12  | 상관 |
| 16     | Device Protection Plan | 30.513395     | 3.32E-08  | 상관 |
| 17     | Streaming TV           | 27.862522     | 1.30E-07  | 상관 |
| 18     | Streaming Movies       | 26.251336     | 3.00E-07  | 상관 |
| 19     | Under 30               | 20.690894     | 5.40E-06  | 상관 |
| 20     | Streaming Music        | 14.42146      | 1.46E-04  | 상관 |
| 21     | Multiple Lines         | 11.143251     | 8.43E-04  | 상관 |


- Numeric Columns correlation, T-test

|Rank|Column|PB-correlation|P-value(0.05)|H0/H1|
|---|---|---|---|---|
|0|Churn Value|1.000000|0.000000e+00|상관|
|1|Satisfaction Score|-0.754649|0.000000e+00|상관|
|2|Churn Score|0.660772|0.000000e+00|상관|
|3|Tenure in Months|-0.352861|1.329222e-205|상관|
|4|Number of Referrals|-0.286540|3.406199e-133|상관|
|5|Total Long Distance Charges|-0.223756|1.247019e-80|상관|
|6|Total Revenue|-0.223003|4.351662e-80|상관|
|7|Number of Dependents|-0.218780|4.420027e-77|상관|
|8|Total Charges|-0.198546|1.538899e-63|상관|
|9|Monthly Charge|0.193356|2.706646e-60|상관|
|10|CLTV|-0.127463|6.679131e-27|상관|
|11|Age|0.115760|1.920497e-22|상관|
|12|Avg Monthly GB Download|0.048868|4.081545e-05|상관|
|13|Latitude|-0.041546|4.874845e-04|상관|
|14|Total Refunds|-0.033709|4.665343e-03|상관|
|15|Longitude|0.024052|4.354771e-02|상관|


### 개인 확인용

In [18]:
test= pointbiserialr(labeled_data_cp['Churn Label_label'], labeled_data_cp['Age'])
test

SignificanceResult(statistic=0.11575973716172766, pvalue=1.9204974701201783e-22)

In [19]:
test = pd.crosstab(labeled_data_cp['Offer'], labeled_data_cp['Customer Status'])
test

Customer Status,Churned,Stayed
Offer,Unnamed: 1_level_1,Unnamed: 2_level_1
,1051,2826
Offer A,35,485
Offer B,101,723
Offer C,95,320
Offer D,161,441
Offer E,426,379


In [20]:
chi, p, dof, expected = chi2_contingency(test)

In [21]:
print(f"chi 스퀘어 값: {chi}\n",
      f"p-value (0.05): {p}\n",
      f"자유도 수: {dof}\n",
      f"기대값: \n{pd.DataFrame(expected)}\n"
      f"\n실제값: \n{test}")

chi 스퀘어 값: 481.7254889632278
 p-value (0.05): 7.020357379958891e-102
 자유도 수: 5
 기대값: 
             0            1
0  1028.838989  2848.161011
1   137.992333   382.007667
2   218.664774   605.335226
3   110.128496   304.871504
4   159.752662   442.247338
5   213.622746   591.377254

실제값: 
Customer Status  Churned  Stayed
Offer                           
None                1051    2826
Offer A               35     485
Offer B              101     723
Offer C               95     320
Offer D              161     441
Offer E              426     379
