### 1. Importing Packages and Loading Data

In [None]:
import pandas as pd
from zipfile import ZipFile
import os
 
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import math

In [None]:
# Show the current working directory; the relative dataset path is resolved against it
os.getcwd()

In [None]:
# Relative path to the zipped dataset
file_path = './iranian+churn+dataset.zip'

# Print the names of the members stored in the archive
with ZipFile(file_path) as z:
    print(z.namelist())

In [None]:
# Read the first archive member into a DataFrame. The archive and the member
# stream are both opened in `with` blocks so each handle is closed afterwards,
# and the member list is read from an archive that is still open.
with ZipFile(file_path) as archive:
    filename = archive.namelist()[0]
    with archive.open(filename) as f:
        df = pd.read_csv(f)

#### 1.1 Basic Data Information

In [None]:
# Preview the first rows of the loaded frame
df.head()

In [None]:
# Dataset dimensions (rows, columns)
num_row, num_col = df.shape
print(f'num_row: {num_row}, num_col: {num_col}')

In [None]:
# Count of missing values per column (result: none found)
df.isna().sum()

In [None]:
# Data type of each column
df.dtypes

### 2. Splitting Data into Train and Test Sets
- The split is stratified on the target (`Churn`), so both sets keep the same class proportions.

In [None]:
# Separate the target (Series) from the predictors (DataFrame)
y = df['Churn']
X = df.drop('Churn', axis=1)

In [None]:
# Hold out 20% of the rows as a test set, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=312,
)

In [None]:
# Report the number of rows in each split
for split_name, split_frame in (('train', X_train), ('test', X_test)):
    print(f'# of {split_name} samples: {len(split_frame)}')

### 3. Exploratory Data Analysis
- on training samples only
- each row represents a customer

In [None]:
# Number of unique values in each predictor column
# (the basis for the categorical / continuous feature split in the next cell)
X_train.nunique()

In [None]:
# Predictors with fewer than 12 unique values are treated as categorical
categorical_features = ['Complains', 'Charge  Amount', 'Age Group', 'Tariff Plan', 'Status', 'Age']
# Every remaining column is continuous; the frame's column order is kept
continuous_features = X_train.columns[~X_train.columns.isin(categorical_features)].tolist()
continuous_features

In [None]:
# Number of unique values in the target column of the training split
y_train.nunique()

#### 3.1 Descriptive Statistics

In [None]:
# Summary statistics for the continuous features (training split only)
X_train[continuous_features].describe()

In [None]:
# Number of zero values in each continuous column
zero_counts = (X_train[continuous_features] == 0).sum()
for c, n_zero in zero_counts.items():
    print(f'{c}: {n_zero}')
# Notes:
# - 128 customers made no calls -> call duration 0, call count 0, and the number
#   of distinct called numbers is also 0.
# - 109 customers have a customer value of 0 -> are they a subset of the 128 with
#   no call records? If so, what about customers with no call records but a
#   non-zero customer value?

In [None]:
# Frequency table (count and percentage) for each categorical predictor
for c in categorical_features:
    counts = X_train[c].value_counts()
    shares = X_train[c].value_counts(normalize=True).mul(100).round(2)
    print(f"{c} {'=' * (20 - len(c))}")
    print(pd.DataFrame({'count': counts, 'percentage': shares}))
    print()
# Notes:
# - Age and Age Group have the same distribution; the only difference is whether
#   each category is represented by an absolute value or by a group index.
# - Customers without complaints make up the vast majority: imbalanced event.
# - In Tariff Plan, the usage-based plan far outnumbers the flat-rate plan:
#   imbalanced event.
# - In Status, about 1/4 of customers are non-active; what exactly does
#   non-active mean?
# - According to the metadata, Charge Amount ranges over 0~9, yet 5 rows have the
#   value 10. Are these outliers, or is it fine to keep them as they are?

In [None]:
# Class counts and percentages of the training target
target_counts = y_train.value_counts()
target_shares = y_train.value_counts(normalize=True).mul(100).round(2)
pd.DataFrame({'count': target_counts, 'percentage': target_shares})

#### 3.2 Data Visualization

##### 3.2.1 Distribution
- Histogram for continuous variables
- Count plot for discrete variables

In [None]:
def draw_histogram(col_array):
    """Plot a histogram of each listed ``X_train`` column on a 3-column grid.

    Parameters
    ----------
    col_array : sequence of str
        Names of the ``X_train`` columns to plot, one subplot per column.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when
        ``col_array`` is empty.
    """
    if len(col_array) == 0:
        return  # a grid with zero rows cannot be created

    n_cols = 3
    n_rows = math.ceil(len(col_array) / n_cols)

    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # row, so the same code path works for any number of columns.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    flat_axes = axes.ravel()  # row-major: position i is (i // n_cols, i % n_cols)

    for ax, col in zip(flat_axes, col_array):
        sns.histplot(data=X_train, x=col, ax=ax)

    # Remove the unused trailing axes of the last row.
    for ax in flat_axes[len(col_array):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

In [None]:
def draw_countplot(col_array):
    """Plot a count plot of each listed ``X_train`` column on a 3-column grid.

    Parameters
    ----------
    col_array : sequence of str
        Names of the ``X_train`` columns to plot, one subplot per column.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when
        ``col_array`` is empty.
    """
    if len(col_array) == 0:
        return  # a grid with zero rows cannot be created

    n_cols = 3
    n_rows = math.ceil(len(col_array) / n_cols)

    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # row, so the same code path works for any number of columns.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    flat_axes = axes.ravel()  # row-major: position i is (i // n_cols, i % n_cols)

    for ax, col in zip(flat_axes, col_array):
        sns.countplot(data=X_train, x=col, ax=ax, color='lightgreen')

    # Remove the unused trailing axes of the last row.
    for ax in flat_axes[len(col_array):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

In [None]:
def count_outlier(col):
    """Print how many ``X_train[col]`` values lie outside the 1.5 * IQR fences.

    Values below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` are counted.
    The result is printed as ``"<col>: <count>"``; nothing is returned.
    """
    values = X_train[col]
    q1, q3 = np.percentile(values, [25, 75])
    spread = q3 - q1
    below_fence = values < q1 - 1.5 * spread
    above_fence = values > q3 + 1.5 * spread
    n_outliers = len(values[below_fence | above_fence])
    print(f"{col}: {n_outliers}")

In [None]:
# Print the IQR-based outlier count of every continuous feature
for c in continuous_features:
    count_outlier(c)

In [None]:
# Histograms of every continuous feature
draw_histogram(continuous_features)

In [None]:
# Toy example, independent of the churn data: log-transform every column of a
# small frame while keeping the column names. pandas (pd) and numpy (np) are
# already imported in the first cell, so they are not imported again here.

# Original data
data = pd.DataFrame({
    'var1': [0, 1, 2, 3, 4],
    'var2': [1, 2, 3, 4, 5],
    'var3': [10, 20, 30, 40, 50]
})


def log_transform(x):
    """Return log(1 + x) element-wise; the +1 keeps zero inputs finite."""
    # log1p evaluates log(1 + x) directly and stays accurate for small x
    return np.log1p(x)


# Map each column name to its log-transformed values
transformed_data_dict = {col: log_transform(data[col]) for col in data.columns}

# Rebuild a DataFrame from the dictionary (column names are preserved)
transformed_data = pd.DataFrame(transformed_data_dict)

# Display the transformed DataFrame
print(transformed_data)

In [None]:
# Skewness of every continuous feature, listed from most to least skewed
# (ordered by absolute value)
from scipy.stats import skew

skew_rows = [(col, skew(X_train[col])) for col in continuous_features]
skewness = pd.DataFrame(skew_rows, columns=['col', 'skewness'])

skewness.sort_values(by='skewness', key=abs, ascending=False)

In [None]:
# Count plots of every categorical feature
draw_countplot(categorical_features)

#### 3.3 Correlation Analysis of continuous predictors

In [None]:
from scipy.stats import pearsonr

def calculate_correlations(df):
    """Compute the Pearson correlation of every unordered pair of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared pairwise with ``pearsonr``.

    Returns
    -------
    pandas.DataFrame
        One row per column pair with the columns 'Variable 1', 'Variable 2',
        'Correlation' and 'P-Value', sorted by 'Correlation' in descending
        order.
    """
    names = df.columns
    rows = []
    for pos, left in enumerate(names):
        # Pair each column only with the columns that come after it
        for right in names[pos + 1:]:
            r_value, p_value = pearsonr(df[left], df[right])
            rows.append((left, right, r_value, p_value))

    result = pd.DataFrame(rows, columns=['Variable 1', 'Variable 2', 'Correlation', 'P-Value'])
    return result.sort_values(by='Correlation', ascending=False)

In [None]:
corr_df = calculate_correlations(X_train[continuous_features])
# Keep pairs that are strongly correlated in either direction (|r| > 0.5) and
# significant at the 5% level. The absolute value is used so that a strong
# negative correlation is not silently dropped.
top_corr = corr_df[(corr_df['Correlation'].abs() > 0.5) & (corr_df['P-Value'] < 0.05)]
top_corr

In [None]:
# Columns whose strongly correlated partners are collected by get_rel_pair
col_list = ['Seconds of Use', 'Frequency of use', 'Frequency of SMS']

In [None]:
rel_pair = {}
def get_rel_pair(col_list):
    """Fill the module-level ``rel_pair`` dict from the pairs in ``top_corr``.

    Every name in ``col_list`` becomes a key of ``rel_pair`` with a fresh
    list. For each ``top_corr`` row that involves one of those names, the
    partner variable is appended to a key's list. A pair whose two variables
    are both keys is stored once only, under its 'Variable 1' entry.

    Nothing is returned; ``rel_pair`` is modified in place.
    """
    rel_pair.update({name: [] for name in col_list})

    for name in col_list:
        involves_name = (top_corr['Variable 1'] == name) | (top_corr['Variable 2'] == name)
        matched = top_corr.loc[involves_name, ['Variable 1', 'Variable 2']].to_numpy()
        for a, b in matched:
            a_is_key = a in rel_pair
            b_is_key = b in rel_pair
            if a_is_key and b_is_key:
                # Both are keys: record the pair once, unless either side already holds it
                if a not in rel_pair[b] and b not in rel_pair[a]:
                    rel_pair[a].append(b)
            elif a_is_key:
                if b not in rel_pair[a]:
                    rel_pair[a].append(b)
            elif b_is_key:
                if a not in rel_pair[b]:
                    rel_pair[b].append(a)

In [None]:
# Populate rel_pair for the selected columns, then display it
get_rel_pair(col_list)
rel_pair

In [None]:
def get_multi_pairplot(base, others):
    """Draw one regression plot of ``base`` against each variable in ``others``.

    Parameters
    ----------
    base : str
        ``X_train`` column placed on the x-axis of every subplot.
    others : sequence of str
        ``X_train`` columns placed on the y-axis, one subplot each.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when
        ``others`` is empty.
    """
    if len(others) == 0:
        return  # a grid with zero columns cannot be created

    # squeeze=False always yields a 2-D axes array, so a single subplot needs
    # no special case.
    fig, axes = plt.subplots(1, len(others), figsize=(4 * len(others), 4), squeeze=False)
    for ax, var in zip(axes[0], others):
        sns.regplot(x=base, y=var, data=X_train, ax=ax, scatter_kws={'alpha': 0.5}, line_kws={'color': 'red', 'linewidth': 2, 'linestyle': '--'})
        ax.set_title(f'{base} vs {var}')
    plt.tight_layout()
    plt.show()

In [None]:
# One row of regression plots per key in rel_pair: the key against each of its listed partners
for k in rel_pair:
    get_multi_pairplot(k, rel_pair[k])

In [None]:
# Annotated heatmap of the pairwise correlation matrix of the continuous features
corr = X_train[continuous_features].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of each continuous feature, computed against the
# other continuous features
vif_data = pd.DataFrame({
    'Variable': continuous_features,
    'VIF': [
        variance_inflation_factor(X_train[continuous_features], idx)
        for idx, _ in enumerate(continuous_features)
    ],
})

In [None]:
# Show the features ordered from highest to lowest VIF
vif_data.sort_values(by='VIF', ascending=False)

In [None]:
# Candidates for column removal: recompute the VIFs with
# 'Frequency of use' and 'Customer Value' left out
temp = [x for x in continuous_features if x not in ('Frequency of use', 'Customer Value')]
vif_data = pd.DataFrame({
    'Variable': temp,
    'VIF': [variance_inflation_factor(X_train[temp], idx) for idx, _ in enumerate(temp)],
})

In [None]:
# Display the VIF table that was recomputed without the removal candidates
vif_data

#### 3.4 Predictors vs Response
- heatmap for categorical variables
- scatter plot for continuous variables

In [None]:
def draw_heatmap(col_array):
    """Plot a heatmap of each listed ``X_train`` column cross-tabulated with ``y_train``.

    Parameters
    ----------
    col_array : sequence of str
        Names of the ``X_train`` columns to cross-tabulate against
        ``y_train``, one subplot per column on a 3-column grid.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when
        ``col_array`` is empty.
    """
    if len(col_array) == 0:
        return  # a grid with zero rows cannot be created

    n_cols = 3
    n_rows = math.ceil(len(col_array) / n_cols)

    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # row, so the same code path works for any number of columns.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    flat_axes = axes.ravel()  # row-major: position i is (i // n_cols, i % n_cols)

    for ax, col in zip(flat_axes, col_array):
        crosstab_result = pd.crosstab(X_train[col], y_train)
        # fmt='d' prints the cell counts as plain integers instead of the
        # default short format, which abbreviates large counts
        sns.heatmap(crosstab_result, annot=True, fmt='d', cmap='coolwarm', ax=ax)

    # Remove the unused trailing axes of the last row.
    for ax in flat_axes[len(col_array):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

In [None]:
def draw_scatter(col_array):
    """Plot a scatter of each listed ``X_train`` column against ``y_train``.

    Parameters
    ----------
    col_array : sequence of str
        Names of the ``X_train`` columns placed on the x-axis, one subplot
        per column on a 3-column grid; ``y_train`` is on the y-axis.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when
        ``col_array`` is empty.
    """
    if len(col_array) == 0:
        return  # a grid with zero rows cannot be created

    n_cols = 3
    n_rows = math.ceil(len(col_array) / n_cols)

    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # row, so the same code path works for any number of columns.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    flat_axes = axes.ravel()  # row-major: position i is (i // n_cols, i % n_cols)

    for ax, col in zip(flat_axes, col_array):
        sns.scatterplot(x=X_train[col], y=y_train, ax=ax)

    # Remove the unused trailing axes of the last row.
    for ax in flat_axes[len(col_array):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

In [None]:
draw_scatter(continuous_features) # no clear linear relationship is visible

In [None]:
draw_heatmap(categorical_features) # shows little beyond most samples falling in the majority class on both axes...