## Import Library

In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import LabelBinarizer,LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from lightgbm import LGBMClassifier
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')
pd.set_option('Display.max_columns',100)
pd.set_option('Display.max_rows',100)

## Import Dataset

In [75]:
path    = "../final_assignment_public/Telecom_customer_churn.csv"
df      = pd.read_csv( path )

print( 'df:\n{} rows × {} columns\n'.format( df.shape[0], df.shape[1] ) )

df:
100000 rows × 100 columns



In [76]:
path = '../csv/f_imp_core.csv'
df_feature = pd.read_csv( path, index_col=0 )

print( 'df_feature:\n{} rows × {} columns\n'.format( df_feature.shape[0], df_feature.shape[1] ) )

df_feature:
22 rows × 1 columns



In [77]:
# 重要度が高い22種類のカラムだけ取得
df_core = df[df_feature['feature']]
df_core['churn'] = df['churn']

In [81]:
type( df_core['eqpdays']

pandas.core.frame.DataFrame

## Data Preprocessing

### Categorical data

In [None]:
# カテゴリカル変数を表示
x_cat = df_core[df_core.select_dtypes( include=object ).columns].columns

# カテゴリカル変数を落とす(理解不能なため)
df_core = df_core.drop( columns=x_cat )

### Numerical data

In [None]:
def check_null( _df ):
    print( _df.isnull().sum() )

In [None]:
# 欠損が多いカラムを落とす
df_core = df_core.drop( columns='lor' )

In [None]:
# 不要なカラム(groupbyの結果，解約者と非解約者で差がないデータ)を落とす
df_core = df_core[ ['churn', 'eqpdays', 'change_mou', 'hnd_price'] ]

In [None]:
# 欠損値を埋める
df_core['eqpdays']      = df_core['eqpdays'].fillna( df_core['eqpdays'].median() )
df_core['hnd_price']    = df_core['hnd_price'].fillna( df_core['hnd_price'].median() )
df_core['change_mou']   = df_core['change_mou'].fillna( df_core['change_mou'].median() )

check_null( df_core )

## EDA

In [None]:
df_core['churn'].value_counts()

In [None]:
def plot_churn_ratio():
    plt.rcParams["figure.figsize"] = (7, 7)
    plt.pie( df_core['churn'].value_counts(), counterclock=True, startangle=90, colors=['tab:blue','tab:orange'])
    plt.show()

In [None]:
plot_churn_ratio()

### 解約者の特徴分析をおこなうために手がかりとする3種類のデータ
|  重要度  |  カラム名  |  内容  |  分類  |
| ---- | ---- | ---- | ---- |
|  1  |  eqpdays  |  現在の端末の使用日数  |  使用日数  |
|  2  |  change_mou  |  過去3か月の毎月の平均使用時間の変化率(%)  |  使用時間  |
|  3  |  hnd_price  |  現在の携帯電話の料金  |  料金  |

### 解約者の特徴を分析する

In [None]:
# 解約者と非解約者で，明らかに差がありそうなデータを探す
for column_name in df_core.columns:
    print( '{}\n'.format( df_core.groupby('churn')[column_name].mean() ) )

In [None]:
# 解約者と非解約者で，明らかに差がありそうなデータを探す
for column_name in df_core.columns:
    print( '{}\n'.format( df_core.groupby('churn')[column_name].median() ) )

In [None]:
# 解約者と非解約者で，明らかに差がありそうなデータを探す
for column_name in df_core.columns:
    print( '{}\n'.format( df_core.groupby('churn')[column_name].std() ) )

In [None]:
def plot( _y, _ylabel ):
    plt.rcParams["figure.figsize"] = (10, 10)
    # plt.title( 'Cumulative Feature Importance', fontsize=24 )

    # parameters
    x = [0, 1]
    bar_width = 0.5
    line_width = 0.5

    plt.bar( x[0], _y[0], bar_width, edgecolor='black', linewidth=line_width )
    plt.bar( x[1], _y[1], bar_width, edgecolor='black', linewidth=line_width )

    plt.xticks( [0, 1], fontsize=20 )
    plt.yticks( fontsize=20 )
    plt.xlabel( 'churn', fontsize=24 )
    plt.ylabel( _ylabel, fontsize=24 )
    plt.show()

#### 1. eqpday(現在の端末の使用日数)

In [None]:
plot( df_core.groupby('churn')['eqpdays'].mean(), 'Mean of eqpdays' )

#### 2. change_mou(過去3か月の月間平均使用時間の変化率(%))

In [None]:
# 平均と標準偏差を計算
change_mou_mean = df_core['change_mou'].mean()
change_mou_std = df_core['change_mou'].std()

# 閾値を計算
change_mou_th_upper = change_mou_mean + 2*change_mou_std
change_mou_th_lower = change_mou_mean - 2*change_mou_std

# 外れ値を除去
df_change_mou = df_core[ (df_core['change_mou'] < change_mou_th_upper) & (df_core['change_mou'] > change_mou_th_lower) ]
df_change_mou = df_change_mou[ ['change_mou', 'churn'] ]

In [None]:
# plot( df_core.groupby('churn')['change_mou'].mean(), 'Mean of change_mou(%)' )
plot( df_change_mou.groupby('churn')['change_mou'].mean(), 'Mean of change_mou(%)' )

#### 3. hnd_price(現在の携帯電話の価格)

In [None]:
plot( df_core.groupby('churn')['hnd_price'].mean(), 'Mean of hnd_price' )

## Export to csv file

In [None]:
df_core.to_csv( '../csv/df_core.csv' )

## 3つの特徴量について，非解約者の分布を確認

In [None]:
df_churn0 = df_core[df_core['churn'] == 0]
df_churn1 = df_core[df_core['churn'] == 1]

In [None]:
def plot_dist( _df, _feature, _bins=100 ):
    plt.rcParams["figure.figsize"] = (10, 10)

    sns.distplot( _df[_feature], kde=True, bins=_bins, color='#1e77b4' )

In [None]:
def plot_2dist( _df0, _df1,  _feature, _bins=100, _kde=True ):
    plt.rcParams["figure.figsize"] = (10, 10)

    sns.distplot( _df0[_feature], kde=_kde, bins=_bins, color='#1e77b4' )
    sns.distplot( _df1[_feature], kde=_kde, bins=_bins, color='#f97f10' )
    # sns.distplot( _df0[_feature], kde=_kde, color='#1e77b4' )
    # sns.distplot( _df1[_feature], kde=_kde, color='#f97f10' )

### eqpdays

In [None]:
plot_2dist( df_churn0, df_churn1, 'eqpdays', 200 )

### hnd_price

In [None]:
def plot_box( _df, _feature ):
    plt.rcParams["figure.figsize"] = (10, 5)

    sns.boxplot( _df[_feature], color='tab:blue', linewidth=2 )

In [None]:
# plot_dist( df_churn0, 'hnd_price', 30 )

display( df_churn0['hnd_price'].describe() )
plot_box( df_churn0, 'hnd_price' )

In [None]:
plot_2dist( df_churn0, df_churn1, 'hnd_price', 100 )

### change_mou

In [None]:
df_change_mou_churn0 = df_change_mou[df_change_mou['churn'] == 0]
df_change_mou_churn1 = df_change_mou[df_change_mou['churn'] == 1]

In [None]:
plot_2dist( df_change_mou_churn0, df_change_mou_churn1, 'change_mou', 100, True )

In [None]:
# plot_box( df_change_mou_churn0, 'change_mou' )
# display( df_change_mou_churn0['change_mou'].describe() )

plot_box( df_churn0, 'change_mou' )
display( df_churn0['change_mou'].describe() )

### change_rev

In [None]:
df_change_rev_churn0 = df_change_rev[df_change_rev['churn'] == 0]
df_change_rev_churn1 = df_change_rev[df_change_rev['churn'] == 1]

In [None]:
plot_2dist( df_change_rev_churn0, df_change_rev_churn1, 'change_rev', 80, False )

In [None]:
plot_box( df_change_rev_churn0, 'change_rev' )
display( df_change_rev_churn0['change_rev'].describe() )

In [None]:
df_churn0_tmp = df[df['churn'] == 0]
df_churn1_tmp = df[df['churn'] == 1]

plot_box( df_churn0_tmp, 'rev_Mean' )
display( df_churn0_tmp['rev_Mean'].describe() )

In [None]:
df_core

In [None]:
def pair_plot( _df ):
    plt.rcParams["figure.figsize"] = (20, 20)
    # sns.pairplot( _df, hue='churn', vars=['eqpdays'] )
    sns.pairplot( _df, hue='churn' )

In [None]:
pair_plot( df_core[:1000] )

### change_mouとchange_revの相関を可視化

In [None]:
# 外れ値を除去
df_core['change_mou'].idxmax()
df_core.iloc[39170]
df_core = df_core.drop( 39170 )

In [None]:
# 相関係数を計算
df_core.corr()

display( df_churn0.corr(), df_churn1.corr() )

In [None]:
def scatter( _df, _x, _y ):
    plt.rcParams["figure.figsize"] = (10, 10)
    # _df.plot( kind='scatter', x='change_mou', y='change_rev', c='churn', cmap='viridis' )
    _df.plot( kind='scatter', x=_x, y=_y, c='black', alpha=0.1, s=5, xlim=[-2000, 2000], ylim=[-1000, 1000] )

In [None]:
# scatter( df_core, 'eqpdays', 'hnd_price' )
scatter( df_core, 'change_mou', 'change_rev' )

In [None]:
def lmplot():
    sns.lmplot( data=df_core, x='change_mou', y='change_rev', hue='churn', col='churn', height=7, aspect=1 )
    # plt.savefig( '../figures/corr_mou-rev.png' )

In [None]:
lmplot()