## 預測金融客戶是否流失

### 讀取資料

In [1]:
import pandas

# Location of the churn dataset CSV (raw file hosted on GitHub).
DATA_URL = 'https://raw.githubusercontent.com/ywchiu/tibamedl/master/Data/Churn_Modelling.csv'

# Read the CSV, using its first column as the row index, and preview 3 rows.
df = pandas.read_csv(DATA_URL, index_col=0)
df.head(3)

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [2]:
# Print column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 13 columns):
CustomerId         10000 non-null int64
Surname            10000 non-null object
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             10000 non-null int64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(2), int64(8), object(3)
memory usage: 1.1+ MB


### 資料預處理

In [3]:
# Separate the target from the features: pop() returns the 'Exited' column
# and removes it from df in the same step.
y = df.pop('Exited')

In [4]:
# Remove the 'Surname' column from the feature frame (df is modified in place).
df.drop(columns=['Surname'], inplace=True)

In [5]:
# Remove the 'CustomerId' column from the feature frame (df is modified in place).
df.drop(columns=['CustomerId'], inplace=True)

In [6]:
# List the distinct values of the categorical 'Geography' column before encoding it.
df['Geography'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [7]:
# List the distinct values of the categorical 'Gender' column before encoding it.
df['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [8]:
# One-hot encode Geography as integer 0/1 indicator columns.
# dtype=int keeps the indicators numeric on every pandas version (newer
# releases default to bool, which would not match the 0/1 values expected
# downstream).
# The 'France' column is dropped so that France is the baseline category,
# represented by Germany == 0 and Spain == 0.
geo = pandas.get_dummies(df['Geography'], dtype=int).drop(columns=['France'])
geo.head()

Unnamed: 0_level_0,Germany,Spain
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,0
2,0,1
3,0,0
4,0,0
5,0,1


In [9]:
# One-hot encode Gender as an integer 0/1 indicator column.
# dtype=int keeps the indicator numeric on every pandas version (newer
# releases default to bool, which would not match the 0/1 values expected
# downstream).
# The 'Female' column is dropped so that Female is the baseline category,
# represented by Male == 0.
gender = pandas.get_dummies(df['Gender'], dtype=int).drop(columns=['Female'])
gender.head()

Unnamed: 0_level_0,Male
RowNumber,Unnamed: 1_level_1
1,0
2,0
3,0
4,0
5,0


In [10]:
# Build the model input table: df without its two raw categorical columns,
# joined column-wise with their dummy encodings.
# drop() returns a new frame and leaves df untouched, so this cell can be
# re-run without raising KeyError on columns that were already removed.
base_features = df.drop(columns=['Geography', 'Gender'])
customer = pandas.concat([base_features, geo, gender], axis=1)
customer.head()

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Germany,Spain,Male
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,619,42,2,0.0,1,1,1,101348.88,0,0,0
2,608,41,1,83807.86,1,0,1,112542.58,0,1,0
3,502,42,8,159660.8,3,1,0,113931.57,0,0,0
4,699,39,1,0.0,2,0,0,93826.63,0,0,0
5,850,43,2,125510.82,1,1,1,79084.1,0,1,0


In [11]:
# Convert the feature table to a plain NumPy array for scikit-learn / Keras.
X = customer.to_numpy()

In [13]:
# Sanity check: (number of rows, number of feature columns).
X.shape

(10000, 11)

In [15]:
# Sanity check: the target must have one entry per row of X.
y.shape

(10000,)

### 資料標準化

In [26]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column (StandardScaler removes the mean and
# scales to unit variance). fit() learns the per-column statistics from X,
# and transform() applies them; X is then replaced by its scaled version.
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)

In [27]:
# Display the scaled feature array (NumPy abbreviates large arrays in the repr).
X

array([[-0.32622142,  0.29351742, -1.04175968, ..., -0.57873591,
        -0.57380915, -1.09598752],
       [-0.44003595,  0.19816383, -1.38753759, ..., -0.57873591,
         1.74273971, -1.09598752],
       [-1.53679418,  0.29351742,  1.03290776, ..., -0.57873591,
        -0.57380915, -1.09598752],
       ...,
       [ 0.60498839, -0.27860412,  0.68712986, ..., -0.57873591,
        -0.57380915, -1.09598752],
       [ 1.25683526,  0.29351742, -0.69598177, ...,  1.72790383,
        -0.57380915,  0.91241915],
       [ 1.46377078, -1.04143285, -0.35020386, ..., -0.57873591,
        -0.57380915, -1.09598752]])

### 建構模型

In [28]:
from keras.layers import Dense, Activation
from keras.models   import Sequential

In [29]:
# Derive the input width from the data rather than hard-coding the column
# count, so the network stays in sync with the feature table.
n_features = X.shape[1]

# Fully connected binary classifier: two hidden ReLU layers of 6 units each,
# followed by a single sigmoid output unit.
model = Sequential([
    Dense(6, activation='relu', input_shape=(n_features,)),
    Dense(6, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [30]:
# Configure training: plain SGD optimizer, binary cross-entropy loss
# (matching the single sigmoid output), and accuracy reported per epoch.
model.compile(
    optimizer='SGD',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [32]:
# Training options: mini-batches of 100 rows, 10 passes over the data,
# with the per-epoch progress output enabled.
fit_options = dict(batch_size=100, epochs=10, verbose=1)

# Train on the full feature array; the returned object is kept in `history`.
history = model.fit(X, y, **fit_options)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### 使用模型做預測

注意：以下的預測與評估使用的是訓練模型時的同一份資料 `X`，因此所得到的準確率與混淆矩陣屬於訓練集（in-sample）表現，並不代表模型對未見過資料的預測能力。

In [33]:
# Run the trained network on X (the same array it was fitted on).
# The final layer is a sigmoid, so each output is a value between 0 and 1.
predicted = model.predict(X)

In [37]:
# Turn the model outputs into hard 0/1 labels: flatten to one dimension,
# then mark every value above the 0.5 cut-off as class 1.
predicted = (predicted.ravel() > 0.5).astype(int)

In [39]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Fraction of rows whose thresholded prediction equals the true label.
# NOTE: the model was fitted on this same X, so this is training-set
# accuracy, not an estimate of performance on unseen customers.
accuracy_score(y_true=y, y_pred=predicted)

0.864

In [40]:
# Confusion matrix of true vs. predicted labels; in scikit-learn's layout
# rows are the true classes and columns are the predicted classes.
confusion_matrix(y_true=y, y_pred=predicted)

array([[7662,  301],
       [1059,  978]], dtype=int64)