In [1]:
import math
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file

# For visualization
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
pd.set_option('display.max_rows', None)
df_ts = pd.read_csv('../data/BankChurners.csv')
df_ts.head()

Unnamed: 0,CustomerId,Geography,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditLevel
0,15762418,Spain,3,121681.82,1,1,0,128643.35,1,8
1,15749905,Spain,6,0.0,1,1,0,50213.81,1,7
2,15600911,France,2,182888.08,1,1,0,3061.0,0,7
3,15572762,Germany,2,102278.79,2,1,0,89822.48,0,2
4,15627848,France,7,109346.13,2,1,0,102665.92,0,7


In [3]:
drop_list = ['CustomerId']
df_ts = df_ts.drop(drop_list, axis = 1)
df_ts.head()

Unnamed: 0,Geography,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditLevel
0,Spain,3,121681.82,1,1,0,128643.35,1,8
1,Spain,6,0.0,1,1,0,50213.81,1,7
2,France,2,182888.08,1,1,0,3061.0,0,7
3,Germany,2,102278.79,2,1,0,89822.48,0,2
4,France,7,109346.13,2,1,0,102665.92,0,7


In [4]:
columns = list(df_ts.columns)
print(columns)

['Geography', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited', 'CreditLevel']


In [5]:
non_normalization_list = ['Geography', 'CreditLevel']

for column in columns:
    if column not in non_normalization_list:
        print(str(column), " xmin: ", df_ts[column].min() , " xmax: ", df_ts[column].max() )

Tenure  xmin:  0  xmax:  10
Balance  xmin:  0.0  xmax:  250898.09
NumOfProducts  xmin:  1  xmax:  4
HasCrCard  xmin:  0  xmax:  1
IsActiveMember  xmin:  0  xmax:  1
EstimatedSalary  xmin:  11.58  xmax:  199970.74
Exited  xmin:  0  xmax:  1


In [6]:
def normalization(x):
    result = (x-x.min())/(x.max()-x.min())
    return result

In [7]:
for column in columns:
    if column not in non_normalization_list:
        df_ts[column] = normalization(df_ts[column])
        
# df_ts['CreditLevel'] = normalization(df_ts['CreditLevel'], 0, 10)

In [8]:
df_ts.head()

Unnamed: 0,Geography,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditLevel
0,Spain,0.3,0.484985,0.0,1.0,0.0,0.64329,1.0,8
1,Spain,0.6,0.0,0.0,1.0,0.0,0.251062,1.0,7
2,France,0.2,0.728934,0.0,1.0,0.0,0.01525,0.0,7
3,Germany,0.2,0.407651,0.333333,1.0,0.0,0.449146,0.0,2
4,France,0.7,0.435819,0.333333,1.0,0.0,0.513377,0.0,7


In [9]:
countries = df_ts["Geography"].unique()

In [10]:
print(countries)

['Spain' 'France' 'Germany']


In [11]:
for country in countries:
    count = df_ts [ df_ts["Geography"]== country ].shape[0]
    print(country , " : ", count)

Spain  :  2253
France  :  4510
Germany  :  2237


In [12]:
credit_lvs = [1,2,3,4,5,6,7,8,9,10]

for lv in credit_lvs:
    count = df_ts [ df_ts["CreditLevel"]== lv ].shape[0]
    print(lv , " : ", count)

1  :  12
2  :  98
3  :  383
4  :  895
5  :  1425
6  :  1883
7  :  1899
8  :  1309
9  :  717
10  :  379


In [13]:
geo = pd.get_dummies(df_ts["Geography"])
df_ts.drop(["Geography"],axis=1,inplace=True)
df_ts = pd.concat([df_ts,geo], axis=1)#, join="inner"
df_creditlevel = df_ts.pop('CreditLevel') # remove column of label and store it in df1
df_ts['CreditLevel']= df_creditlevel

In [14]:
df_ts.head()

Unnamed: 0,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain,CreditLevel
0,0.3,0.484985,0.0,1.0,0.0,0.64329,1.0,0,0,1,8
1,0.6,0.0,0.0,1.0,0.0,0.251062,1.0,0,0,1,7
2,0.2,0.728934,0.0,1.0,0.0,0.01525,0.0,1,0,0,7
3,0.2,0.407651,0.333333,1.0,0.0,0.449146,0.0,0,1,0,2
4,0.7,0.435819,0.333333,1.0,0.0,0.513377,0.0,1,0,0,7


In [15]:
df_ts.to_csv('../data/BankChurners_normalized.csv', index=False)

In [16]:
def standardization(x):
    result = (x-x.mean())/x.var()
    return result

In [17]:
for column in columns:
    if column not in non_normalization_list:
        df_ts[column] = standardization(df_ts[column])

In [18]:
df_ts.head()

Unnamed: 0,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain,CreditLevel
0,-2.396864,2.930924,-4.730419,1.414715,-2.055505,1.734639,4.864324,0,0,1,8
1,1.169701,-4.905018,-4.730419,1.414715,-2.055505,-2.993378,4.864324,0,0,1,7
2,-3.58572,6.872423,-4.730419,1.414715,-2.055505,-5.83592,-1.258601,1,0,0,7
3,-3.58572,1.681428,4.165062,1.414715,-2.055505,-0.605623,-1.258601,0,1,0,2
4,2.358557,2.136543,4.165062,1.414715,-2.055505,0.168626,-1.258601,1,0,0,7


In [19]:
df_ts.to_csv('../data/BankChurners_normalized_standardized.csv', index=False)

In [20]:
credit_list = [1,2,3,4]

df_new = df_ts.copy()
for credit in credit_list:
    df_new.loc[df_new['CreditLevel']==credit, 'CreditLevel'] = 1

credit_list = [9,10]
for credit in credit_list:
    df_new.loc[df_new['CreditLevel']==credit, 'CreditLevel'] = 99
    
old_label = [5,6,7,8,99]
new_label = [2,3,4,5,6]

for i in range(len(old_label)):
    df_new.loc[df_new['CreditLevel']==old_label[i], 'CreditLevel'] = new_label[i]

df_new.head()

Unnamed: 0,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain,CreditLevel
0,-2.396864,2.930924,-4.730419,1.414715,-2.055505,1.734639,4.864324,0,0,1,5
1,1.169701,-4.905018,-4.730419,1.414715,-2.055505,-2.993378,4.864324,0,0,1,4
2,-3.58572,6.872423,-4.730419,1.414715,-2.055505,-5.83592,-1.258601,1,0,0,4
3,-3.58572,1.681428,4.165062,1.414715,-2.055505,-0.605623,-1.258601,0,1,0,1
4,2.358557,2.136543,4.165062,1.414715,-2.055505,0.168626,-1.258601,1,0,0,4


In [21]:
df_new.to_csv('../data/BankChurners_normalized_standardized_combined_6.csv', index=False)

In [22]:
df_new.head()

Unnamed: 0,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain,CreditLevel
0,-2.396864,2.930924,-4.730419,1.414715,-2.055505,1.734639,4.864324,0,0,1,5
1,1.169701,-4.905018,-4.730419,1.414715,-2.055505,-2.993378,4.864324,0,0,1,4
2,-3.58572,6.872423,-4.730419,1.414715,-2.055505,-5.83592,-1.258601,1,0,0,4
3,-3.58572,1.681428,4.165062,1.414715,-2.055505,-0.605623,-1.258601,0,1,0,1
4,2.358557,2.136543,4.165062,1.414715,-2.055505,0.168626,-1.258601,1,0,0,4


In [23]:
credit_lvs = [1,2,3,4,5,6,7,8,9,10]

for lv in credit_lvs:
    count = df_new [ df_new["CreditLevel"]== lv ].shape[0]
    print(lv , " : ", count)

1  :  1388
2  :  1425
3  :  1883
4  :  1899
5  :  1309
6  :  1096
7  :  0
8  :  0
9  :  0
10  :  0


In [24]:
credit_list = [1,2,3,4]

df_new = df_ts.copy()
for credit in credit_list:
    df_new.loc[df_new['CreditLevel']==credit, 'CreditLevel'] = 1

credit_list = [9,10]

for credit in credit_list:
    df_new.loc[df_new['CreditLevel']==credit, 'CreditLevel'] = 1
    
old_label = [5,6,7,8]
new_label = [2,3,4,5]

for i in range(len(old_label)):
    df_new.loc[df_new['CreditLevel']==old_label[i], 'CreditLevel'] = new_label[i]


In [25]:
df_new.to_csv('../data/BankChurners_normalized_standardized_combined_5.csv', index=False)

In [26]:
df_new.head()

Unnamed: 0,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain,CreditLevel
0,-2.396864,2.930924,-4.730419,1.414715,-2.055505,1.734639,4.864324,0,0,1,5
1,1.169701,-4.905018,-4.730419,1.414715,-2.055505,-2.993378,4.864324,0,0,1,4
2,-3.58572,6.872423,-4.730419,1.414715,-2.055505,-5.83592,-1.258601,1,0,0,4
3,-3.58572,1.681428,4.165062,1.414715,-2.055505,-0.605623,-1.258601,0,1,0,1
4,2.358557,2.136543,4.165062,1.414715,-2.055505,0.168626,-1.258601,1,0,0,4


In [27]:
credit_lvs = [1,2,3,4,5,6,7,8,9,10]

for lv in credit_lvs:
    count = df_new [ df_new["CreditLevel"]== lv ].shape[0]
    print(lv , " : ", count)

1  :  2484
2  :  1425
3  :  1883
4  :  1899
5  :  1309
6  :  0
7  :  0
8  :  0
9  :  0
10  :  0


In [28]:
credit_list = [1,2,3,4,9,10]

df_new = df_ts.copy()
for credit in credit_list:
    df_new = df_new[df_new["CreditLevel"] != credit]

old_label = [5,6,7,8]
new_label = [1,2,3,4]

for i in range(len(old_label)):
    df_new.loc[df_new['CreditLevel']==old_label[i], 'CreditLevel'] = new_label[i]

In [29]:
df_new.to_csv('../data/BankChurners_normalized_standardized_combined_drop5.csv', index=False)

In [30]:
df_new.head()

Unnamed: 0,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain,CreditLevel
0,-2.396864,2.930924,-4.730419,1.414715,-2.055505,1.734639,4.864324,0,0,1,4
1,1.169701,-4.905018,-4.730419,1.414715,-2.055505,-2.993378,4.864324,0,0,1,3
2,-3.58572,6.872423,-4.730419,1.414715,-2.055505,-5.83592,-1.258601,1,0,0,3
4,2.358557,2.136543,4.165062,1.414715,-2.055505,0.168626,-1.258601,1,0,0,3
5,3.547412,3.463009,4.165062,-3.410004,-2.055505,5.970857,-1.258601,1,0,0,2


In [31]:
credit_lvs = [1,2,3,4,5,6,7,8,9,10]

for lv in credit_lvs:
    count = df_new [ df_new["CreditLevel"]== lv ].shape[0]
    print(lv , " : ", count)
    

1  :  1425
2  :  1883
3  :  1899
4  :  1309
5  :  0
6  :  0
7  :  0
8  :  0
9  :  0
10  :  0


In [32]:
df_new = df_ts.copy()
df_new = df_new.sort_values(by=['CreditLevel'])
df_new = df_new.reset_index(drop=True)

In [33]:
labels = list( df_new["CreditLevel"].unique() )

label_num_dict = {}

for label in labels:
    count = df_new [ df_new["CreditLevel"]== label ].shape[0]
    label_num_dict[label] = count
    
print(label_num_dict)

{1: 12, 2: 98, 3: 383, 4: 895, 5: 1425, 6: 1883, 7: 1899, 8: 1309, 9: 717, 10: 379}


In [34]:
label_id_dict = {}
count = 0

for k , v in label_num_dict.items():
    id_list = []
    for i in range(v):
        id_list.append(count)
        count+=1
    label_id_dict[k] = id_list

In [35]:
alpha = 30
user_num = 2
partitions = np.random.dirichlet(np.repeat(alpha, user_num))
print(partitions)

[0.41871807 0.58128193]


In [36]:
label_id_dict_of_users = {}
label_id_dict_of_user = {}

user_id = 0
for partition in partitions:
    for k, v in label_num_dict.items():
        num = v * partition
        id_list = [label_id_dict[k].pop(0) for idx in range(int(num))]
        label_id_dict_of_user[k] = id_list

    # List of label_id_dict for the users
    label_id_dict_of_users[user_id] = label_id_dict_of_user
    label_id_dict_of_user = {}
    user_id += 1


In [37]:
rows = df_new.shape[1]
print(rows)
count=-1
rows = [idx for idx in range((rows)-1)]
# print(label_id_dict_of_users[0])
# print(rows)

dfs = []

for user_id in range(len(label_id_dict_of_users)):
    ids = []
    for label, id in label_id_dict_of_users[user_id].items():
        ids = ids+id
    df_one = df_new.filter(items = ids, axis=0)
    dfs.append(df_one)


11


In [38]:
df_in = df_ts.iloc[0:,0:10]
df_out = df_ts.iloc[0:,0:11]
df_in.head()

Unnamed: 0,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain
0,-2.396864,2.930924,-4.730419,1.414715,-2.055505,1.734639,4.864324,0,0,1
1,1.169701,-4.905018,-4.730419,1.414715,-2.055505,-2.993378,4.864324,0,0,1
2,-3.58572,6.872423,-4.730419,1.414715,-2.055505,-5.83592,-1.258601,1,0,0
3,-3.58572,1.681428,4.165062,1.414715,-2.055505,-0.605623,-1.258601,0,1,0
4,2.358557,2.136543,4.165062,1.414715,-2.055505,0.168626,-1.258601,1,0,0


In [39]:
from sklearn.decomposition import PCA
pca = PCA()
x = df_in
x_pca = pca.fit_transform(x)
x_pca = pd.DataFrame(x_pca)
x_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-5.791384,1.577139,0.011546,-2.766836,4.807613,-1.584089,-0.404764,-0.629772,0.786283,-1.180038e-15
1,-2.405755,-6.360005,0.535808,1.718822,5.947161,-1.333872,-0.170942,-0.847912,0.390856,4.849347e-15
2,-7.226395,2.660759,7.254179,1.198176,-1.158051,-1.625592,-2.587229,0.721952,0.385464,-1.075454e-15
3,3.026543,3.093584,3.114202,-2.00666,-0.722244,-1.650458,-2.390673,-0.411319,-0.709109,1.908577e-16
4,2.944398,3.44035,-0.942292,2.400956,-0.80303,-1.49523,-2.267765,0.66453,0.254538,-5.699248e-17


In [40]:
explained_variance = pca.explained_variance_ratio_
explained_variance

array([3.61200588e-01, 1.62353532e-01, 1.47045320e-01, 1.43012016e-01,
       7.57642623e-02, 5.85225466e-02, 4.52058352e-02, 4.32785202e-03,
       2.56804758e-03, 1.00307011e-34])

In [41]:
df_in = df_ts.copy()

In [42]:
drop_list = ['IsActiveMember','EstimatedSalary','Exited','France','Germany','Spain']
for item in drop_list:
    df_in = df_in.drop(item, axis = 1)
df_in.head()

Unnamed: 0,Tenure,Balance,NumOfProducts,HasCrCard,CreditLevel
0,-2.396864,2.930924,-4.730419,1.414715,8
1,1.169701,-4.905018,-4.730419,1.414715,7
2,-3.58572,6.872423,-4.730419,1.414715,7
3,-3.58572,1.681428,4.165062,1.414715,2
4,2.358557,2.136543,4.165062,1.414715,7


In [43]:
df_in.to_csv('../data/BankChurners_normalized_standardized_pca.csv', index=False)

In [44]:
from sklearn.model_selection import train_test_split
inputs = df_ts.iloc[0:,0:10]
labels = df_ts.iloc[0:,10:11]
X_train, X_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.2,random_state=0)

In [45]:
X_train.head()

Unnamed: 0,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain
6157,-2.396864,1.724982,4.165062,1.414715,-2.055505,3.271509,-1.258601,0,1,0
2923,2.358557,7.345795,-4.730419,1.414715,1.946993,-2.551147,-1.258601,1,0,0
5155,1.169701,3.633466,-4.730419,1.414715,1.946993,-5.575085,4.864324,0,1,0
31,4.736267,2.538468,4.165062,-3.410004,1.946993,0.109038,-1.258601,0,1,0
7628,-2.396864,-4.905018,4.165062,1.414715,1.946993,5.659358,-1.258601,1,0,0


In [46]:
y_train.head()

Unnamed: 0,CreditLevel
6157,7
2923,9
5155,7
31,8
7628,7


In [47]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

# define model
model = LinearDiscriminantAnalysis()
# define model evaluation method
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
# summarize result
print('Mean Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Mean Accuracy: 0.211 (0.008)


In [48]:
LDA = model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [49]:
predictions = LDA.predict(X_train)

In [50]:
labels = y_train['CreditLevel'].tolist()

In [51]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
acc = accuracy_score(labels, predictions)
cm = confusion_matrix(labels, predictions)
per_class_acc = cm.diagonal() / cm.sum(axis=1)
print("------- result ------")
print("Acc: ", acc)
print("Per-Class Acc: ")
print(per_class_acc)
print("Confusion Matrix: ")
print(cm)

------- result ------
Acc:  0.21805555555555556
Per-Class Acc: 
[0.         0.         0.         0.         0.00088574 0.47420635
 0.56200528 0.         0.         0.        ]
Confusion Matrix: 
[[  0   0   0   0   0   9   1   0   0   0]
 [  0   0   0   0   0  41  39   0   0   0]
 [  0   0   0   0   0 139 171   0   0   0]
 [  1   0   0   0   0 356 372   0   0   0]
 [  0   0   0   0   1 506 622   0   0   0]
 [  1   0   0   0   1 717 793   0   0   0]
 [  1   0   0   0   1 662 852   0   0   0]
 [  1   0   0   0   0 488 562   0   0   0]
 [  1   0   0   0   0 283 285   0   0   0]
 [  0   0   0   0   0 134 160   0   0   0]]


In [52]:
import xgboost as xg
XGBC = xg.XGBClassifier(objective="multi:softmax", num_class=10)
XGBC.fit(X_train, y_train)

predictions = XGBC.predict(X_train)

acc = accuracy_score(labels, predictions)
cm = confusion_matrix(labels, predictions)
per_class_acc = cm.diagonal() / cm.sum(axis=1)
print("------- result ------")
print("Acc: ", acc)
print("Per-Class Acc: ")
print(per_class_acc)
print("Confusion Matrix: ")
print(cm)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


------- result ------
Acc:  0.7331944444444445
Per-Class Acc: 
[1.         0.8        0.55483871 0.57201646 0.72542073 0.82407407
 0.87005277 0.71075167 0.58523726 0.5170068 ]
Confusion Matrix: 
[[  10    0    0    0    0    0    0    0    0    0]
 [   0   64    0    0    1    5   10    0    0    0]
 [   0    0  172    3   16   44   68    6    1    0]
 [   0    0    0  417   38  130  105   35    2    2]
 [   0    0    0   10  819  134  133   24    9    0]
 [   0    0    1   13   43 1246  172   32    2    3]
 [   0    0    0   10   26  129 1319   23    7    2]
 [   0    0    1    9   30  125  135  747    3    1]
 [   0    1    0    5   26   98   85   21  333    0]
 [   0    0    0    1   12   54   57   16    2  152]]


In [53]:
X_train.head()

Unnamed: 0,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain
6157,-2.396864,1.724982,4.165062,1.414715,-2.055505,3.271509,-1.258601,0,1,0
2923,2.358557,7.345795,-4.730419,1.414715,1.946993,-2.551147,-1.258601,1,0,0
5155,1.169701,3.633466,-4.730419,1.414715,1.946993,-5.575085,4.864324,0,1,0
31,4.736267,2.538468,4.165062,-3.410004,1.946993,0.109038,-1.258601,0,1,0
7628,-2.396864,-4.905018,4.165062,1.414715,1.946993,5.659358,-1.258601,1,0,0
