In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelBinarizer, LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score

In [3]:
from neural_network import FNNClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression

Using TensorFlow backend.


### Classification with Cenesus data

In [4]:
df = pd.read_csv('./data/adult.csv') 

In [5]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [6]:
df['income'].value_counts()

<=50K    24720
>50K      7841
Name: income, dtype: int64

In [7]:
df = pd.get_dummies(df)

In [8]:
X = df[[c for c in df.columns if c not in ['income_<=50K', 'income_>50K']]].values

In [9]:
y = df['income_>50K'].values

In [10]:
scaler = StandardScaler()

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=22)

In [12]:
X_train

array([[    26,  94392,      7, ...,      1,      0,      0],
       [    41, 279297,      9, ...,      1,      0,      0],
       [    35, 155961,      9, ...,      0,      0,      0],
       ...,
       [    38, 192939,      9, ...,      1,      0,      0],
       [    38, 186145,     10, ...,      1,      0,      0],
       [    36, 112074,     16, ...,      1,      0,      0]], dtype=int64)

In [13]:
X_train = scaler.fit_transform(X_train)



In [14]:
fnn = FNNClassifier(class_weight='balanced')

In [15]:
fnn.fit(X_train, y_train)

Data size (29304, 108) -	 Epochs 100 -	 Batch Size 128
Computed Class Weights {0: 0.6572466693580945, 1: 2.089858793324775}
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Hidden_1 (Dense)             (None, 50)                5450      
_________________________________________________________________
Dropout_1_0.5 (Dropout)      (None, 50)                0         
_________________________________________________________________
Output_sigmoid (Dense)       (None, 1)                 51        
Total params: 5,501
Trainable params: 5,501
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
 4480/29304 [===>..........................] - ETA: 0s - loss: 0.4286 - acc: 0.7833



Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 

Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Fit complete in 97.92 seconds


In [16]:
fnn.score(scaler.transform(X_test), y_test)





[0.37273428834103206, 0.8197727968511971]

In [17]:
logistic = LogisticRegression(class_weight='balanced')

In [18]:
logistic.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [19]:
accuracy_score(y_pred=logistic.predict(scaler.transform(X_test)), y_true=y_test)



0.8167024869511821

In [20]:
df = pd.read_csv('./data/creditcard.csv')

In [21]:
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [22]:
X = df[[c for c in df.columns if c not in ['Time', 'Class']]].values

In [23]:
y = df['Class'].values

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=22)

In [25]:
X_train = scaler.fit_transform(X_train)

In [26]:
fnn = FNNClassifier(hidden_layers=[50, 30, 5], dropout=0.5, validation_split=0.2, class_weight='balanced', epochs=100, early_stopping=1)

In [27]:
fnn.fit(X_train, y_train)

Data size (256326, 29) -	 Epochs 100 -	 Batch Size 128
Computed Class Weights {0: 0.5008812896922326, 1: 284.1751662971175}
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Hidden_1 (Dense)             (None, 50)                1500      
_________________________________________________________________
Dropout_1_0.5 (Dropout)      (None, 50)                0         
_________________________________________________________________
Hidden_2 (Dense)             (None, 30)                1530      
_________________________________________________________________
Dropout_2_0.5 (Dropout)      (None, 30)                0         
_________________________________________________________________
Hidden_3 (Dense)             (None, 5)                 155       
_________________________________________________________________
Dropout_3_0.5 (Dropout)      (None, 5)                 0         
__________________

In [28]:
X_test = scaler.transform(X_test)

In [29]:
roc_auc_score(y_score=fnn.predict_proba(X_test)[:, 1], y_true=y_test)

0.9980155054715104

In [30]:
fnn.score(scaler.transform(X_test), y_test)



[0.051488693095332785, 0.9933288859239493]

In [40]:
logistic = SGDClassifier(class_weight='balanced', loss='log', penalty='l2')

In [41]:
logistic.fit(X_train, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight='balanced',
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [42]:
roc_auc_score(y_score=logistic.predict_proba(X_test)[:, 1], y_true=y_test)

  np.exp(prob, prob)


0.9907327364412885