# deep learning
#### Advantages  
Deep learning is the current state-of-the-art for certain domains, such as computer vision and speech recognition. Deep neural networks perform very well on image, audio, and text data, and they can be easily updated with new data using batch propagation. Their architectures (i.e. number and structure of layers) can be adapted to many types of problems, and their hidden layers reduce the need for feature engineering.

#### Disadvantages:
Deep learning algorithms are usually not suitable as general-purpose algorithms because they require a very large amount of data.  

In fact, they are usually outperformed by tree ensembles for classical machine learning problems. In addition, they are computationally intensive to train, and they require much more expertise to tune.

In [None]:
import pandas as pd
def read_csv(path_name):
    df = pd.read_csv(path_name, index_col=False)
    return df

path_name= "after_fix_missing.csv" 
df= read_csv(path_name)    
df.tail()

In [2]:
def random_sample(df):
    #ramdon under sample not sign up
    df_sign_up= df[df['signup']==1]
    df_not_sign_up= df[df['signup']==0]

    num_sign_up= len(df_sign_up)
    df_not_sign_up= df_not_sign_up.sample(num_sign_up)
    #merge and random
    df= df_sign_up.append(df_not_sign_up)
    df= df.sample(len(df))

    print("num_sign_up: ", len(df_sign_up))
    print("num_not_sign_up: ", len(df_not_sign_up))
    
    return df

df= random_sample(df)
df.head()

num_sign_up:  25601
num_not_sign_up:  25601


Unnamed: 0,0,1,2,3,4,5,6,7,8,67,...,141,142,154,155,156,159,163,164,165,signup
88156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.98,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
113074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,12.0,19.0,1.0,0.0,0.0,4.0,0.0
91280,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.97,...,0.0,0.0,0.0,2.0,4.0,1.0,0.0,0.0,0.0,0.0
13836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0
58437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [3]:
import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras.layers import Dropout

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
from sklearn import preprocessing
def df2array(df):
    cols = ['67', '68', '84', '85', '87', '88', '101', '102', '103', '136', '156']
    df_X= df[cols]
    min_max_scaler = preprocessing.MinMaxScaler()#(0, 1)
    X = min_max_scaler.fit_transform(df_X)   
       
    y_= df['signup'].as_matrix()
    y = np_utils.to_categorical(y_)
    return X, y
X, y= df2array(df) 
print(X, y)

[[3.26666667e-01 5.00000000e-03 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [3.33333333e-01 0.00000000e+00 1.03448276e-01 ... 3.87258072e-02
  0.00000000e+00 8.63636364e-02]
 [3.23333333e-01 1.00000000e-02 1.37931034e-01 ... 5.68490525e-02
  0.00000000e+00 1.81818182e-02]
 ...
 [1.66666667e-01 0.00000000e+00 0.00000000e+00 ... 3.07791935e-04
  1.17125812e-02 0.00000000e+00]
 [3.33333333e-01 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [3.33333333e-01 0.00000000e+00 3.44827586e-02 ... 6.89329557e-02
  0.00000000e+00 1.81818182e-02]] [[0. 1.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [0. 1.]
 [1. 0.]]


  if __name__ == '__main__':


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [6]:
model = Sequential()
model.add(Dropout(0.2))
model.add(Dense(5, input_dim=11, init='normal', activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(2, init='normal', activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=50, nb_epoch=5, verbose=1, validation_data=(X_test, y_test))

  This is separate from the ipykernel package so we can avoid doing imports until
  """
  


Train on 30721 samples, validate on 20481 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a41595390>

In [7]:
score = model.evaluate(X_test, y_test, verbose=0)
#print('Test score:', score[0])
print('Test accuracy:', score[1])

Test accuracy: 0.7328255456276549
