<a href="https://colab.research.google.com/github/thyyl/Deep-Learning/blob/main/SimpleDeepLearning/DeepLearningBasics/KerasPipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Upload datasets

In [None]:
# Colab-only: files.upload() is used once per dataset (train first, then test).
from google.colab import files
# The returned objects are indexed by file name in later cells
# (train_uploaded['train.csv'], test_uploaded['test.csv']), so each upload
# must contain the file with exactly that name.
train_uploaded = files.upload()
test_uploaded = files.upload()

Saving train.csv to train.csv


Saving test.csv to test.csv


Import libraries

In [None]:
import collections
import sys
import os
import io
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML, Image
%matplotlib inline

from plotly import tools
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.figure_factory as ff
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
py.init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings('ignore')

In [None]:
def configure_plotly_browser_state():
  """Emit an HTML snippet that loads RequireJS and registers the plotly CDN path.

  The snippet is rendered into the current cell's output via ``display``.
  NOTE(review): presumably required so plotly figures render in Colab output
  cells; call it at the top of each plotting cell - confirm it is still needed.
  """
  from IPython.core.display import HTML

  # RequireJS configuration: maps `base` to the notebook's static files and
  # `plotly` to the plot.ly CDN bundle.
  require_js_setup = '''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''
  display(HTML(require_js_setup))

In [None]:
# Wrap the uploaded train.csv bytes in an in-memory buffer and parse it as CSV.
train_buffer = io.BytesIO(train_uploaded['train.csv'])
train_df = pd.read_csv(train_buffer)
# Column dtypes and non-null counts of the training set.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Wrap the uploaded test.csv bytes in an in-memory buffer and parse it as CSV.
test_buffer = io.BytesIO(test_uploaded['test.csv'])
test_df = pd.read_csv(test_buffer)
# Column dtypes and non-null counts of the test set.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


Check for null values

In [None]:
# Number of missing values per column in the training set.
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Number of missing values per column in the test set.
test_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

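Both datasets are missing values in Age and Cabin (the training set also lacks 2 Embarked values, and the test set 1 Fare value). Because Cabin is null for most rows (687 of 891 in train, 327 of 418 in test), it is wiser to drop that column from both datasets.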

In [None]:
# NOTE(review): DataFrame.drop returns a new frame and neither result is
# assigned, so train_df and test_df still contain Cabin after this cell.
# Only the last expression (the test frame without Cabin) is displayed.
# The feature-selection cell further down removes Cabin from the model inputs.
train_df.drop(columns='Cabin')
test_df.drop(columns='Cabin')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S
...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,S


### Data Visualization

In [None]:
def DistPlotTrace(df, name, color):
  """Build a plotly histogram trace for one group of values.

  Args:
    df: values to bin; passed straight to ``go.Histogram`` as ``x``.
    name: legend label of the trace.
    color: marker colour of the bars.

  Returns:
    A ``go.Histogram`` with fixed bins from 0 to 80 in steps of 4.
  """
  # Automatic binning is disabled so that every trace shares the same bins.
  fixed_bins = {'start': 0, 'end': 80, 'size': 4}
  bar_style = {'color': color}
  return go.Histogram(
      x=df,
      opacity=0.75,
      name=name,
      xbins=fixed_bins,
      autobinx=False,
      marker=bar_style,
  )

In [None]:
configure_plotly_browser_state()

# Split the Age column by outcome.
age_survived = train_df[train_df.Survived == 1]['Age']
age_not_survived = train_df[train_df.Survived == 0]['Age']
features = [
    (age_survived, 'Survived', 'rgba(171, 50, 96, 0.6)'),
    (age_not_survived, 'Not Survived', 'rgba(12, 50, 196, 0.6)'),
]

# One histogram trace per outcome, built with the DistPlotTrace helper
# (its first parameter is positional/`df`, not `data`).
data = [DistPlotTrace(ages, name, color) for ages, name, color in features]

# Overlay both histograms; the x axis is the passenger's age, the y axis the
# number of passengers falling in each age bin.
layout = go.Layout(
    barmode = 'overlay',
    title = 'Survivability by Age',
    xaxis = dict(title='Age'),
    yaxis = dict(title='Count'),
    template = 'plotly_white'
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

### Data Fitting

In [None]:
# Schema check before building the feature matrix.
# NOTE(review): the saved output of this cell shows Sex as int64, but Sex is
# only label-encoded in the next cell - the cell was evidently executed out of
# order. On a fresh top-to-bottom run Sex is still an object column here.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(6), object(4)
memory usage: 83.7+ KB


In [None]:
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder 

label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder()  # created but not used in this cell

# Replace the string Sex column with integer labels, in place on train_df.
# Re-running the cell is harmless: already-encoded integers map to themselves.
encoded = label_encoder.fit_transform(train_df['Sex'])
train_df['Sex'] = encoded

# Feature matrix: drop identifiers, the target, free-text columns and the
# un-encoded categorical Embarked. DataFrame.drop returns a new frame, so
# train_df itself keeps all of its columns.
X = train_df.drop(columns=['PassengerId', 'Survived', 'Ticket', 'Cabin', 'Name', 'Embarked'])
# Target: 1 = survived, 0 = did not survive.
y = train_df['Survived']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras import optimizers

def create_model(optimizer='adam', dropout=0.2):
    """Build and compile a small binary classifier.

    Architecture: Dense(64, relu) -> Dropout(dropout) -> Dense(1, sigmoid).
    No input shape is declared on the first layer.

    Args:
        optimizer: optimizer passed to ``compile``.
        dropout: rate handed to the Dropout layer.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss,
        accuracy metric).
    """
    layer_stack = [
        Dense(64, activation='relu'),
        Dropout(dropout),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
# Inspect the training features that will be fed to the model.
# NOTE(review): the saved output lists an Embarked column, but the
# feature-selection cell drops Embarked - the output appears to come from an
# earlier version of that cell. Age contains missing values.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 331 to 102
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    712 non-null    int64  
 1   Sex       712 non-null    int64  
 2   Age       572 non-null    float64
 3   SibSp     712 non-null    int64  
 4   Parch     712 non-null    int64  
 5   Fare      712 non-null    float64
 6   Embarked  710 non-null    object 
dtypes: float64(2), int64(4), object(1)
memory usage: 44.5+ KB


In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Wrap the Keras model factory so it behaves like a scikit-learn estimator.
clf = KerasClassifier(build_fn=create_model,verbose=0)

# Age has missing values in X_train; feeding NaNs to the network would
# propagate NaN through the loss, so fill them with the column median first.
# The imputer is fitted on the training fold only, as part of the pipeline.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('clf',clf)
])
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('clf',
                 <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7fc4e14598d0>)],
         verbose=False)