In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

train_data_path = 'data/train.csv'
test_data_path = 'data/test.csv'

titanic_temp_data = pd.read_csv(train_data_path)
titanic_test_data = pd.read_csv(test_data_path)

titanic_train_data, titanic_val_data = train_test_split(
                                            titanic_temp_data, test_size = .2)

In [2]:
titanic_train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
635,636,1,2,"Davis, Miss. Mary",female,28.0,0,0,237668,13.0,,S
844,845,0,3,"Culumovic, Mr. Jeso",male,17.0,0,0,315090,8.6625,,S
778,779,0,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q
849,850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C
490,491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",male,,1,0,65304,19.9667,,S


In [157]:
# keep only the necessary column values
features = ['Pclass','Sex','Age','SibSp','Parch','Fare']
labels = ['Survived']

train_set = titanic_train_data[features].reset_index(drop=True)
val_set = titanic_val_data[features].reset_index(drop=True)
test_set = titanic_test_data[features].reset_index(drop=True)

train_labels = titanic_train_data[labels].reset_index(drop=True)
val_labels = titanic_val_data[labels].reset_index(drop=True)

In [158]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler

## set(train_set['Sex)) = {male,female} need to conver to binary
lb_make = LabelEncoder()
train_set.loc[:,'Sex_bin'] = lb_make.fit_transform(train_set['Sex'])
val_set.loc[:,'Sex_bin'] = lb_make.fit_transform(val_set['Sex'])
test_set.loc[:,'Sex_bin'] = lb_make.fit_transform(test_set['Sex'])

## alternative approach using built in pandas functionality
# train_set['Sex_bin'] = train_set['Sex'].astype('category').cat.codes
# test_set['Sex_bin] = test_set['Sex].astype('category').cat.codes

In [159]:
## convert Pclass to be one-hot-encoded. 
lb_binarizer = LabelBinarizer()
Pclass_binarize_results_train = lb_binarizer.fit_transform(train_set['Pclass'])
Pclass_binarize_results_val = lb_binarizer.fit_transform(val_set['Pclass'])
Pclass_binarize_results_test = lb_binarizer.fit_transform(test_set['Pclass'])
Pclass_labels = ['class_{}'.format(c) for c in lb_binarizer.classes_]

Pclass_df_train = pd.DataFrame(Pclass_binarize_results_train, columns=Pclass_labels)
Pclass_df_val = pd.DataFrame(Pclass_binarize_results_val, columns=Pclass_labels)
Pclass_df_test = pd.DataFrame(Pclass_binarize_results_test, columns=Pclass_labels)


training_data = pd.concat([train_set,Pclass_df_train], axis=1)
val_data = pd.concat([val_set,Pclass_df_val], axis=1)
testing_data = pd.concat([test_set, Pclass_df_test], axis=1)

In [160]:
## fill in missing age data
def nan_conversion(df,column):
    mean_age = df[column].mean()
    median_age = df[column].median()
    std_age = df[column].std()
    num_miss = df[column].isnull().sum()
    
    # fill age by draws from a normal distribution defined by mean and std
    age_fill = np.random.normal(mean_age, std_age, num_miss)
    
    if np.isnan(df[column]).sum(axis= 0) > 0:
        df.loc[np.isnan(df[column]),column] = np.abs(age_fill)
        
    ## another option is to just fillna with median or mean values
#     df['Age'].fillna(median_age, inplace=True)
#     df['Age'].fillna(mean_age, inplace=True) 

    # there are other options such as drawing from uniform dist bounded by mean-std, mean+std
    return df

training_data = nan_conversion(training_data, 'Age')
val_data = nan_conversion(val_data, 'Age')
testing_data = nan_conversion(testing_data, 'Age')
testing_data = nan_conversion(testing_data, 'Fare')

In [161]:
# remove not need columns
columns_to_drop = set(['Pclass','Sex'])
if bool(columns_to_drop.intersection(set(training_data.columns))):
    training_data.drop(columns_to_drop, axis=1, inplace=True)
if bool(columns_to_drop.intersection(set(val_data.columns))):
    val_data.drop(columns_to_drop, axis=1, inplace=True)
if bool(columns_to_drop.intersection(set(testing_data.columns))):
    testing_data.drop(columns_to_drop, axis=1, inplace=True)

In [162]:
x_train = training_data.values
x_val = val_data.values
x_test = testing_data.values

y_train = train_labels.values
y_val = val_labels.values

In [163]:
## Through testing learned that testing has a missing value for fare. 
scaler = StandardScaler()
scaled_x_trian = scaler.fit_transform(x_train)
scaled_x_val = scaler.fit_transform(x_val)
scaled_x_test = scaler.fit_transform(x_test)

In [221]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

rfc = RandomForestClassifier(n_estimators=100)
lr = LogisticRegression()

In [223]:
y_pred = cross_val_predict(rfc, scaled_x_trian, y_train.reshape(712,), cv=3)
accuracy_score(y_train,y_pred)

0.8146067415730337

In [222]:
rfc.fit(scaled_x_trian, y_train)
rfc.score(scaled_x_val, y_val)

  """Entry point for launching an IPython kernel.


0.77094972067039103

In [22]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


train_data_path = 'data/train.csv'
test_data_path = 'data/test.csv'

titanic_train_data = pd.read_csv(train_data_path)
titanic_test_data = pd.read_csv(test_data_path)

# keep only the necessary column values
features = ['Pclass','Sex','Age','SibSp','Parch','Fare']
labels = ['Survived']

train_set = titanic_train_data[features].reset_index(drop=True)
test_set = titanic_test_data[features].reset_index(drop=True)

train_labels = titanic_train_data[labels].reset_index(drop=True)


from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler

## set(train_set['Sex)) = {male,female} need to conver to binary
lb_make = LabelEncoder()
train_set.loc[:,'Sex_bin'] = lb_make.fit_transform(train_set['Sex'])
test_set.loc[:,'Sex_bin'] = lb_make.fit_transform(test_set['Sex'])


## convert Pclass to be one-hot-encoded. 
lb_binarizer = LabelBinarizer()
Pclass_binarize_results_train = lb_binarizer.fit_transform(train_set['Pclass'])
Pclass_binarize_results_test = lb_binarizer.fit_transform(test_set['Pclass'])
Pclass_labels = ['class_{}'.format(c) for c in lb_binarizer.classes_]

Pclass_df_train = pd.DataFrame(Pclass_binarize_results_train, columns=Pclass_labels)
Pclass_df_test = pd.DataFrame(Pclass_binarize_results_test, columns=Pclass_labels)


training_data = pd.concat([train_set,Pclass_df_train], axis=1)
testing_data = pd.concat([test_set, Pclass_df_test], axis=1)


## fill in missing age data
def nan_conversion(df,column):
    mean_age = df[column].mean()
    median_age = df[column].median()
    std_age = df[column].std()
    num_miss = df[column].isnull().sum()
    
    # fill age by draws from a normal distribution defined by mean and std
    age_fill = np.random.normal(mean_age, std_age, num_miss)
    
    if np.isnan(df[column]).sum(axis= 0) > 0:
        df.loc[np.isnan(df[column]),column] = np.abs(age_fill)
        
    return df

training_data = nan_conversion(training_data, 'Age')
testing_data = nan_conversion(testing_data, 'Age')
testing_data = nan_conversion(testing_data, 'Fare')

# remove not need columns
columns_to_drop = set(['Pclass','Sex'])
if bool(columns_to_drop.intersection(set(training_data.columns))):
    training_data.drop(columns_to_drop, axis=1, inplace=True)
if bool(columns_to_drop.intersection(set(testing_data.columns))):
    testing_data.drop(columns_to_drop, axis=1, inplace=True)
    
    
x_train = training_data.values
x_test = testing_data.values

y_train = train_labels.values

In [63]:
# baseline model
def create_baseline():
    # create model
    model = Sequential()
    model.add(Dense(8, input_dim=8, kernel_initializer='normal', activation='relu'))
#     model.add(Dropout(.2))
    model.add(Dense(4, kernel_initializer='normal', activation='relu'))
#     model.add(Dense(4, kernel_initializer='normal', activation='relu'))
#     model.add(Dropout(.2))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [58]:
import jupyternotify
ip = get_ipython()
ip.register_magics(jupyternotify.JupyterNotifyMagics)

<IPython.core.display.Javascript object>

In [18]:


# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

# X = scaled_x_trian
X = x_train
Y = y_train.reshape(y_train.shape[0])

# evaluate model with standardized dataset
estimator = KerasClassifier(build_fn=create_baseline, nb_epoch=100, batch_size=5, verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X, Y, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [65]:
%notify
# X = scaled_x_trian
X = x_train
Y = y_train.reshape(y_train.shape[0])

# evaluate baseline model with standardized dataset
numpy.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_baseline, epochs=300, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X, Y, cv=kfold)
print("Standardized: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

<IPython.core.display.Javascript object>

KeyboardInterrupt: 

In [None]:
# All done w/ 10 splits
3 layer (8,4,1) no dropout epoch100 = 81.59,4.18
4 layer (8,8,4,1) no dropout epoch100 = 80.58, 4.1
3 layer (4,4,1) no dropout epoch100 = 80.69, 3.53
3 layer (4,4,1) .5 dropout epoch100 = 79.63, 2.88
3 layer (4,4,1) .5 dropout epoch200 = 80.02, 3.92
# All done w/ 5 splits
3 layer (4,4,1) .2 dropout epoch200 = 77, 8.25
3 layer (4,4,1) no dropout epoch200 = 81.03, 2.74
3 layer (8,4,1) no dropout epoch200 = 81.71, 1.6
3 layer (8,8,1) no dropout epoch200 = 81.26, 1.18
4 layer (8,8,4,1) no dropout epoch200 = 78, 8.22
2 layer (4,1) no dropout epoch200 = 80.81, 1.7
4 layer (16,8,4,1) no dropout epoch200 = 78.65, 8.7
3 layer (16,8,1) no dropout epoch200 = 81.59, 1.39
3 layer (16,8,1) no dropout epoch300 =82.16, 3.47
3 layer (8,4,1) no dropout epoch300 = 81.37, 1.84