# Kaggle Titanic competition
goal: create fast and submit to learn how to deal with Kaggle


In [1]:
# general & data analysis imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# data preprocessing imports
from sklearn import preprocessing
#import category_encoders as ce
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.model_selection import train_test_split

# model learning & evaluation imports
import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras import regularizers
import autokeras as ak

Using TensorFlow backend.


## data loading

In [2]:
train_dataset=pd.read_csv('train.csv')
test_dataset=pd.read_csv('test.csv')

## data analysis

In [None]:
train_dataset.info()

In [None]:
train_dataset.describe()

In [None]:
train_dataset.head()


In [None]:
train_dataset.hist(figsize=(15,20));
plt.figure();

In [None]:
sns.pairplot(train_dataset, hue="Survived");

In [None]:
sns.barplot(x="SibSp", y="Survived", data=train_dataset)

In [None]:
sns.barplot(x="Parch", y="Survived", data=train_dataset)

In [None]:
sns.heatmap(train_dataset.corr(),annot=True) 


In [None]:
test_dataset.isnull().sum()

## data preprocessing

In [3]:
# remove dependent variable from train set to have the same structure as test set
df_y=train_dataset['Survived'].copy()
train_dataset.drop('Survived', axis=1,inplace=True)

In [4]:
def preprocess_data(source_df):
    df=source_df.copy()
    # Age to bins, NaN to separate bin
    #df.Age.fillna(-1, inplace=True)
    #age_bins=(-10,0,10,30,500)
    #age_labels=('unk','upto10','upto30','above30')
    #df.Age=pd.cut(df.Age, age_bins, labels=age_labels)
    #cabin - keep first letter
    df.Cabin.fillna('Unknown', inplace=True)
    df.Cabin=df.Cabin.apply(lambda x: x[0])
    # fare - divide by mean (~32)
    df.Fare.fillna(df.Fare.median(), inplace=True)
    #df.Fare=df.Fare.apply(lambda x: x/32)
    # embarked - fillna
    df.Embarked.fillna('U', inplace=True)
    # SibSp, Parch - replace to binary
    #df['SibSp']=df['SibSp'].apply(lambda x: 1 if (x==1)|(x==2) else 0)
    #df['Parch']=df['Parch'].apply(lambda x: 1 if (x==1)|(x==2)|(x==3) else 0)
    
    #df['FamilySize']=df['SibSp']+df['Parch']+1
    #df.SibSp=df.SibSp.apply(lambda x: int(x>0))
    #df.Parch=df.Parch.apply(lambda x: int(x>0))
    # df['Fam_label']=df.FamilySize.apply(Fam_label)
    # ticket, name - drop
    df=df.drop(['Ticket','Name','PassengerId'], axis=1)

    #return preprocessed df
    return df


In [5]:
df_test=preprocess_data(test_dataset)
df_train=preprocess_data(train_dataset)

In [None]:
df_train.info()

In [None]:
df_train.head()

In [None]:
df_train['Sex'].value_counts()

In [6]:
def encode(df1, df2):
    '''df1, df2 - train and test dataframes
    return - modified df1, df2'''
    
    cat_features =['Sex','Pclass','Cabin','Embarked'] # features for categorization
    onehot_features=['Cabin','Embarked','Age'] # features for onehot encoding
    
    # for each feature, use LabelEncoder on both dataframes
    for f in cat_features:
        l_encoder = preprocessing.LabelEncoder()
        l_encoder.fit(df1[f])
        df1[f]=l_encoder.transform(df1[f])
        df2[f]=l_encoder.transform(df2[f])

    # use OneHotEncoder on both dataframes
    # oh_encoder = ce.OneHotEncoder(handle_unknown='ignore', cols=onehot_features,use_cat_names=True)
    # df1=oh_encoder.fit_transform(df1)
    # df2=oh_encoder.transform(df2)
    
    # drop manually featurez for "unknown" values
    #df1.drop('Age_unk', axis=1, inplace=True)
    #df2.drop('Age_unk', axis=1, inplace=True)
    #df1.drop('Embarked_U', axis=1, inplace=True)
    #df2.drop('Embarked_U', axis=1, inplace=True)

    return df1,df2
        
    

In [7]:
df_train, df_test = encode(df_train, df_test)

In [None]:
# Perform feature selection
K=12

#draw a graph before
predictors=df_train.columns.tolist()
#selector = SelectKBest(f_classif, k='all')
selector = SelectKBest(chi2, k='all')
_=selector.fit(df_train, df_y)
plt.bar(range(len(predictors)), selector.scores_)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.title('features - before')
plt.show()

# select K best features

selector = SelectKBest(f_classif, k=K)
X_train_pruned=selector.fit_transform(df_train, df_y)
X_test_pruned=selector.transform(df_test)  

selected_features=df_train.columns[selector.get_support(indices=True)].tolist()
removed_features=[x for x in predictors if x not in selected_features]
print(f'selected: {selected_features}')
print(f'removed: {removed_features}')

#draw a graph after
selector = SelectKBest(chi2, k='all')
_=selector.fit(X_train_pruned, df_y)
plt.bar(range(len(selected_features)), selector.scores_)
plt.xticks(range(len(selected_features)), selected_features, rotation='vertical')
plt.title('features - after')
plt.show()


In [None]:
X_train_pruned.shape

## building autokeras model

In [33]:
# split train and validation
# X_train,X_val,y_train,y_val=train_test_split(df_train.values,dfy.values,test_size=0.25,random_state=1, shuffle=True)
X_train,X_val,y_train,y_val=train_test_split(df_train.values,df_y.values,test_size=0.25,random_state=1, shuffle=True)
X_test=df_test.values

In [34]:
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)
print(X_test.shape)


(668, 8)
(668,)
(223, 8)
(223,)
(418, 8)


In [None]:
df_train.info()

In [26]:
df_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,2,1,22.0,1,0,7.25,8,2
1,0,0,38.0,1,0,71.2833,2,0
2,2,0,26.0,0,0,7.925,8,2
3,0,0,35.0,1,0,53.1,2,2
4,2,1,35.0,0,0,8.05,8,2


In [None]:
X_train.shape

In [None]:
X_train[0]

In [35]:
datainfo=np.array(['NUM','CAT','NUM','NUM','CAT','NUM','CAT','CAT'])

In [36]:
from keras.datasets import mnist
from autokeras.image.image_supervised import ImageClassifier


clf = ak.TabularClassifier()
clf.fit(X_train, y_train, time_limit=30*60, data_info=datainfo)

clf.final_fit(X_train, y_train, X_val, y_val, retrain=True)
y = clf.evaluate(X_val, y_val)
print(y)

QQ: ['NUM' 'CAT' 'NUM' 'NUM' 'CAT' 'NUM' 'CAT' 'CAT']
QQ1: 0
QQ2: 4
QQ3: 4
num_cat_pair_2: {}
0.7453125


In [38]:
clf.load_searcher().load_best_model().produce_keras_model().save('my_auto_model.h5')

AttributeError: 'TabularClassifier' object has no attribute 'load_searcher'

In [None]:
from keras.models import load_model
model = load_model('my_model.h5') #See 'How to export keras models?' to generate this file before loading it.

In [None]:
datainfo

## predict for test data & submit

In [30]:
y_probs=clf.predict(X_test)
y_preds=np.rint(y_probs).astype(int)
y_preds=y_preds.reshape(-1)

In [31]:
submission = pd.DataFrame({'PassengerId':test_dataset['PassengerId'],'Survived':y_preds})


In [32]:
submission.to_csv('submission.csv',index=False)