In [73]:
# New python modules are bundled in requirements.ipynb
#!pip install ludwig

In [74]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
print('Current path: ' + os.getcwd())

Current path: /tmp/working


In [75]:
dataset = pd.read_csv('./input/titanic/train.csv', index_col=False)
submission_set = pd.read_csv('./input/titanic/test.csv', index_col=False)

# RESTORE PATHS FOR KAGGLE ENVIRONMENT
#dataset = pd.read_csv('../input/titanic/train.csv', index_col=False)
#submission_set= pd.read_csv('../input/titanic/test.csv', index_col=False)

In [76]:
# dataset will be splitted internally in Ludwig model into training & test sets
dataset.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [77]:
# submission_set does not have 'Survived' column. You have to predict it
submission_set.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [78]:
from ludwig.api import LudwigModel
import yaml
import logging

In [79]:
### OPTION 1: DEFINE THE MODEL IN THE CODE #################################
# Model definition
titanic_yaml = """
input_features:
    -
        name: Pclass
        type: category
    -
        name: Sex
        type: category
    -
        name: Age
        type: numerical
        missing_value_strategy: fill_with_mean
    -
        name: SibSp
        type: numerical
    -
        name: Parch
        type: numerical
    -
        name: Fare
        type: numerical
        missing_value_strategy: fill_with_mean
    -
        name: Embarked
        type: category

output_features:
    -
        name: Survived
        type: binary
"""

model_definition = yaml.load(titanic_yaml, Loader=yaml.FullLoader)

print(yaml.dump(model_definition))

input_features:
- name: Pclass
  type: category
- name: Sex
  type: category
- missing_value_strategy: fill_with_mean
  name: Age
  type: numerical
- name: SibSp
  type: numerical
- name: Parch
  type: numerical
- missing_value_strategy: fill_with_mean
  name: Fare
  type: numerical
- name: Embarked
  type: category
output_features:
- name: Survived
  type: binary



In [80]:
### OPTION 2: DEFINE THE MODEL FROM AN EXTERNAL YAML FILE #################################
# You can alternatively load it from the external file
with open(r'./input/titanic/model_definition.yaml') as file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to Python the dictionary format
    titanic_yaml_toJson = yaml.load(file, Loader=yaml.FullLoader)
    print(titanic_yaml_toJson)
    
model_definition = yaml.load('./input/titanic/model_definition.yaml', Loader=yaml.FullLoader)
print(yaml.dump(model_definition))

{'input_features': [{'name': 'Pclass', 'type': 'category'}, {'name': 'Sex', 'type': 'category'}, {'name': 'Age', 'type': 'numerical', 'missing_value_strategy': 'fill_with_mean'}, {'name': 'SibSp', 'type': 'numerical'}, {'name': 'Parch', 'type': 'numerical'}, {'name': 'Fare', 'type': 'numerical', 'missing_value_strategy': 'fill_with_mean'}, {'name': 'Embarked', 'type': 'category'}], 'output_features': [{'name': 'Survived', 'type': 'binary'}]}
./input/titanic/model_definition.yaml
...



In [81]:
# Instantiate a model
model = LudwigModel(model_definition)
#Train the model
print("training...")
train_stats = model.train(dataset, logging_level=logging.INFO)
print("finished training.\n")

training...
finished training.



In [82]:
predictions = model.predict(submission_set, logging_level=logging.INFO)
predictions

(     Survived_predictions  Survived_probabilities_False  \
 0                   False                      0.820209   
 1                   False                      0.565499   
 2                   False                      0.724704   
 3                   False                      0.829723   
 4                   False                      0.542959   
 ..                    ...                           ...   
 413                 False                      0.854882   
 414                  True                      0.125081   
 415                 False                      0.820851   
 416                 False                      0.854882   
 417                 False                      0.805885   
 
      Survived_probabilities_True  Survived_probability  
 0                       0.179791              0.820209  
 1                       0.434501              0.565499  
 2                       0.275296              0.724704  
 3                       0.170277             

In [83]:
survived_pred = predictions[0][['Survived_predictions']]
survived_pred

Unnamed: 0,Survived_predictions
0,False
1,False
2,False
3,False
4,False
...,...
413,False
414,True
415,False
416,False
