In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
import ydf # Importing ydf for compatibility with TensorFlow Decision Forests
import tensorflow_decision_forests as tfdf

In [5]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [6]:
print(f"Found TF-DF {tfdf.__version__}")

Found TF-DF 1.12.0


In [7]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [8]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
test_df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [11]:
#Verify the data types of the columns
print(train_df.dtypes)

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


### Data Preprocessing

In [12]:
# Drop the 'PassengerId', 'Name' and 'Ticket' columns from both datasets, as it is not useful for training
train_df = train_df.drop(columns=['PassengerId', 'Name', 'Ticket'])
test_df = test_df.drop(columns=['PassengerId', 'Name', 'Ticket'])

In [13]:
# Convert categorical columns to string type
cat_cols = ['Pclass']
train_df[cat_cols] = train_df[cat_cols].astype(str)
test_df[cat_cols] = test_df[cat_cols].astype(str)

In [14]:
# Convert the survival column to 'yes' and 'no' strings
train_df['Survived'] = train_df['Survived'].map({0: 'no', 1: 'yes'})

In [15]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,no,3,male,22.0,1,0,7.25,,S
1,yes,1,female,38.0,1,0,71.2833,C85,C
2,yes,3,female,26.0,0,0,7.925,,S
3,yes,1,female,35.0,1,0,53.1,C123,S
4,no,3,male,35.0,0,0,8.05,,S


In [16]:
# Count the nan values in the train dataset in each column
train_df.isna().sum().sort_values(ascending=False)

Cabin       687
Age         177
Embarked      2
Survived      0
Pclass        0
SibSp         0
Sex           0
Fare          0
Parch         0
dtype: int64

In [17]:
# Count the nan values in the test dataset in each column
test_df.isna().sum().sort_values(ascending=False)

Cabin       327
Age          86
Fare          1
Pclass        0
SibSp         0
Sex           0
Parch         0
Embarked      0
dtype: int64

In [18]:
# Remove the 'Cabin' column in both datasets, as it has too many missing values
train_df = train_df.drop(columns=['Cabin'])
test_df = test_df.drop(columns=['Cabin'])

In [19]:
# Fill missing values in 'Age' with the median age

## If this is not a good strategy, I will remove the rows

train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())

In [20]:
# Show the embarked values in the 'Embarked' column that is missing
train_df[train_df['Embarked'].isna()]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
61,yes,1,female,38.0,0,0,80.0,
829,yes,1,female,62.0,0,0,80.0,


In [21]:
# Fill missing values in 'Embarked' in the train dataset with the most common value
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])

In [22]:
# Fill missing values in 'Fare' in the test dataset with the median fare
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

In [23]:
# Verify that there are no more missing values in the train dataset
train_df.isna().sum().sort_values(ascending=False)

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [24]:
# Verify that there are no more missing values in the test dataset
test_df.isna().sum().sort_values(ascending=False)

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

### Model Training ###

In [25]:
# Create a TensorFlow Decision Forests model
model = tfdf.keras.RandomForestModel(task= tfdf.keras.Task.CLASSIFICATION,
                                      num_trees=100,
                                      max_depth=10,
                                      min_examples=10,
                                      categorical_algorithm='CART',
                                     )

Use /tmp/tmpvbpf97m2 as temporary training directory


I0000 00:00:1754314939.658873  274438 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4153 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1660 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5


In [26]:
# Convert the datasets to a TensorFlow datasets
train_data = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label='Survived')
test_data = tfdf.keras.pd_dataframe_to_tf_dataset(test_df)

In [27]:
model.fit(train_data)  # Train the model on the training dataset

Reading training dataset...
Training dataset read in 0:00:03.310900. Found 891 examples.
Training model...
Model trained in 0:00:00.098963
Compiling model...


I0000 00:00:1754314943.202129  274438 kernel.cc:782] Start Yggdrasil model training
I0000 00:00:1754314943.202171  274438 kernel.cc:783] Collect training examples
I0000 00:00:1754314943.202182  274438 kernel.cc:795] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1754314943.202608  274438 kernel.cc:401] Number of batches: 1
I0000 00:00:1754314943.202626  274438 kernel.cc:402] Number of examples: 891
I0000 00:00:1754314943.202964  274438 kernel.cc:802] Training dataset:
Number of records: 891
Number of columns: 8

Number of columns by type:
	CATEGORICAL: 4 (50%)
	NUMERICAL: 4 (50%)

Columns:

CATEGORICAL: 4 (50%)
	1: "Embarked" CATEGORICAL has-dict vocab-si

Model compiled.


<tf_keras.src.callbacks.History at 0x7b6605de8f20>

In [28]:
model.summary()  # Print the model summary

Model: "random_forest_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (7):
	Age
	Embarked
	Fare
	Parch
	Pclass
	Sex
	SibSp

No weights

Variable Importance: INV_MEAN_MIN_DEPTH:
    1.      "Sex"  0.580120 ################
    2.     "Fare"  0.334135 ######
    3.   "Pclass"  0.255020 ####
    4.      "Age"  0.246121 ###
    5. "Embarked"  0.158898 
    6.    "SibSp"  0.154797 
    7.    "Parch"  0.145175 

Variable Importance: NUM_AS_ROOT:
    1.      "Sex" 58.000000 ################
    2.     "Fare" 20.000000 #####
    3.   "Pclass" 18.000000 ####
    4. "Embarked"  3.000000 
    5.    "Parch"  1.000000 

Variable Importance: NUM_NODES:
    1.     "Fare" 1705.0

In [29]:
# Test the model on the test dataset
predictions = model.predict(test_data)  # Get predictions for the test dataset
predictions



array([[0.        ],
       [0.3899999 ],
       [0.        ],
       [0.02      ],
       [0.44999984],
       [0.04      ],
       [0.8199995 ],
       [0.16      ],
       [0.8499995 ],
       [0.01      ],
       [0.01      ],
       [0.19000001],
       [0.99999934],
       [0.        ],
       [0.99999934],
       [0.97999936],
       [0.        ],
       [0.05999999],
       [0.35999992],
       [0.66999966],
       [0.27      ],
       [0.44999984],
       [0.9499994 ],
       [0.42999986],
       [0.9699994 ],
       [0.        ],
       [0.98999935],
       [0.05999999],
       [0.41999987],
       [0.10999998],
       [0.        ],
       [0.08999999],
       [0.5099998 ],
       [0.29      ],
       [0.5999997 ],
       [0.05999999],
       [0.22000003],
       [0.14999999],
       [0.        ],
       [0.46999982],
       [0.05999999],
       [0.7099996 ],
       [0.        ],
       [0.99999934],
       [0.99999934],
       [0.08999999],
       [0.32999995],
       [0.05 

In [30]:
# Transform the predictions to values of 0 and 1
predictions = np.where(predictions <= 0.5, 0, 1)
predictions_df = pd.DataFrame(predictions, columns=['Survived'])  # Create a DataFrame with the predictions
predictions_df

Unnamed: 0,Survived
0,0
1,0
2,0
3,0
4,0
...,...
413,0
414,1
415,0
416,0


In [31]:
results = pd.read_csv('data/gender_submission.csv')  # Read the sample submission file
results.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [32]:
# Count the number of the correct predictions
correct_predictions = (results['Survived'] == predictions_df['Survived']).sum()

# Count the total number of the sample predictions
total_predictions = len(predictions_df)

accuracy = correct_predictions / total_predictions  # Calculate the accuracy
print(f"Accuracy: {accuracy:.2%}")  # Print the accuracy as a percentage

Accuracy: 90.43%


In [33]:
correct_predictions

np.int64(378)