In [1]:
# Import libraries
import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Read the pickle file from the previous notebook
df = pd.read_pickle('df_reduced.pkl')
df.head(5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,CabinReduced
0,1.0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO",B
1,1.0,1,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON",C
2,1.0,0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",C
3,1.0,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",C
4,1.0,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",C


The 'train_test_split' function from the 'Scikit Learn' Python library splits arrays or matrices into random train and test subsets.
It takes parametres such as:
- *arrays - lists/matrices/arrays/dataframes (usually X and y)
- test_size - the proportion of the dataset to include in the test split
- train_size - the absolute number of train samples (usually test_size is enough)
- random_state - a seed value used by the random number generator

In [3]:
# Create a list with three columns - sex, cabin and CabinReduced
col_name = ['sex', 'cabin', 'CabinReduced']

# Select columns given in the 'col_name' list from the dataframe and set the independent variable
X = df[col_name]

# Set the dependent variable 0 the 'survived' column
y = df['survived']

# Print X and y combined in one preview DataFrame
preview = X.head(10).copy()
preview['survived'] = y.head(10).values
print(preview)

      sex    cabin CabinReduced survived
0  female       B5            B        1
1    male  C22 C26            C        1
2  female  C22 C26            C        0
3    male  C22 C26            C        0
4  female  C22 C26            C        0
5    male      E12            E        1
6  female       D7            D        1
7    male      A36            A        0
8  female     C101            C        1
9    male     None            N        0


In [4]:
# Split the data into training and test sets
# Set the test_size as 30% of all data and the random_state equal to 0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Display the dimensions of the datasets
datasets = [X_train, X_test, y_train, y_test]
names = ['X_train', 'X_test', 'y_train', 'y_test']

for name, dataset in zip(names, datasets):
    print(f'{name} shape: {dataset.shape}')

X_train shape: (916, 3)
X_test shape: (393, 3)
y_train shape: (916,)
y_test shape: (393,)


The datasets have the following shapes:
- X_train: 916 rows, 3 columns - 916 training samples with 3 features ('sex', 'cabin' and 'CabinReduced')
- X_test: 393 rows, 3 columns - 393 test samples with 3 features ('sex', 'cabin' and 'CabinReduced')
- y_train: 916 rows, 1 column - 916 labels (whether the person survived)
- y_test: 393 rows, 1 column - 393 labels (whether the person survived)

X_train and X_test are DataFrames, while y_train and y_test are Series.

In [5]:
# Check the unique labels
for col in col_name:
    unique_test = [x for x in X_test[col].unique() if x not in X_train[col].unique()]
    print(f'"{col}": {len(unique_test)} unique values in test set') 

"sex": 0 unique values in test set
"cabin": 36 unique values in test set
"CabinReduced": 0 unique values in test set


For the 'cabin' feature, 36 values appear in the test set but not in the train set. 
For the 'CabinReduced' and 'sex' features there are no labels that appear in the test set and do not appear in the train set.

In [6]:
# Encode categorical values
def map_values(col):
    values_to_map = pd.concat([X_train[col], X_test[col]]).dropna().unique()
    map_dict = {val: i for i, val in enumerate(values_to_map)}
    return map_dict

# Create mappings
cabin_mapped = map_values('cabin')
cabin_reduced_mapped = map_values('CabinReduced')
sex_mapped = map_values('sex')

In [7]:
# Replace the original labels with mapped values
X_train['cabin_map'] = X_train['cabin'].map(cabin_mapped)
X_test['cabin_map'] = X_test['cabin'].map(cabin_mapped)

X_train['CabinReduced_map'] = X_train['CabinReduced'].map(cabin_reduced_mapped)
X_test['CabinReduced_map'] = X_test['CabinReduced'].map(cabin_reduced_mapped)

X_train['sex_map'] = X_train['sex'].map(sex_mapped)
X_test['sex_map'] = X_test['sex'].map(sex_mapped)

In [8]:
# Count missing values
print("Missing values in X_train:\n")
print(X_train[['cabin_map', 'CabinReduced_map', 'sex_map']].isnull().sum())

print("\n\nMissing values in X_test:")
print(X_test[['cabin_map', 'CabinReduced_map', 'sex_map']].isnull().sum())

Missing values in X_train:

cabin_map           702
CabinReduced_map      0
sex_map               0
dtype: int64


Missing values in X_test:
cabin_map           312
CabinReduced_map      0
sex_map               0
dtype: int64


In [9]:
# Replace missing values with 0
X_train[['cabin_map', 'CabinReduced_map', 'sex_map']] = X_train[['cabin_map', 'CabinReduced_map', 'sex_map']].fillna(0)
X_test[['cabin_map', 'CabinReduced_map', 'sex_map']] = X_test[['cabin_map', 'CabinReduced_map', 'sex_map']].fillna(0)

Replacing missing values with 0 is a simple and commonly used solution, but it is not always the best one. In some cases it is better to use, for example, mean or mode imputation, or model-based imputation.

In [10]:
# Compare number of unique values  in test set and train set
results = []

for col in ['cabin_map', 'CabinReduced_map', 'sex_map']:
    train_unique = len(X_train[col].unique())
    test_unique = len(X_test[col].unique())
    results.append({'Feature': col, 'Train Unique': train_unique, 'Test Unique': test_unique})

# Create a DataFrame from the results
unique_values_df = pd.DataFrame(results)

# Display the DataFrame
print(unique_values_df)

            Feature  Train Unique  Test Unique
0         cabin_map           150           71
1  CabinReduced_map             9            8
2           sex_map             2            2


In [12]:
 # The train set before mapping
before_mapping_train = {
    'cabin': len(X_train['cabin'].unique()),
    'CabinReduced': len(X_train['CabinReduced'].unique()),
    'sex': len(X_train['sex'].unique())
}

# The test set before mapping
before_mapping_test = {
    'cabin': len(X_test['cabin'].unique()),
    'CabinReduced': len(X_test['CabinReduced'].unique()),
    'sex': len(X_test['sex'].unique())
}

# The train set after mapping
after_mapping_train = {
    'cabin_map': len(X_train['cabin_map'].unique()),
    'CabinReduced_map': len(X_train['CabinReduced_map'].unique()),
    'sex_map': len(X_train['sex_map'].unique())
}

# The test set after mapping
after_mapping_test = {
    'cabin_map': len(X_test['cabin_map'].unique()),
    'CabinReduced_map': len(X_test['CabinReduced_map'].unique()),
    'sex_map': len(X_test['sex_map'].unique())
}

results = [
    ['cabin', before_mapping_train['cabin'], before_mapping_test['cabin'], after_mapping_train['cabin_map'], after_mapping_test['cabin_map']],
    ['CabinReduced', before_mapping_train['CabinReduced'], before_mapping_test['CabinReduced'], after_mapping_train['CabinReduced_map'], after_mapping_test['CabinReduced_map']],
    ['sex', before_mapping_train['sex'], before_mapping_test['sex'], after_mapping_train['sex_map'], after_mapping_test['sex_map']]
]

# Create the DataFrame
unique_values_df = pd.DataFrame(results, columns=['Feature', 'Train (before)', 'Test (before)', 'Train (after)', 'Test (after)'])

# Display the DataFrame
print(unique_values_df)

        Feature  Train (before)  Test (before)  Train (after)  Test (after)
0         cabin             151             71            150            71
1  CabinReduced               9              8              9             8
2           sex               2              2              2             2


The number of unique values before and after the mapping is mostly consistent, especially for categorical features like sex and CabinReduced.
The number of unique values in the test and training sets remains almost identical before and after the transformation, suggesting the data distribution is stable across both sets.