In [1]:
# Libraries
import warnings
import os
import shutil
import json
from pathlib import Path

import numpy as np
import pandas as pd
from tabulate import tabulate

# Ignore warnings
# NOTE(review): the 'ignore' action is installed without a category or module
# restriction, so every warning raised after this line is silenced (unless a
# later filter overrides it) -- including deprecation warnings from the
# libraries imported above.
warnings.filterwarnings('ignore')

# Find repository root (so kaggle.json is global/top-level)
def _find_repo_root(start: Path) -> Path:
    for parent in [start, *start.parents]:
        if (parent / 'requirements.txt').exists() or (parent / '.git').exists():
            return parent
    return start

# Look for kaggle.json directly inside the repository root that is detected
# by walking upwards from the current working directory.
repo_root = _find_repo_root(Path.cwd())
kaggle_json_path = repo_root.joinpath('kaggle.json')
if not kaggle_json_path.exists():
    # Fail early and show the exact location where the file is expected.
    raise FileNotFoundError(
        "Missing kaggle.json in the repository root. Put your Kaggle credentials file at: "
        f"{kaggle_json_path}"
    )

# Parse the credentials file (UTF-8 encoded JSON)
with kaggle_json_path.open(encoding='utf-8') as f:
    data = json.load(f)

# Export both credential fields as environment variables, username first
for env_name, json_field in (('KAGGLE_USERNAME', 'username'), ('KAGGLE_KEY', 'key')):
    os.environ[env_name] = data[json_field]

# NOTE(review): this import deliberately sits below the credential setup instead
# of with the other imports -- presumably because the kaggle package looks for
# credentials while it is being imported. Confirm before moving it to the top.
from kaggle.api.kaggle_api_extended import KaggleApi

# Show current working directory
# (the relative './data' paths used in later cells are resolved against it)
print(os.getcwd())

/workspaces/scientific_programming/Week_02/challenge


In [2]:
# Initialize API
# Create the Kaggle client and authenticate it. The first cell exported
# KAGGLE_USERNAME / KAGGLE_KEY into os.environ; presumably authenticate()
# picks the credentials up from there -- confirm against the kaggle package docs.
api = KaggleApi()
api.authenticate()

In [3]:
# Fetch the single CSV file of the Titanic dataset from Kaggle into ./data
api.dataset_download_file(
    'yasserh/titanic-dataset',
    'Titanic-Dataset.csv',
    path='./data',
)

# Load the downloaded CSV (comma-separated, the pandas default) and display it
df = pd.read_csv('./data/Titanic-Dataset.csv')
df

Dataset URL: https://www.kaggle.com/datasets/yasserh/titanic-dataset


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# One-hot encode the 'Sex' column: one 'Sex_<value>' indicator column per
# category, with no category dropped
sex_binary = pd.get_dummies(df['Sex'], prefix='Sex')
print(sex_binary)

     Sex_female  Sex_male
0         False      True
1          True     False
2          True     False
3          True     False
4         False      True
..          ...       ...
886       False      True
887        True     False
888        True     False
889       False      True
890       False      True

[891 rows x 2 columns]


In [5]:
# Select survivors who are either females older than 45 or males younger than 20.
# Rows with a missing Age fail both age comparisons and are therefore excluded.
survived = df['Survived'].eq(1)
females_over_45 = df['Sex'].eq('female') & df['Age'].gt(45)
males_under_20 = df['Sex'].eq('male') & df['Age'].lt(20)
subset = df.loc[survived & (females_over_45 | males_under_20)]

print(f"Subset shape: {subset.shape}")
print(subset)

Subset shape: (52, 12)
     PassengerId  Survived  Pclass  \
11            12         1       1   
15            16         1       2   
52            53         1       1   
78            79         1       2   
125          126         1       3   
165          166         1       3   
183          184         1       2   
193          194         1       2   
195          196         1       1   
204          205         1       3   
220          221         1       3   
226          227         1       2   
259          260         1       2   
261          262         1       3   
268          269         1       1   
275          276         1       1   
283          284         1       3   
299          300         1       1   
305          306         1       1   
340          341         1       2   
348          349         1       3   
366          367         1       1   
407          408         1       2   
445          446         1       1   
458          459         1 

In [6]:
# The number of selected passengers equals the row count of the subset
num_passengers = subset.shape[0]
print(f"Number of passengers selected: {num_passengers}")

Number of passengers selected: 52
