# Solution: Challenge Week 04

## Libraries and settings

In [None]:
# Libraries
import os
import json
import numpy as np
import pandas as pd
from tabulate import tabulate

# Read the Kaggle API credentials from the local 'kaggle.json' file and
# publish them as environment variables.
# NOTE(review): presumably this has to run before the kaggle import below,
# because the Kaggle client is expected to pick up these variables - confirm.
with open('kaggle.json') as f:
    data = json.load(f)

os.environ.update({
    'KAGGLE_USERNAME': data['username'],
    'KAGGLE_KEY': data['key'],
})

from kaggle.api.kaggle_api_extended import KaggleApi

# Silence all warnings for the remainder of the session
import warnings
warnings.simplefilter('ignore')

# Print the directory against which relative file paths are resolved
print(os.getcwd())

## Use the Kaggle Web API to download the Titanic data set

In [None]:
# Create the Kaggle API client and log in with the configured credentials
api = KaggleApi()
api.authenticate()

# Fetch the single CSV file of the 'yasserh/titanic-dataset' data set
api.dataset_download_file('yasserh/titanic-dataset', 'Titanic-Dataset.csv')

# Load the downloaded CSV into a pandas data frame (comma is the default
# separator of read_csv) and display it
df = pd.read_csv('Titanic-Dataset.csv')
df

## Identify the data types in the Titanic data set

In [None]:
# Show the pandas data type (dtype) of every column in the data frame
df.dtypes

## Transform variable 'Sex' (Gender) to a matrix with (0/1) values

In [None]:
# Use the get_dummies() method from pandas for conversion.
# The 'Sex' column is replaced by one indicator column per category
# (drop_first=False keeps all categories). dtype=int is passed explicitly
# because recent pandas versions return boolean (True/False) dummy columns
# by default, whereas the task asks for 0/1 values.
df_02 = pd.get_dummies(df, columns=['Sex'], drop_first=False, dtype=int)
df_02

## Create a subset of the Titanic data which includes only passengers who survived AND who were either:
- female and older than 45 years, OR
- male and younger than 20 years

In [None]:
# Create subset: survivors who are either women older than 45 years or men
# younger than 20 years. The age limits are strict ('older than' / 'younger
# than'), so passengers aged exactly 45 or exactly 20 are not selected.
# Rows with a missing 'Age' compare as False and are therefore excluded.
survived = df['Survived'] == 1
female_over_45 = (df['Sex'] == 'female') & (df['Age'] > 45)
male_under_20 = (df['Sex'] == 'male') & (df['Age'] < 20)

df_sub = df.loc[survived & (female_over_45 | male_under_20)].reset_index(drop=True)

# Show subset
print(tabulate(df_sub, headers=list(df_sub.columns)))

## Answer the question: How many passengers were selected?

In [None]:
# Report the number of rows in the subset with an f-string
print(f'Using the selection above, a number of {len(df_sub)} passengers were selected.')