# Cleaning Up the Titanic Manifest dataset
The inspiration for using this dataset came from the following Kaggle competition: https://www.kaggle.com/c/titanic

The data on Kaggle requires a Kaggle account and entering the competition, but it is also publicly available at http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls


## Importing Packages

In [None]:
import numpy as np
import pandas as pd

## Downloading Dataset

In [None]:
import os
import subprocess
import requests
import tqdm

# Fetch the raw manifest once; later runs reuse the local copy.
csv_path = '../original/titanic3.csv'
if not os.path.isfile(csv_path):
    response = requests.get('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv', stream=True)
    # Fail on an HTTP error before anything is written to disk, so an error
    # page is never saved as if it were the CSV.
    response.raise_for_status()

    # Content-Length is optional; without it the progress bar runs with no
    # known total instead of crashing on int(None).
    content_length = response.headers.get('content-length')
    if content_length is None:
        total_chunks = None
    else:
        total_chunks = -(-int(content_length) // 1024)  # ceiling division

    # Write under a temporary name and rename only after the download has
    # finished: an interrupted run must not leave a truncated file that the
    # isfile() check above would accept on the next run.
    partial_path = csv_path + '.part'
    with open(partial_path, 'wb') as fid:
        for chunk in tqdm.tqdm_notebook(response.iter_content(chunk_size=1024), desc='Downloading', total=total_chunks):
            if chunk:
                fid.write(chunk)
    os.replace(partial_path, csv_path)

## Loading the Dataset

In [None]:
# Read the downloaded manifest into a DataFrame.
full_dataset = pd.read_csv(filepath_or_buffer='../original/titanic3.csv')

## Displaying the First 10 Rows of the Dataset

In [None]:
# Report the row count, then preview the first ten rows.
print('Dataset size: {}'.format(full_dataset.shape[0]))
full_dataset.head(10)

Dataset size: 1309


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S,3,,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D7,S,10,,"Hudson, NY"
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,A36,S,,,"Belfast, NI"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"


## Cleaning Up the Data

In [None]:
# Create a copy of the data.
# NOTE(review): the rest of this cell modifies `full_dataset`, not this copy;
# confirm whether `dataset` is still needed.
dataset = full_dataset.copy()

## Translate the passenger Sex column into a numeric value (0 for male and 1 for female)
def return_numeric_sex(sex):
    """Map a sex label to its numeric code.

    Returns 0 for 'male', 1 for 'female', and NaN for any other value
    (including missing entries).
    """
    for label, code in (('male', 0), ('female', 1)):
        if sex == label:
            return code
    return np.nan
## Add the numeric encoding of the sex column (unknown labels become NaN)
full_dataset['numeric_sex'] = full_dataset['sex'].apply(return_numeric_sex)

## Filter the data
full_dataset = full_dataset.query(
    ## Throw out rows with nan values (a comparison against NaN is False)
    'age >= 0 &' +
    'pclass >= 0 &' +
    'numeric_sex >= 0')

## Remove rows with non-integer age
## (.copy() makes the result an independent frame, so the column assignment
## below does not write into a slice of the previous one)
full_dataset = full_dataset.loc[np.modf(full_dataset['age'].values)[0] < 1e-6].copy()

## np.int was only an alias of the builtin int and was removed in NumPy 1.24
full_dataset['age'] = full_dataset['age'].astype(int)

print('Dataset size: {}'.format(len(full_dataset)))
full_dataset.head(10)

Dataset size: 1001


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,numeric_sex
0,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S,2,,"St Louis, MO",1
2,1,0,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",1
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",0
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",1
5,1,1,"Anderson, Mr. Harry",male,48,0,0,19952,26.55,E12,S,3,,"New York, NY",0
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S,10,,"Hudson, NY",1
7,1,0,"Andrews, Mr. Thomas Jr",male,39,0,0,112050,0.0,A36,S,,,"Belfast, NI",0
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY",1
9,1,0,"Artagaveytia, Mr. Ramon",male,71,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay",0
10,1,0,"Astor, Col. John Jacob",male,47,1,0,PC 17757,227.525,C62 C64,C,,124.0,"New York, NY",0


## Save the Clean Dataset

In [None]:
# Write the cleaned manifest to disk without the DataFrame index.
full_dataset.to_csv(path_or_buf='../../datasets/titanic_manifest.csv', index=False)