In [1]:
import os
import pandas as pd
import scipy as sp
from pathlib import Path

data_path = Path("data/")
output = Path("output/")

In [2]:
from functools import wraps

def answer(func):
    try:
        counter = answer.cache[func.__name__]
    except AttributeError:
        answer.cache = {func.__name__: 1}
        counter = 1
    except KeyError:
        counter = len(answer.cache) + 1
        answer.cache[func.__name__] = counter
        
        
    output_file = output/(str(counter).zfill(2) + "_" + func.__name__ + ".dat")

    @wraps(func)
    def wrapped(*args, **kwargs):
        with open(output_file, "w") as f:
            func(f)
        
        print(f"*** OUTPUT ({output_file}) ***")
        with open(output_file, "r") as f:
            print(f.read())
    return wrapped

In [3]:
data = pd.read_csv(data_path/"train.csv")
os.makedirs(output, exist_ok=True)

In [4]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
import re
from collections import defaultdict

Find number of males and females on the ship

In [6]:
@answer
def sexcount(fout):
    counts = data.groupby("Sex")["PassengerId"].count()
    display(counts)
    fout.write(f"{counts['male']} {counts['female']}")
sexcount()

Sex
female    314
male      577
Name: PassengerId, dtype: int64

*** OUTPUT (output/01_sexcount.dat) ***
577 314


Percent of people who were saved

In [7]:
@answer
def alivepart(fout):
    counts = data.groupby("Survived")["PassengerId"].count()
    display(counts)
    fout.write(str(sp.round_(counts[1]/counts.sum()*100, 2)))
alivepart()

Survived
0    549
1    342
Name: PassengerId, dtype: int64

*** OUTPUT (output/02_alivepart.dat) ***
38.38


Number of passengers travelled in first class

In [8]:
@answer
def firstclass(fout):
    class_counts = data.groupby("Pclass")["PassengerId"].count()
    display(class_counts)
    fout.write(str(sp.round_(class_counts[1]/class_counts.sum()*100, 2)))
firstclass()

Pclass
1    216
2    184
3    491
Name: PassengerId, dtype: int64

*** OUTPUT (output/03_firstclass.dat) ***
24.24


Find mean and median age of travellers

In [24]:
@answer
def age(fout):
    # mean
    mean = sp.round_(sp.mean(data["Age"].dropna()), 2)
    
    # median
    median = sp.round_(sp.median(data["Age"].dropna()), 2)
    
    fout.write(f"{mean:.2f} {median:.2f}")
age()

*** OUTPUT (output/04_age.dat) ***
29.70 28.00


Pearson correleation of parent-child and sibling-sibling occurances

In [10]:
@answer
def corrsib(fout):
    corr = data.corr().loc["Parch", "SibSp"]
    corr = sp.round_(corr, 2)
    fout.write(f"{corr}")
corrsib()

*** OUTPUT (output/05_corrsib.dat) ***
0.41


In [11]:
@answer
def femname(fout):
    re_names = re.compile("(?:Mrs|Miss)\.? ?([^\(]*)\s(?:\(([^\)]+)\))?")
    def get_names(s):
        parts = re_names.search(s)
        if parts:
            if parts.group(2) is None:
                return parts.group(1).replace('"', "").strip()
            else:
                names = parts.group(2).replace('"', "").strip().split(" ")
                if len(names) > 1:
                    return " ".join(names[:-1])
                else:
                    return names[0]
        return False
    
    names = defaultdict(lambda: 0)
    
    for longname in data["Name"]:
        parsed = get_names(longname)
        parsed = [] if not parsed else parsed.split(" ")
        for name in parsed:
            names[name] += 1
            
    df = pd.DataFrame.from_dict(names, orient="index", columns=["Number"])\
                     .sort_values("Number", ascending=False)
    display(df.head())
    
    fout.write(df.index.values[0])
                
femname()

Unnamed: 0,Number
Anna,13
Mary,12
Margaret,10
Elizabeth,9
Maria,7


*** OUTPUT (output/06_femname.dat) ***
Anna
