## Create Data Set for Clean Up Practice
The data set is created from the [Kaggle Titanic dataset][titanic]. The original dataset is a single file. I'll split it into 3 files, one for each port of embarkation. Each file will have slightly different column names and currency/datetime formats. The datetime (birthday) column is randomly generated from the Age column in the original file.

[titanic]:https://www.kaggle.com/c/titanic

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the source Titanic CSV and preview the first rows.
df_titanic = pd.read_csv('titanic.csv')
df_titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Rename by original column name rather than by position, so a change in the
# CSV's column order or count cannot silently attach a label to the wrong data.
# Columns not listed here keep their original names.
df_titanic = df_titanic.rename(columns={
    'PassengerId': 'Passenger ID',
    'Pclass': 'Passenger Class',
    'SibSp': 'Sibling Spouse',
    'Parch': 'Parent Children',
})

### Create Birthday Column
Randomly generate a birthday based on the Age column. The tragedy happened on April 15, 1912.

In [4]:
# Drop the passenger identifier column. `columns=` already names the axis, so
# no `axis` argument is needed; rebinding avoids mutating the frame in place.
df_titanic = df_titanic.drop(columns=['Passenger ID'])

In [5]:
# Summary statistics for the numeric columns.
df_titanic.describe()

Unnamed: 0,Survived,Passenger Class,Age,Sibling Spouse,Parent Children,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Inspect the distinct Age values (the output includes NaN and fractional ages).
df_titanic.Age.unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [7]:
from datetime import timedelta
from random import randrange
def gen_birthday(age, rng=None):
    """Generate a synthetic birthday for a passenger of the given age.

    The date is counted back from the sinking date, April 15, 1912, treating
    one year as 365 days.

    * ``age < 1``: deterministic, exactly ``int(age * 365)`` days before the
      sinking date.
    * ``age >= 1``: a random date between ``int(age * 365) - 360`` and
      ``int(age * 365) - 1`` days (both inclusive) before the sinking date.

    Args:
        age: Age in years; may be fractional. Must not be NaN, because
            ``int()`` cannot convert NaN, so filter missing ages first.
        rng: Optional object exposing ``randrange(start, stop)``, such as a
            seeded ``random.Random``, to make the output reproducible. When
            omitted, the module-level ``randrange`` is used.

    Returns:
        datetime: The generated birthday.
    """
    sunk_date = datetime(1912, 4, 15)
    days = int(age * 365)
    if age < 1:
        return sunk_date - timedelta(days=days)
    # Fall back to the shared module-level generator when no rng is supplied.
    pick_offset = randrange if rng is None else rng.randrange
    # NOTE(review): the window ends before `age * 365` days back, so the elapsed
    # time to the sinking is always a little under `age` years; confirm intended.
    return sunk_date - timedelta(days=pick_offset(days - 360, days))

In [8]:
# Rows with a missing Age get NaT; gen_birthday is only called on real ages.
# (The `np.NaN` alias used previously no longer exists in NumPy 2.0.)
# pd.to_datetime pins the column to a datetime dtype instead of relying on
# whatever dtype `apply` happens to infer.
df_titanic['Birthday'] = pd.to_datetime(
    df_titanic.Age.apply(lambda x: pd.NaT if pd.isnull(x) else gen_birthday(x))
)
df_titanic.sample(10)

Unnamed: 0,Survived,Passenger Class,Name,Sex,Age,Sibling Spouse,Parent Children,Ticket,Fare,Cabin,Embarked,Birthday
621,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42.0,1,0,11753,52.5542,D19,S,1870-12-12
614,0,3,"Brocklebank, Mr. William Alfred",male,35.0,0,0,364512,8.05,,S,1877-11-04
645,1,1,"Harper, Mr. Henry Sleeper",male,48.0,1,0,PC 17572,76.7292,D33,C,1864-05-02
472,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33.0,1,2,C.A. 34651,27.75,,S,1879-11-21
708,1,1,"Cleaver, Miss. Alice",female,22.0,0,0,113781,151.55,,S,1890-12-10
412,1,1,"Minahan, Miss. Daisy E",female,33.0,1,0,19928,90.0,C78,Q,1879-08-31
232,0,2,"Sjostedt, Mr. Ernst Adolf",male,59.0,0,0,237442,13.5,,S,1853-07-09
377,0,1,"Widener, Mr. Harry Elkins",male,27.0,0,2,113503,211.5,C82,C,1885-06-15
403,0,3,"Hakkarainen, Mr. Pekka Pietari",male,28.0,1,0,STON/O2. 3101279,15.85,,S,1884-06-29
761,0,3,"Nirva, Mr. Iisakki Antino Aijo",male,41.0,0,0,SOTON/O2 3101272,7.125,,S,1871-12-08


In [9]:
# Count missing values per column.
df_titanic.isnull().sum()

Survived             0
Passenger Class      0
Name                 0
Sex                  0
Age                177
Sibling Spouse       0
Parent Children      0
Ticket               0
Fare                 0
Cabin              687
Embarked             2
Birthday           177
dtype: int64

In [10]:
# Spot-check five random rows where Age is missing.
df_titanic[df_titanic.Age.isnull()].sample(5)

Unnamed: 0,Survived,Passenger Class,Name,Sex,Age,Sibling Spouse,Parent Children,Ticket,Fare,Cabin,Embarked,Birthday
95,0,3,"Shorney, Mr. Charles Joseph",male,,0,0,374910,8.05,,S,NaT
859,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C,NaT
295,0,1,"Lewy, Mr. Ervin G",male,,0,0,PC 17612,27.7208,,C,NaT
832,0,3,"Saad, Mr. Amin",male,,0,0,2671,7.2292,,C,NaT
653,1,3,"O'Leary, Miss. Hanora ""Norah""",female,,0,0,330919,7.8292,,Q,NaT


### Split to 3 Files
C = Cherbourg, Q = Queenstown, S = Southampton

In [11]:
# Take independent copies so the per-port frames can be modified later without
# pandas treating them as slices of df_titanic (SettingWithCopyWarning).
# Rows whose Embarked value is missing match none of the three filters and are
# therefore left out of every per-port frame.
df_c = df_titanic[df_titanic.Embarked == 'C'].copy()
df_q = df_titanic[df_titanic.Embarked == 'Q'].copy()
df_s = df_titanic[df_titanic.Embarked == 'S'].copy()
print(df_c.shape[0], df_q.shape[0], df_s.shape[0])

168 77 644


In [12]:
# Column dtypes and non-null counts for the Cherbourg subset.
df_c.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168 entries, 1 to 889
Data columns (total 12 columns):
Survived           168 non-null int64
Passenger Class    168 non-null int64
Name               168 non-null object
Sex                168 non-null object
Age                130 non-null float64
Sibling Spouse     168 non-null int64
Parent Children    168 non-null int64
Ticket             168 non-null object
Fare               168 non-null float64
Cabin              69 non-null object
Embarked           168 non-null object
Birthday           130 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(4), object(5)
memory usage: 17.1+ KB


In [13]:
# Column dtypes and non-null counts for the Queenstown subset.
df_q.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77 entries, 5 to 890
Data columns (total 12 columns):
Survived           77 non-null int64
Passenger Class    77 non-null int64
Name               77 non-null object
Sex                77 non-null object
Age                28 non-null float64
Sibling Spouse     77 non-null int64
Parent Children    77 non-null int64
Ticket             77 non-null object
Fare               77 non-null float64
Cabin              4 non-null object
Embarked           77 non-null object
Birthday           28 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(4), object(5)
memory usage: 7.8+ KB


In [14]:
# Column dtypes and non-null counts for the Southampton subset.
df_s.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 644 entries, 0 to 888
Data columns (total 12 columns):
Survived           644 non-null int64
Passenger Class    644 non-null int64
Name               644 non-null object
Sex                644 non-null object
Age                554 non-null float64
Sibling Spouse     644 non-null int64
Parent Children    644 non-null int64
Ticket             644 non-null object
Fare               644 non-null float64
Cabin              129 non-null object
Embarked           644 non-null object
Birthday           554 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(4), object(5)
memory usage: 65.4+ KB


In [15]:
def get_birthday_str(birthday, format_str):
    """Format a birthday as text, passing missing values through as NaN.

    Args:
        birthday: A datetime-like object exposing ``strftime``, or a missing
            value (``None``, ``NaN`` or ``NaT``).
        format_str: ``strftime`` format string, e.g. ``'%Y-%m-%d'``.

    Returns:
        The formatted date string, or ``np.nan`` when ``birthday`` is missing.
    """
    if pd.isnull(birthday):
        # `np.nan` is the supported spelling; the `np.NaN` alias was removed in NumPy 2.0.
        return np.nan
    return birthday.strftime(format_str)

In [16]:
# Cherbourg file: birthdays rendered like "Mar 10, 1864".
# Build a new frame rather than assigning into / dropping from the filtered
# frame in place; Age and Embarked are removed from the output.
df_c = (
    df_c
    .assign(Birthday=df_c.Birthday.apply(lambda x: get_birthday_str(x, '%b %d, %Y')))
    .drop(columns=['Age', 'Embarked'])
)
df_c.sample(10)

Unnamed: 0,Survived,Passenger Class,Name,Sex,Sibling Spouse,Parent Children,Ticket,Fare,Cabin,Birthday
698,0,1,"Thayer, Mr. John Borland",male,1,1,17421,110.8833,C68,"Mar 10, 1864"
843,0,3,"Lemberopolous, Mr. Peter L",male,0,0,2683,6.4375,,"Nov 23, 1877"
52,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,1,0,PC 17572,76.7292,D33,"Mar 03, 1864"
373,0,1,"Ringhini, Mr. Sante",male,0,0,PC 17760,135.6333,,"Jan 16, 1891"
716,1,1,"Endres, Miss. Caroline Louise",female,0,0,PC 17757,227.525,C45,"Nov 30, 1874"
218,1,1,"Bazzani, Miss. Albina",female,0,0,11813,76.2917,D15,"Oct 05, 1880"
557,0,1,"Robbins, Mr. Victor",male,0,0,PC 17757,227.525,,
487,0,1,"Kent, Mr. Edward Austin",male,0,0,11771,29.7,B37,"Mar 19, 1855"
378,0,3,"Betros, Mr. Tannous",male,0,0,2648,4.0125,,"Jan 23, 1893"
659,0,1,"Newell, Mr. Arthur Webster",male,0,2,35273,113.275,D48,"Feb 13, 1855"


In [17]:
# Queenstown file: birthdays rendered like "09.13.1872" (month.day.year).
# Build a new frame rather than assigning into / dropping from the filtered
# frame in place; Age and Embarked are removed from the output.
df_q = (
    df_q
    .assign(Birthday=df_q.Birthday.apply(lambda x: get_birthday_str(x, '%m.%d.%Y')))
    .drop(columns=['Age', 'Embarked'])
)
df_q.sample(10)

Unnamed: 0,Survived,Passenger Class,Name,Sex,Sibling Spouse,Parent Children,Ticket,Fare,Cabin,Birthday
241,1,3,"Murphy, Miss. Katherine ""Kate""",female,1,0,367230,15.5,,
613,0,3,"Horgan, Mr. John",male,0,0,370377,7.75,,
5,0,3,"Moran, Mr. James",male,0,0,330877,8.4583,,
188,0,3,"Bourke, Mr. John",male,1,1,364849,15.5,,09.13.1872
657,0,3,"Bourke, Mrs. John (Catherine)",female,1,1,364849,15.5,,07.03.1880
680,0,3,"Peters, Miss. Katie",female,0,0,330935,8.1375,,
459,0,3,"O'Connor, Mr. Maurice",male,0,0,371060,7.75,,
280,0,3,"Duane, Mr. Frank",male,0,0,336439,7.75,,02.23.1848
143,0,3,"Burke, Mr. Jeremiah",male,0,0,365222,6.75,,02.07.1894
171,0,3,"Rice, Master. Arthur",male,4,1,382652,29.125,,04.30.1908


In [18]:
# Southampton file: birthdays rendered like "1901-11-07" (year-month-day).
# Build a new frame rather than assigning into / dropping from the filtered
# frame in place; Age and Embarked are removed from the output.
df_s = (
    df_s
    .assign(Birthday=df_s.Birthday.apply(lambda x: get_birthday_str(x, '%Y-%m-%d')))
    .drop(columns=['Age', 'Embarked'])
)
df_s.sample(10)

Unnamed: 0,Survived,Passenger Class,Name,Sex,Sibling Spouse,Parent Children,Ticket,Fare,Cabin,Birthday
542,0,3,"Andersson, Miss. Sigrid Elisabeth",female,4,2,347082,31.275,,1901-11-07
224,1,1,"Hoyt, Mr. Frederick Maxfield",male,1,0,19943,90.0,C93,1874-06-29
724,1,1,"Chambers, Mr. Norman Campbell",male,1,0,113806,53.1,E8,1885-11-25
272,1,2,"Mellinger, Mrs. (Elizabeth Anne Maidment)",female,0,1,250644,19.5,,1872-02-21
503,0,3,"Laitinen, Miss. Kristina Sofia",female,0,0,4135,9.5875,,1875-05-07
321,0,3,"Danoff, Mr. Yoto",male,0,0,349219,7.8958,,1885-10-12
467,0,1,"Smart, Mr. John Montgomery",male,0,0,113792,26.55,,1856-06-24
774,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,1,3,29105,23.0,,1858-11-01
683,0,3,"Goodwin, Mr. Charles Edward",male,5,2,CA 2144,46.9,,1898-09-10
407,1,2,"Richards, Master. William Rowe",male,1,1,29106,18.75,,1909-11-22


### Change Fare to a String in One File

In [19]:
# Southampton only: store Fare as text with a leading dollar sign.
# `assign` returns a new frame, so nothing is written into a filtered slice.
df_s = df_s.assign(Fare='$' + df_s.Fare.astype(str))
df_s.head()

Unnamed: 0,Survived,Passenger Class,Name,Sex,Sibling Spouse,Parent Children,Ticket,Fare,Cabin,Birthday
0,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,$7.25,,1891-03-04
2,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,$7.925,,1886-12-04
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,$53.1,C123,1878-02-15
4,0,3,"Allen, Mr. William Henry",male,0,0,373450,$8.05,,1877-10-02
6,0,1,"McCarthy, Mr. Timothy J",male,0,0,17463,$51.8625,E46,1858-12-10


### Save to csv
C = Cherbourg, Q = Queenstown, S = Southampton

In [20]:
# Write one CSV per port without the index column. Only the Southampton file
# has its 'Sex' column written under the name 'Gender'.
southampton_out = df_s.rename(columns={'Sex': 'Gender'})
for csv_name, port_frame in (
    ('Titanic_Cherbourg.csv', df_c),
    ('Titanic_Queenstown.csv', df_q),
    ('Titanic_Southampton.csv', southampton_out),
):
    port_frame.to_csv(csv_name, index=False)

In [21]:
# Column names of the Cherbourg frame.
df_c.columns

Index(['Survived', 'Passenger Class', 'Name', 'Sex', 'Sibling Spouse',
       'Parent Children', 'Ticket', 'Fare', 'Cabin', 'Birthday'],
      dtype='object')

In [22]:
# Peek at the first lines of the written Cherbourg CSV (shell command).
!head Titanic_Cherbourg.csv

Survived,Passenger Class,Name,Sex,Sibling Spouse,Parent Children,Ticket,Fare,Cabin,Birthday
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,1,0,PC 17599,71.2833,C85,"May 06, 1874"
1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,1,0,237736,30.0708,,"Sep 14, 1898"
1,3,"Masselmani, Mrs. Fatima",female,0,0,2649,7.225,,
0,3,"Emir, Mr. Farred Chehab",male,0,0,2631,7.225,,
0,1,"Uruchurtu, Don. Manuel E",male,0,0,PC 17601,27.7208,,"Nov 17, 1872"
1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,1,0,PC 17569,146.5208,B78,
0,1,"Meyer, Mr. Edgar Joseph",male,1,0,PC 17604,82.1708,,"Nov 24, 1884"
1,3,"Mamee, Mr. Hanna",male,0,0,2677,7.2292,,
1,3,"Nicola-Yarred, Miss. Jamila",female,1,0,2651,11.2417,,"Oct 12, 1898"
