## Titanic dataset

In [10]:
import pandas as pd
import numpy as np
import random

In [2]:
# The data can be accessed online: it is downloaded from OpenML,
# so running this cell requires network access.

data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"


In [3]:
# Missing entries are encoded as the string '?'.
# Replace every cell equal to '?' by NaN so pandas treats it as missing.

data = data.replace('?', np.nan)

In [4]:
# Keep only the first cabin when several are listed
# for a passenger (e.g. 'C22 C26' -> 'C22').

def get_first_cabin(row):
    """Return the first whitespace-separated token of a cabin entry.

    Parameters
    ----------
    row : str or float
        A single value of the cabin column, e.g. 'C22 C26', or NaN
        when the cabin is missing.

    Returns
    -------
    str or float
        The first token ('C22' for 'C22 C26'), or np.nan when the value
        has no ``split`` method or contains no token.
    """
    try:
        return row.split()[0]
    except (AttributeError, IndexError):
        # AttributeError: value without .split(), such as NaN or None.
        # IndexError: empty or whitespace-only string, so no first token.
        # Anything else is a genuine error and is allowed to propagate.
        return np.nan
    
# Apply the helper to every value of the cabin column.
data['cabin'] = data['cabin'].apply(get_first_cabin)

In [5]:
# Save the processed Titanic data to a local CSV file,
# leaving out the DataFrame index.

data.to_csv('titanic.csv', index=False)

## Credit approval dataset

In [8]:
# Read the raw file; header=None because no row is used as the header.
data = pd.read_csv('crx.data', header=None)

# Build the column names A1 ... A16, following the UCI Machine
# Learning information, and attach them to the frame.
varnames = ['A{}'.format(i) for i in range(1, 17)]
data.columns = varnames

# Turn the '?' placeholders into np.nan.
data = data.replace('?', np.nan)

# Cast A2 and A14 to float in a single step.
data = data.astype({'A2': 'float', 'A14': 'float'})

# Encode the target as numbers: '+' becomes 1 and '-' becomes 0.
target_codes = {'+': 1, '-': 0}
data['A16'] = data['A16'].map(target_codes)

data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [11]:
# Add missing values at random positions.
# (This will help with the demos later on).

random.seed(9001)

# Draw 100 row labels and drop repeated draws with a set.
# random.randint includes BOTH endpoints, so the upper bound has to be
# len(data) - 1: len(data) itself is not a row label of the default
# 0 .. len(data) - 1 index.
values = list({random.randint(0, len(data) - 1) for _ in range(100)})

for var in ['A3', 'A8', 'A9', 'A10']:
    data.loc[values, var] = np.nan


data.isnull().sum()

A1     12
A2     12
A3     92
A4      6
A5      6
A6      9
A7      9
A8     92
A9     92
A10    92
A11     0
A12     0
A13     0
A14    13
A15     0
A16     0
dtype: int64

In [12]:
# Add missing values at random positions.
# (This will help with the demos later on).
# NOTE: this cell repeats the previous "add missing values" step with the
# same seed. Re-seeding makes the draw repeatable, so this re-run is not
# expected to add further NaNs (the counts below match the previous cell's).

random.seed(9001)

# Draw 100 row labels and drop repeated draws with a set.
# random.randint includes BOTH endpoints, so the upper bound has to be
# len(data) - 1: len(data) itself is not a row label of the default
# 0 .. len(data) - 1 index.
values = list({random.randint(0, len(data) - 1) for _ in range(100)})

for var in ['A3', 'A8', 'A9', 'A10']:
    data.loc[values, var] = np.nan


data.isnull().sum()

A1     12
A2     12
A3     92
A4      6
A5      6
A6      9
A7      9
A8     92
A9     92
A10    92
A11     0
A12     0
A13     0
A14    13
A15     0
A16     0
dtype: int64

In [13]:
# Save the data, including the NaN values added above, to a local
# CSV file, leaving out the DataFrame index.

data.to_csv('creditApprovalUCI.csv', index=False)


In [14]:
# Preview the first rows of the saved data.
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [15]:
# Categorical variables: the columns stored with object dtype.

cat_cols = [name for name, kind in data.dtypes.items() if kind == 'O']

data[cat_cols].head()

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13
0,b,u,g,w,v,t,t,f,g
1,a,u,g,q,h,t,t,f,g
2,a,u,g,q,h,,,f,g
3,b,u,g,w,v,t,t,t,g
4,b,u,g,w,v,t,f,f,s


In [16]:
# Numerical variables: every column whose dtype is not object.

num_cols = [name for name, kind in data.dtypes.items() if kind != 'O']

data[num_cols].head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,A16
0,30.83,0.0,1.25,1,202.0,0,1
1,58.67,4.46,3.04,6,43.0,560,1
2,24.5,,,0,280.0,824,1
3,27.83,1.54,3.75,5,100.0,3,1
4,20.17,5.625,1.71,0,120.0,0,1
