# NumPy data importing and manipulation

In [1]:
import csv
import numpy as np

In [2]:
# Read every row of train.csv (minus the header) into a list of string lists.
# The context manager closes the file handle as soon as the rows are loaded, and
# the builtin next() skips the header on both Python 2.6+ and Python 3
# (reader.next() exists only on Python 2).
# NOTE: mode 'rb' is what Python 2's csv module expects; on Python 3 open the
# file in text mode with newline='' instead.
with open('train.csv', 'rb') as csv_file:
    csv_file_object = csv.reader(csv_file)
    next(csv_file_object)  # skip header
    data = list(csv_file_object)

In [3]:
# Peek at the first parsed row: at this point `data` is still a regular
# Python list of rows (just a big one), and each row is a list of strings.
print(data[0])

['1', '0', '3', 'Braund, Mr. Owen Harris', 'male', '22', '1', '0', 'A/5 21171', '7.25', '', 'S']


In [4]:
# Rebind `data` from the list of rows to a NumPy array built from it, so the
# cells below can use array slicing and boolean-mask indexing.
data = np.array(data)

In [5]:
# Print the whole array; NumPy abbreviates large arrays with '...', so only the
# first and last few rows and columns are shown.
print(data)

[['1' '0' '3' ..., '7.25' '' 'S']
 ['2' '1' '1' ..., '71.2833' 'C85' 'C']
 ['3' '1' '3' ..., '7.925' '' 'S']
 ..., 
 ['889' '0' '3' ..., '23.45' '' 'S']
 ['890' '1' '1' ..., '30' 'C148' 'C']
 ['891' '0' '3' ..., '7.75' '' 'Q']]


In [6]:
# Note that every cell was imported as a string, including the numeric-looking
# ones, so they must be converted before doing arithmetic on them.
print(data[0])

['1' '0' '3' 'Braund, Mr. Owen Harris' 'male' '22' '1' '0' 'A/5 21171'
 '7.25' '' 'S']


In [7]:
# Column 1 is used as the survival flag. It is stored as text, so convert it to
# floats once and reuse the result for both the count and the sum.
# The builtin `float` replaces `np.float`, an alias for the same type that was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
survived_flags = data[:, 1].astype(float)
number_passengers = np.size(survived_flags)  # one entry per data row
number_survived = np.sum(survived_flags)  # sum of the flags
proportion_survivors = number_survived / number_passengers

In [8]:
# Preview column 4: index rows 0-4 and column 4 in a single step.
data[:5, 4]

array(['male', 'female', 'female', 'female', 'male'], 
      dtype='|S82')

In [9]:
# Comparing a column with a value gives an element-wise boolean array that can
# be used as a row filter; shown here for the first five rows of column 4.
data[:5, 4] == 'female'

array([False,  True,  True,  True, False], dtype=bool)

In [10]:
# Boolean masks over column 4 (the 'male'/'female' column), one entry per row.
women_only_filter = data[:, 4] == "female"
men_only_filter = data[:, 4] == "male"

# Pick column 1 (used as the survival flag, stored as text) for each group and
# convert it to floats. The builtin `float` replaces `np.float`, an alias for
# the same type that was removed in NumPy 1.24.
women_survival = data[women_only_filter, 1].astype(float)
men_survival = data[men_only_filter, 1].astype(float)

# Proportion = sum of the flags in the group / number of rows in the group.
proportion_women_survived = np.sum(women_survival) / np.size(women_survival)
proportion_men_survived = np.sum(men_survival) / np.size(men_survival)

print('Proportion of women who survived is {}'.format(proportion_women_survived))
print('Proportion of men who survived is {}'.format(proportion_men_survived))

Proportion of women who survived is 0.742038216561
Proportion of men who survived is 0.188908145581
