## Data Munging Solutions

In [23]:
import pandas as pd
import numpy as np

In [24]:
# Import data and investigate
data = pd.read_csv('train.csv')
print data.head()
print data.info()
print data.describe()

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c

In [4]:
# Convert Sex into boolean value
data.Sex = data.Sex.replace(['male','female'],[True,False])
print data[['Name','Sex']].head()

                                                Name    Sex
0                            Braund, Mr. Owen Harris   True
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  False
2                             Heikkinen, Miss. Laina  False
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  False
4                           Allen, Mr. William Henry   True


In [5]:
# Inpute missing values with average age
avg_age = data.Age.mean()
data.Age = data.Age.fillna(avg_age)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null bool
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: bool(1), float64(2), int64(5), object(4)
memory usage: 77.5+ KB


In [6]:
pclass = pd.get_dummies(data.Pclass, prefix = 'Pclass')
print pclass.head()

   Pclass_1  Pclass_2  Pclass_3
0       0.0       0.0       1.0
1       1.0       0.0       0.0
2       0.0       0.0       1.0
3       1.0       0.0       0.0
4       0.0       0.0       1.0


In [7]:
# Two ways to merge data (both have same results)
data = pd.merge(data,pclass,left_index=True,right_index=True)
# data = data.merge(pclass,left_index=True,right_index=True)
print data.head()

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name    Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris   True  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  False  38.0      1   
2                             Heikkinen, Miss. Laina  False  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  False  35.0      1   
4                           Allen, Mr. William Henry   True  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  Pclass_1  Pclass_2  \
0      0         A/5 21171   7.2500   NaN        S       0.0       0.0   
1      0          PC 17599  71.2833   C85        C       1.0       0.0   
2      0  STON/O2. 3101282   7.9250   NaN        S       0.0       0.0   
3      0            113803  53

In [33]:
# data.Name.apply(lambda x: x.split(','))
# data.Name.apply(lambda x: x.split(',')[1])
# data.Name.apply(lambda x: x.split(',')[1].split('.'))
# data.Name.apply(lambda x: x.split(',')[1].split('.')[0])
data.Name.apply(lambda x: x.split(',')[1].split('.')[0]).value_counts()

 Mr              517
 Miss            182
 Mrs             125
 Master           40
 Dr                7
 Rev               6
 Major             2
 Col               2
 Mlle              2
 Jonkheer          1
 Ms                1
 Sir               1
 Don               1
 Mme               1
 Capt              1
 Lady              1
 the Countess      1
Name: Name, dtype: int64

In [10]:
# Extract titles from names
data['Title'] = data.Name.apply(lambda x: x.split(',')[1].split('.')[0])

In [7]:
# Write DataFrame to csv file
data.to_csv('clean_data.csv',index=False)

In [35]:
# Example for unique() and value_counts()
print 'data.Pclass.unique():'
print data.Pclass.unique()
print
print 'data.Pclass.value_counts():'
print data.Pclass.value_counts(True)

data.Pclass.unique():
[3 1 2]

data.Pclass.value_counts():
3    0.551066
1    0.242424
2    0.206510
Name: Pclass, dtype: float64


### Clean data and impute ages for men and women separately

In [17]:
data2 = pd.read_csv('train.csv')
data2.Sex = data2.Sex.replace(['male','female'],[True,False])

In [19]:
# Inpute missing values with average age by sex
avg_age_men = data2.Age[data2.Sex==1].mean()
avg_age_women = data2.Age[data2.Sex==0].mean()

data2.Age[data2.Sex==1] = data2.Age[data2.Sex==1].fillna(avg_age_men)
data2.Age[data2.Sex==0] = data2.Age[data2.Sex==0].fillna(avg_age_women)

# data2.Age[data2.Sex==1].fillna(avg_age_men, inplace=True)
# data2.Age[data2.Sex==0].fillna(avg_age_women, inplace=True)

# Check for null values for men and women
print data2[(data2.Sex==1)&(data2.Age.isnull())]
print data2[(data2.Sex==0)&(data2.Age.isnull())]

Empty DataFrame
Columns: [PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]
Index: []
Empty DataFrame
Columns: [PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]
Index: []


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
pclass = pd.get_dummies(data2.Pclass, prefix = 'Pclass')
data2 = pd.merge(data2,pclass,left_index=True, right_index=True)

In [12]:
data.to_csv('clean_data2.csv', index=False)