# Titanic Dataset Part 1 - Wrangling Data

In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

%matplotlib inline
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation / FutureWarnings,
# so consider narrowing it while the notebook is still being developed.
warnings.filterwarnings('ignore')

In [66]:
# Read the Titanic training set into a DataFrame; the relative path means the CSV
# has to sit in the notebook's working directory.
data = pd.read_csv('titanic_train.csv')

In [67]:
# Summary statistics for the numeric columns; the `count` row exposes columns
# that have missing values.
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### Looks like there are 891 data entries, but Age only has 714. We could drop those 177 entries, or find a way to keep them. Let's find a way to keep them. 

In [68]:
# Compare the two candidate fill values for the missing ages.
age_summary = 'There is a mean of {}, and a median of {}. Which would be the most appropriate for this dataset?'
print(age_summary.format(data['Age'].mean(), data['Age'].median()))

There is a mean of 29.69911764705882, and a median of 28.0. Which would be the most appropriate for this dataset?


### The mean is the average of all of the ages, and the median is the middle age if we put all of the ages in order. Because the median is lower than the mean, the age distribution is skewed to the right: most passengers are on the younger side, and a smaller number of older passengers pull the mean up. However, the two values are very close, so I think either one would be a good estimator. Let's go with the mean.

In [69]:
# Impute the missing ages with the mean age.
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# data['Age']: an in-place call on a column selection acts on an intermediate object
# and, under pandas Copy-on-Write, leaves `data` unchanged.
data['Age'] = data['Age'].fillna(data['Age'].mean())

### Now that we have dealt with the numeric variables, let's take a look at the categorical variables.

In [70]:
# Column dtypes and non-null counts. Five columns have dtype object
# (Name, Sex, Ticket, Cabin, Embarked), so they are most likely categorical.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [71]:
def describe_categorical(X):
    """
    Render a summary table of the object-dtype columns of ``X``.

    Columns are picked purely by dtype, summarized with ``describe()``, and the
    resulting table is shown as HTML in the notebook output. Nothing is returned.
    """
    from IPython.display import display, HTML

    # Keep only the column labels whose dtype compares equal to 'object'.
    object_columns = [label for label, dtype in X.dtypes.items() if dtype == 'object']
    summary_table = X[object_columns].describe()
    display(HTML(summary_table.to_html()))

In [72]:
# Summarize the object-dtype columns (count, unique, top, freq) to spot missing
# values and check how many distinct values each column has.
describe_categorical(data)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Quick, Mrs. Frederick Charles (Jane Richards)",male,CA. 2343,C23 C25 C27,S
freq,1,577,7,4,644


#### We can see that there are 2 missing data entries in Embarked, and 687 missing data entries in Cabin. We can also see that there are only 2 possible entries for Sex, which we would expect. 

In [73]:
# Cabin is missing for most passengers, so instead of imputing a cabin label we
# collapse the column to a binary flag: 1 = a cabin was recorded, 0 = no cabin.
# This can tell us whether the person was wealthy enough to have a cabin.
# Missing values are mapped to the sentinel 0 first; comparing against that sentinel
# (rather than testing for NaN alone) keeps the cell safe to re-run after the column
# has already been converted to 0/1.
data['Cabin'] = data['Cabin'].fillna(0).ne(0).astype(int)

In [74]:
# Now we can do the same with Sex: encode it as a binary feature, female -> 1, male -> 0.
# Matching against both 'female' and the already-encoded value 1 makes the cell safe
# to re-run; comparing with 'female' alone would turn every row into 0 the second time.
data['Sex'] = data['Sex'].isin(['female', 1]).astype(int)

### Let's look at our dataset now. 

In [75]:
# Preview the first rows: Sex and Cabin now hold 0/1 codes instead of strings.
data.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,1,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,1,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,0,S
5,6,0,3,"Moran, Mr. James",0,29.699118,0,0,330877,8.4583,0,Q
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,1,S
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.075,0,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,347742,11.1333,0,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,237736,30.0708,0,C


## The last thing that we can do is drop some variables that we don't think would be useful*. In this case that will be the remaining categorical variables (Name, Ticket and Embarked), along with PassengerId, which is only a row identifier.

######  *These variables could be extremely useful, but for the purpose of this code we will drop them. A great practice problem would be figuring out how to use these variables in a meaningful way. 

In [76]:
# Drop the remaining text / identifier columns that we are not going to turn into features.
# Rebinding `data` (instead of inplace=True) together with errors='ignore' lets this
# cell be re-run without raising a KeyError once the columns are already gone.
data = data.drop(columns=['Name', 'Ticket', 'PassengerId', 'Embarked'], errors='ignore')

In [77]:
# Re-inspect the frame after dropping columns: every remaining column is numeric.
data.head(20)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin
0,0,3,0,22.0,1,0,7.25,0
1,1,1,1,38.0,1,0,71.2833,1
2,1,3,1,26.0,0,0,7.925,0
3,1,1,1,35.0,1,0,53.1,1
4,0,3,0,35.0,0,0,8.05,0
5,0,3,0,29.699118,0,0,8.4583,0
6,0,1,0,54.0,0,0,51.8625,1
7,0,3,0,2.0,3,1,21.075,0
8,1,3,1,27.0,0,2,11.1333,0
9,1,2,1,14.0,1,0,30.0708,0


# Now we have all numerical features and we can start building a baseline model!!!