In [None]:
import unicodecsv
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# The Data

At 2:20 am on April 15, 1912, the RMS Titanic sank in the North Atlantic, hundreds of miles off the coast of Newfoundland, Canada. The Titanic was a luxury ocean liner and was carrying over 2,000 passengers from ports in England, France, and Ireland to New York City. A little over 2 hours after hitting an iceberg, the Titanic broke in half and sank, taking more than 1,500 people down with her.

The data that follows is a sample of passengers from the Titanic. The data includes passengers of all ages and ticket classes. Within this sample, there are passengers who came aboard the Titanic at Cherbourg, France; Queenstown, Ireland; and Southampton, England. I will explore this data and draw conclusions based on the following questions:

- How many survivors were there?
- What was the average price of tickets in each class?
- Which port had the highest/lowest number of 1st class passengers?
- Was there a higher survival rate among passengers from a certain port? A certain class?

In [None]:
# Load the passenger records, storing the 'Survived' flag as a boolean
titanic = pd.read_csv('titanic-data.csv').astype({'Survived': bool})
titanic.head(5)

In [None]:
# Number of sample passengers before cleaning
print (len(titanic))

# dropna() and drop_duplicates() return NEW frames, so the result has to be
# assigned back for the cleaning to take effect.

# Drop rows in which every column is missing
titanic = titanic.dropna(axis=0, how='all')

# Drop repeated passengers, keeping the first row for each name
titanic = titanic.drop_duplicates('Name')

# Number of sample passengers after cleaning
print (len(titanic))

In [None]:
# Overall spread of the fares
min_fare = titanic['Fare'].min()
max_fare = titanic['Fare'].max()
std_fare = titanic['Fare'].std()

# Edges of the fare buckets and the label given to each bucket
bins = [0, 25, 50, 75, 100, 1000]
fare_category = ['$0-$25', '$25-$50', '$50-$75', '$75-$100', 'Over $100']

# include_lowest=True closes the first bucket on the left, so a fare of exactly 0
# is labelled '$0-$25' instead of becoming NaN and dropping out of every bucket.
cut_fares = pd.cut(titanic['Fare'], bins, labels=fare_category, include_lowest=True)

# Store the bucket label next to each passenger (reuse the series computed above)
titanic['cut_fares'] = cut_fares
    
# Split the passengers into one frame per ticket class (Pclass 1, 2 and 3)
passengers_by_class = titanic.groupby('Pclass')
first_class, second_class, third_class = (
    passengers_by_class.get_group(pclass) for pclass in (1, 2, 3)
)

# Number of passengers in each class
num_1st_class, num_2nd_class, num_3rd_class = map(
    len, (first_class, second_class, third_class)
)

# Sanity check: the class sizes should add up to the full passenger count
totals = sum((num_1st_class, num_2nd_class, num_3rd_class))

print(totals)

# Fare-bucket label of every passenger, as a plain list per ticket class
fc_fare_category, sc_fare_category, tc_fare_category = (
    class_frame['cut_fares'].tolist()
    for class_frame in (first_class, second_class, third_class)
)

def scan_fares(mylist):
    """Count how many entries of ``mylist`` carry each fare-bucket label.

    Entries that equal none of the five labels are ignored.

    Returns a 5-tuple of counts in the order
    ('$0-$25', '$25-$50', '$50-$75', '$75-$100', 'Over $100').
    """
    labels = ('$0-$25', '$25-$50', '$50-$75', '$75-$100', 'Over $100')
    counts = [0] * len(labels)
    for fare_label in mylist:
        for position, label in enumerate(labels):
            if fare_label == label:
                counts[position] += 1
                break  # an entry can match only one bucket
    return tuple(counts)

# Per-class counts of passengers in each fare bucket
fc_bins, sc_bins, tc_bins = map(
    scan_fares, (fc_fare_category, sc_fare_category, tc_fare_category)
)

# Calculating Ticket Prices

Many people were scraping together their last dime to raise the money for a ticket on the doomed ocean liner.  Some passengers, mainly those in 1st class, had no trouble paying the fare, while others, mainly those in 3rd class, really struggled.  What was the average price of a ticket on the Titanic?

In [None]:
# Report the mean fare of each ticket class
average_fare_messages = (
    ('First class passengers paid an average of ${}', first_class),
    ('Second class passengers paid an average of ${}', second_class),
    ('Third class passengers paid an average of ${}', third_class),
)
for message, class_frame in average_fare_messages:
    print (message.format(class_frame['Fare'].mean()))

# Number of passengers per fare bucket (rows) and ticket class (columns)
fare_chart = pd.DataFrame(
    {
        'First Class': list(fc_bins),
        'Second Class': list(sc_bins),
        'Third Class': list(tc_bins),
    },
    index=['$0-$25', '$25-$50', '$50-$75', '$75-$100', 'Over $100'],
)
fare_chart

In [None]:
def _fare_summary(class_frame):
    """Return [min, median, max, standard deviation] of a class's fares."""
    fares = class_frame['Fare']
    return [fares.min(), fares.median(), fares.max(), fares.std()]

# Summary statistics of the fare, one column per ticket class
fare_distribution = pd.DataFrame(
    {
        'First Class': _fare_summary(first_class),
        'Second Class': _fare_summary(second_class),
        'Third Class': _fare_summary(third_class),
    },
    index=['Min', 'Median', 'Max', 'Standard Deviation'],
)

fare_distribution

With \$14.45 being the median fare and \$512 the highest fare paid, most passengers paid far less than the most expensive tickets.

# Origin of Passengers

There were 3 ports at which passengers boarded, and 1st, 2nd, and 3rd class passengers boarded at each of them. Some passengers boarded in Southampton, England; some boarded in Queenstown, Ireland; and some boarded in Cherbourg, France. Here is a chart organized by port of origin and ticket class.

In [None]:
# Embarkation code of every passenger, as a plain list per ticket class
fclass_list, sclass_list, tclass_list = (
    class_frame['Embarked'].tolist()
    for class_frame in (first_class, second_class, third_class)
)

def pass_per_port(mylist):
    """Count the embarkation codes found in ``mylist``.

    Returns a 3-tuple with the number of 'Q', 'S' and 'C' entries, in that
    order. Entries equal to none of those codes are ignored.
    """
    entries = list(mylist)  # materialise once so every code can be counted
    port_codes = ('Q', 'S', 'C')
    return tuple(
        sum(1 for entry in entries if entry == code)
        for code in port_codes
    )

# (Q, S, C) passenger counts for first, second and third class respectively
results, second_results, third_results = map(
    pass_per_port, (fclass_list, sclass_list, tclass_list)
)

print ('Locations Where Passengers Embarked')

# Passengers per port (rows) and ticket class (columns)
passenger_class = pd.DataFrame(
    {
        'First Class': list(results),
        'Second Class': list(second_results),
        'Third Class': list(third_results),
    },
    index=['Queensland', 'Southampton', 'Cherbourg'],
)

passenger_class

In [None]:
def proportions(number):
    """Return ``number`` divided by the module-level ``totals``, rounded to 3 decimals."""
    return round(number / totals, 3)

def percentages(number):
    """Return ``number`` as a percentage of the module-level ``totals``, rounded to 1 decimal."""
    return round((number / totals) * 100, 1)

# Total passengers per port: sum each port's row across the three classes
total_queensland, total_southampton, total_cherbourg = (
    passenger_class.loc[port].values.sum()
    for port in ('Queensland', 'Southampton', 'Cherbourg')
)

In [None]:
# One-row table holding the passenger total of each port
port_totals = {
    'Queensland': total_queensland,
    'Southampton': total_southampton,
    'Cherbourg': total_cherbourg,
}
totals_by_port = pd.DataFrame(port_totals, index=['Totals'])

totals_by_port

And as a proportion of the total number of passengers:

In [None]:
# Each port's passenger total as a proportion of all passengers (see proportions)
totals_by_port.applymap(proportions)

In [None]:
# Share of all passengers travelling in each ticket class
class_shares = [
    proportions(class_size)
    for class_size in (num_1st_class, num_2nd_class, num_3rd_class)
]
print ('Out of {} passengers, {} were 1st class, {} were 2nd class, and {} were 3rd class passengers '
      .format(totals, *class_shares))

# Share of all passengers per port (rows) and ticket class (columns)
location_proportions = pd.DataFrame(
    {
        'First Class': [proportions(count) for count in results],
        'Second Class': [proportions(count) for count in second_results],
        'Third Class': [proportions(count) for count in third_results],
    },
    index=['Queensland', 'Southampton', 'Cherbourg'],
)

location_proportions

# Calculating Survivor Statistics

There were many passengers who were rescued from the water.  How many and what proportion of passengers survived that night?

In [None]:
# Keep only the passengers whose 'Survived' flag is True
survivors = titanic.loc[titanic['Survived'] == True]


def survival_proportions(number):
    """Return ``number`` divided by the number of survivors, rounded to 3 decimals."""
    return round(number / len(survivors), 3)


survivor_count = len(survivors)
print ('{} or {} passengers survived'.format(survivor_count, proportions(survivor_count)))

### Survival by Port of Entry

In [None]:
# Survivors split by ticket class, then by port of embarkation (Q / S / C)
survivors_by_class = survivors.groupby('Pclass')

survived_in_first_class = survivors_by_class.get_group(1)
fc_by_port = survived_in_first_class.groupby('Embarked')
fc_queensland_survivors = fc_by_port.get_group('Q')
fc_southampton_survivors = fc_by_port.get_group('S')
fc_cherbourg_survivors = fc_by_port.get_group('C')

survived_in_second_class = survivors_by_class.get_group(2)
sc_by_port = survived_in_second_class.groupby('Embarked')
sc_queensland_survivors = sc_by_port.get_group('Q')
sc_southampton_survivors = sc_by_port.get_group('S')
sc_cherbourg_survivors = sc_by_port.get_group('C')

survived_in_third_class = survivors_by_class.get_group(3)
tc_by_port = survived_in_third_class.groupby('Embarked')
tc_queensland_survivors = tc_by_port.get_group('Q')
tc_southampton_survivors = tc_by_port.get_group('S')
tc_cherbourg_survivors = tc_by_port.get_group('C')

# Survivor counts per class in port order (Q, S, C)
fc_port_counts = [len(group) for group in (fc_queensland_survivors, fc_southampton_survivors, fc_cherbourg_survivors)]
sc_port_counts = [len(group) for group in (sc_queensland_survivors, sc_southampton_survivors, sc_cherbourg_survivors)]
tc_port_counts = [len(group) for group in (tc_queensland_survivors, tc_southampton_survivors, tc_cherbourg_survivors)]

# Per-class totals over the three ports
total_fc = sum(fc_port_counts)
total_sc = sum(sc_port_counts)
total_tc = sum(tc_port_counts)

# Survivors per port (rows, plus a 'Total' row) and ticket class (columns)
survival_chart = pd.DataFrame(
    {
        'First Class': fc_port_counts + [total_fc],
        'Second Class': sc_port_counts + [total_sc],
        'Third Class': tc_port_counts + [total_tc],
    },
    index=['Queensland', 'Southampton', 'Cherbourg', 'Total'],
)


In [None]:
# Survivor counts per port and ticket class, with a 'Total' row
survival_chart

In [None]:
# Per-port survivor counts (the 'Total' row is sliced off) as a proportion of all passengers
survival_chart[0:3].applymap(proportions)

And as a proportion of the total number of survivors:

In [None]:
# Per-port survivor counts (the 'Total' row is sliced off) as a proportion of all survivors
survival_chart[0:3].applymap(survival_proportions)

#### Survival Rates

The chart below summarizes survival for each ticket class, both as a proportion and as a percentage.

In [None]:
def _class_survival(survivors_in_class, passengers_in_class):
    """Return (proportion, percentage) of a ticket class's passengers who survived.

    The proportion is rounded to 3 decimals and the percentage to 1 decimal.
    """
    rate = float(len(survivors_in_class)) / len(passengers_in_class)
    return round(rate, 3), round(rate * 100, 1)


# A class's survival rate is measured against the size of THAT class, not
# against all passengers; dividing by the overall total would only give each
# class's share of the whole passenger list. All survivors of the class are
# counted, including any whose port of embarkation is missing.
fc_survival_rate, fc_percent = _class_survival(survived_in_first_class, first_class)
sc_survival_rate, sc_percent = _class_survival(survived_in_second_class, second_class)
tc_survival_rate, tc_percent = _class_survival(survived_in_third_class, third_class)

# Survival rate per ticket class, as a proportion and as a percentage
survival_rates = pd.DataFrame ({
    'First Class':[fc_survival_rate, fc_percent],
    'Second Class':[sc_survival_rate, sc_percent],
    'Third Class':[tc_survival_rate, tc_percent]
}, index=['Proportion', 'Percentage'])

survival_rates

# Charts

#### Fare Prices

In [None]:
# Histogram of all fares in 5 equal-width bins, labelled so the figure stands alone
fare_ax = titanic['Fare'].hist(bins=5)
fare_ax.set_title('Distribution of Fares')
fare_ax.set_xlabel('Fare')
fare_ax.set_ylabel('Number of passengers');

#### Proportion of Passengers per port

In [None]:
# Bar chart of each port's share of all passengers, labelled so the figure stands alone
port_share_ax = totals_by_port.applymap(proportions).plot(kind = 'bar')
port_share_ax.set_title('Proportion of Passengers per Port')
port_share_ax.set_ylabel('Proportion of all passengers');

#### Ports of Origin for each ticket class

In [None]:
# Bar chart of the share of all passengers per port and ticket class
location_ax = location_proportions.plot(kind='bar')
location_ax.set_title('Ports of Origin for Each Ticket Class')
location_ax.set_xlabel('Port of embarkation')
location_ax.set_ylabel('Proportion of all passengers');

#### Survival Chart

In [None]:
# Bar chart of survivors per port (plus the 'Total' row) and ticket class,
# each expressed as a proportion of all passengers
survival_ax = survival_chart.applymap(proportions).plot(kind = 'bar')
survival_ax.set_title('Survivors by Port and Ticket Class')
survival_ax.set_xlabel('Port of embarkation')
survival_ax.set_ylabel('Proportion of all passengers');

# Conclusion

In conclusion, fewer than half of the passengers on the Titanic survived, with a survival rate of .38. The largest group of survivors was 2nd class passengers from Southampton, England, at .222 of all survivors, which is only slightly more than the .216 who were 1st class passengers also from Southampton, England.

Queenstown, Ireland was the port with the fewest passengers, with .086 of the total. Southampton, England was the port with the most passengers, at .723 of the total. So it was not surprising to see that most of the survivors came from Southampton, since that is where the majority of the passengers boarded. What was surprising to me was that the number of 3rd class passengers who survived is only slightly less than the number of 1st class survivors, and much more than the number of 2nd class survivors. Of course, given that there were more 3rd class passengers to begin with, this would not be surprising, except that it counters the notion that 3rd class passengers were not given a fair chance to survive because of their lower status.

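This data seems to indicate that 2nd and 3rd class passengers had a fair chance of survival, although these figures are counts of survivors rather than rates: confirming it would require comparing, for each class, the number of survivors with the number of passengers in that class.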