In [103]:
import pandas as pd
import matplotlib.pyplot as plt

# This is the dataset I've collected from the IAAF website
# You can view the web pages themselves if you like
#     git clone xyz
#     cd /websites/
#

df = pd.read_csv('IAAF_Results.csv')


# What we got?
df.head()

# I like to check that a new dataset is at least somewhat correct
#  Lets compare to some stats we know.
#  We'll look at the athletes with the most appearances and
#  check that their medal counts are right.

# Select the athletes by first and last name...
athletes_groups = df.groupby(['athlete_first_name','athlete_last_name'],
                            as_index = True)

# ... and count their number of occurances
athletes_results = athletes_groups.agg({
        'country': 'first',
        'gender' : 'first',
        'championship_number' : 'count' # Number of events occured
    })

# Select the top five athletes by occurance
athletes_results.sort('championship_number',ascending=False).head()

# Uh oh. Looks like wikipedia disagrees with Merlene Ottey's 12 appearance. 
# But wikipedia would be incorrect. Check the downloaded web pages!
# Anyway, the others match.
# 
# Moving on...
# Lets see how the winning times for each event have trended
# 


# Get all the winning times for each gender and event
winning_times = df[df['place'] == 1]
winning_times_groups = df.groupby(['distance','gender','championship_number'],
                                 as_index = False)
winning_results = winning_times_groups.agg({
        'athlete_last_name' : 'first',
        'mark_time' : 'first',
    })
winning_results.head()


# Get the first result of the group, to get indexed changes
first_wins = winning_results.groupby(['distance','gender']).first()
first_wins.reset_index(inplace = True)
first_wins.head()
winning_results.merge
winning_first_times_added = winning_results.merge(
    first_wins,
    on=('distance','gender'),
    copy = False
)

winning_first_times_added.head()

winning_fields = winning_first_times_added[[
        'distance',
        'gender',
        'athlete_last_name_x',
        'mark_time_x',
        'mark_time_y']]
winning_fields.columns = [[
        'distance',
        'gender',
        'athlete_last_name',
        'winning_time',
        'index_time']]

winning_fields['change'] = winning_fields['winning_time'] - winning_fields['index_time']

winning_fields['percent_change'] = winning_fields['change'] / winning_fields['index_time']
winning_fields['percent_increase'] = winning_fields['percent_change'] * 100
winning_fields.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,distance,gender,athlete_last_name,winning_time,index_time,change,percent_change,percent_increase
0,100,men,Lewis,10.07,10.07,0.0,0.0,0.0
1,100,men,Lewis,9.93,10.07,-0.14,-0.013903,-1.390268
2,100,men,Lewis,9.86,10.07,-0.21,-0.020854,-2.085402
3,100,men,Christie,9.87,10.07,-0.2,-0.019861,-1.986097
4,100,men,Bailey,9.97,10.07,-0.1,-0.00993,-0.993049


In [None]:
# Interesting.
# Conclusion 1, 2, and 3 go here

In [None]:
# Now lets look at victory margins over the years.
# We'll have to select the top 2 place finishers each year.
# Then, we'll split the dataset into 2 pieces, the first data-
# frame with the first place finishers, and the second dataframe
# with the second place finishers. This will make calculating
# the margin a trivial.

