## ATP ranking stats

 Importing the required libraries.

In [14]:
import os
import pandas as pd
import numpy as np

### Data

We're going to download our data from [ATP World Tour tennis data](https://datahub.io/sports-data/atp-world-tour-tennis-data/r/rankings_1973-2017.csv)

In [15]:
!mkdir -p data/
if not os.path.exists('data/rankings_1973-2017.csv'):
    !curl -sSL -o data/rankings_1973-2017.csv https://datahub.io/sports-data/atp-world-tour-tennis-data/r/rankings_1973-2017.csv

In [16]:
atp_rankings = pd.read_csv('data/rankings_1973-2017.csv')

  interactivity=interactivity, compiler=compiler, result=result)


Lets take a look at the size of our data set. The shape attribute gives us (as a tuple) the number of rows and the number of columns in the data that we loaded.

In [17]:
print(atp_rankings.shape)

(2694539, 14)


We can also look at the type of each data that each column contains. Here, you'll see the columns of the data that we loaded, and the type.

In [18]:
print(atp_rankings.dtypes)

week_title          object
week_year            int64
week_month           int64
week_day             int64
rank_text           object
rank_number          int64
move_positions     float64
move_direction      object
player_age         float64
ranking_points       int64
tourneys_played      int64
player_url          object
player_slug         object
player_id           object
dtype: object


Another thing that we would want to do with our data is to subset it by the columns we need. We want to get out the week_year, week_month, week_day, rank_number and player_age, and save this into a variable.

In [19]:
rankings_subset = atp_rankings[['week_year', 'week_month', 'week_day', 'rank_number', 'player_age']]

In [20]:
def find_all_years(df):
    all_years = df['week_year'].unique()
    all_years.sort()
    all_years_list = all_years.tolist()
    return all_years_list

def find_last_date(df, year):
    search_year = df[df['week_year'] == year]
    search_month = sorted(search_year['week_month'].unique())[-1]
    search_year_search_month = search_year[search_year['week_month'] == search_month]
    search_day = sorted(search_year_search_month['week_day'].unique())[-1]
    return search_month, search_day

def year_end_top_100(df, year):
    last_month, last_day = find_last_date(df, year)
    search_year = df[df['week_year'] == year]
    year_end_month = search_year[search_year['week_month'] == last_month]
    year_end_day = year_end_month[year_end_month['week_day'] == last_day]
    sorted_top = year_end_day.sort_values(by=['rank_number'])
    top_100 = sorted_top[:100]
    age_top_100 = top_100['player_age'].values
    age_top_100_list = age_top_100.tolist()
    return age_top_100_list

def year_end_rank(df, year):
    last_month, last_day = find_last_date(df, year)
    search_year = df[df['week_year'] == year]
    year_end_month = search_year[search_year['week_month'] == last_month]
    year_end_day = year_end_month[year_end_month['week_day'] == last_day]
    sorted_top = year_end_day.sort_values(by=['rank_number'])
    return sorted_top

def find_youngest_in_top_100(df, year):
    last_month, last_day = find_last_date(df, year)
    search_year = df[df['week_year'] == year]
    year_end_month = search_year[search_year['week_month'] == last_month]
    year_end_day = year_end_month[year_end_month['week_day'] == last_day]
    sorted_top = year_end_day.sort_values(by=['rank_number'])
    top_100 = sorted_top[:100]
    find_youngest = sorted(top_100['player_age'].unique())[0]
    return find_youngest

def find_oldest_in_top_100(df, year):
    last_month, last_day = find_last_date(df, year)
    search_year = df[df['week_year'] == year]
    year_end_month = search_year[search_year['week_month'] == last_month]
    year_end_day = year_end_month[year_end_month['week_day'] == last_day]
    sorted_top = year_end_day.sort_values(by=['rank_number'])
    top_100 = sorted_top[:100]
    find_oldest = sorted(top_100['player_age'].unique())[-1]
    return find_oldest

In [27]:
averages_for_every_year = []
for year in find_all_years(rankings_subset):
   average_100 = np.mean(year_end_top_100(rankings_subset, year))
   averages_for_every_year.append(average_100)

youngest_for_every_year = []
for year in find_all_years(rankings_subset):
   youngest = find_youngest_in_top_100(rankings_subset, year)
   youngest_for_every_year.append(youngest)

oldest_for_every_year = []
for year in find_all_years(rankings_subset):
   oldest = find_oldest_in_top_100(rankings_subset, year)
   oldest_for_every_year.append(oldest)

In [28]:
print(find_all_years(rankings_subset))

[1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]


In [29]:
print(averages_for_every_year)

[26.45, 26.76, 26.62, nan, 27.01, 26.19, 26.08, nan, 21.0, 20.0, 25.12, 24.61, 23.92, 23.57, 23.74, 23.71, 24.18, 24.08, 24.25, 24.21, 24.56, 25.1, 24.6, 24.89, 24.94, 25.0, 24.93, 25.12, 24.89, 24.98, 25.1, 25.33, 25.23, 25.52, 25.59, 25.89, 26.31, 26.29, 26.67, 27.22, 27.64, 27.76, 27.62, 28.06, 28.13]


In [30]:
print(youngest_for_every_year)

[17.0, 17.0, 18.0, 19.0, 12.0, 18.0, 19.0, 28.0, 21.0, 20.0, 18.0, 17.0, 17.0, 16.0, 17.0, 16.0, 17.0, 18.0, 19.0, 18.0, 19.0, 19.0, 19.0, 18.0, 19.0, 17.0, 18.0, 18.0, 19.0, 18.0, 17.0, 18.0, 18.0, 18.0, 18.0, 19.0, 21.0, 20.0, 19.0, 20.0, 20.0, 19.0, 18.0, 19.0, 18.0]


In [31]:
print(oldest_for_every_year)

[39.0, 40.0, 41.0, nan, 43.0, 44.0, 38.0, nan, 21.0, 20.0, 32.0, 33.0, 34.0, 34.0, 35.0, 36.0, 37.0, 32.0, 39.0, 40.0, 33.0, 34.0, 31.0, 32.0, 32.0, 33.0, 35.0, 35.0, 34.0, 32.0, 33.0, 34.0, 35.0, 34.0, 35.0, 36.0, 37.0, 34.0, 33.0, 34.0, 35.0, 36.0, 36.0, 37.0, 38.0]


In [32]:
year_1976 = year_end_top_100(rankings_subset, 1976) #problem: 'nan' instead of 23
year_end_1980 = year_end_rank(rankings_subset, 1980) #problem: only two rows