The data can be downloaded from this website: <br>
    https://www.kaggle.com/dcohen21/8anu-climbing-logbook <br>
The files should be put in the same folder as this notebook

In [None]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
from sqlite3 import connect

## Read the raw data from a sqlite database

In [None]:
def qry(q, connection=None, db_path="database.sqlite"):
    """
    Run the SQL query `q` and return the result as a DataFrame.

    Parameters
    ----------
    q : str
        SQL statement to execute.
    connection : sqlite3.Connection, optional
        Connection to run the query on. When given, it is left open because the caller owns it.
        When omitted, a connection to `db_path` is opened for this call and closed afterwards.
    db_path : str, optional
        Path of the SQLite file that is opened when no `connection` is given.

    Returns
    -------
    pandas.DataFrame
        The rows returned by the query.
    """
    if connection is not None:
        return pd.read_sql_query(q, connection)

    # Open the connection per call rather than in the default argument: a default value is
    # evaluated once, at definition time, so every call would share one single connection.
    own_connection = connect(db_path)
    try:
        return pd.read_sql_query(q, own_connection)
    finally:
        # Actually call close(); a bare `connection.close` is a no-op expression.
        own_connection.close()

The database consists of 4 tables: user, method, grade, ascent

In [None]:
# sqlite_master lists every schema object (tables, but also indexes, views and triggers);
# restrict the listing to tables so that it shows exactly the tables of the database.
tables = qry("SELECT name FROM sqlite_master WHERE type = 'table'")
tables

In [None]:
%%time
# Load each of the four tables in full into its own DataFrame.
# ~ 55 seconds
## TODO: use a postgres database
df_user = qry("SELECT * FROM USER")  # 62'593 users x 22 columns
df_grade = qry("SELECT * FROM grade")  # 83 grades x 14 columns
df_method = qry("SELECT * FROM method")  # 5 methods x 4 columns
df_ascent = qry("SELECT * FROM ascent")  # 4'111'877 ascents x 28 columns

In [None]:
# Column names of the user table
df_user.columns

In [None]:
# Column names of the grade table
df_grade.columns

In [None]:
# Column names of the ascent table
df_ascent.columns

<b> climb type: 0 = sport, 1 = boulder </b>

In [None]:
# Column names of the method table
df_method.columns

In [None]:
%%time
df_user.set_index('id', inplace = True)
df_grade.set_index('id', inplace = True) #Set the column 'id' to be the index.
df_ascent.set_index('id', inplace = True)  #Set the column 'id' to be the index.

###### Grade ID

In [None]:
def grade_id(grade, grades=None):
    """
    Given a grade (as a string), return the id of the first row of the grade table containing it.

    Parameters
    ----------
    grade : str
        Grade label to look for; it is compared against every column of the table.
    grades : pandas.DataFrame, optional
        Grade table indexed by grade id. Defaults to the notebook-level `df_grade`.

    Returns
    -------
    The index label of the first row in which `grade` appears.

    Raises
    ------
    IndexError
        If no row contains `grade`. The previous row-by-row scan failed with this same
        exception type once it ran past the last row, so the type is kept for
        compatibility; only the message is new.
    """
    table = df_grade if grades is None else grades
    # True for every row that holds `grade` in at least one column.
    matches = table.eq(grade).any(axis=1).to_numpy()
    if not matches.any():
        raise IndexError(f"grade {grade!r} does not appear in the grade table")
    # argmax gives the position of the first True, i.e. the first matching row.
    return table.index[matches.argmax()]

### Extracting active climbers

We extract the "active" climbers, i.e., the users with recorded ascents.

In [None]:
# A: user ids that appear in the ascent table; B: user ids present in the user table.
A = set(df_ascent['user_id'])
B = set(df_user.index)
active_users = A.intersection(B)

print(f"There are {len(A.difference(B))} climbers in the ascent table which do not correspond to anyone in the user database!!")

print(f"There are {len(active_users)} active climbers in the users table.")

# pandas >= 2.0 rejects a set as a .loc indexer, so select with a boolean mask instead.
# The mask keeps the rows in df_user order, and .copy() makes df_active_user an independent
# frame so that columns can be added to it later without a SettingWithCopyWarning.
df_active_user = df_user.loc[df_user.index.isin(active_users)].copy()



### Extracting various years

In [None]:
def extract_year(Series, nullValue):
    """
    Given a Series of dates, treat every occurrence of 'nullValue' as missing and extract the year
    of the admissible dates.

    Parameters
    ----------
    Series : pandas.Series
        Datetime-like values (e.g. the output of `pd.to_datetime`). It is not modified.
    nullValue : scalar or None
        Sentinel date that stands for "no date". Pass None when there is no sentinel.

    Returns
    -------
    pandas.Series
        The year of each date, on the same index; NaN where the date is missing (NaT) or
        equal to 'nullValue'.
    """
    dates = pd.to_datetime(Series)
    if nullValue is not None:
        # .mask returns a new Series, so the caller's data is left untouched
        # (assigning NaN into the argument would modify it in place).
        dates = dates.mask(dates == nullValue)
    # Vectorised accessor instead of a per-element Python lambda.
    return dates.dt.year

##### Birthyears of active users

In [None]:
# errors='coerce' turns every birth date that cannot be parsed into NaT instead of raising.
birth_dates = pd.to_datetime(df_active_user['birth'], errors = 'coerce')
# There is no sentinel date to blank out here, hence None.
birthyears = extract_year(birth_dates, None)
df_active_user["birthyear"] = birthyears

##### Years of ascents

In [None]:
# The 'date' column of the ascent table holds either a unix timestamp or 0, nothing else.
nonzero_dates = df_ascent.loc[df_ascent['date'] != 0, 'date']
earliest_ascent_date = nonzero_dates.min()
# Year of the earliest dated ascent
pd.to_datetime(earliest_ascent_date, unit = 's').year

In [None]:
%%time
# ~11 seconds
nullDate = pd.to_datetime(0, unit='s')
sendyears = pd.to_datetime(df_ascent['date'], unit = 's')
sendyears = extract_year(sendyears, nullDate)
df_ascent["send_year"] = sendyears
# Filter ascentes that don't have a year
df_ascent = df_ascent[df_ascent.send_year.notnull()]

##### Year started climbing

In [None]:
# Start year of each active climber, with the value 0 replaced by NaN.
startedyears = df_active_user['started'].replace(0, np.nan)

Some active users have a starting year earlier than their birth year! We compute how many such active climbers there are.

In [None]:
# Some nonsensical starting years! At least every value is either 0 or a year number.
df_active_user.loc[df_active_user['started'] != 0, 'started'].min()

In [None]:
# Distribution of the start years that are set (> 0) but earlier than 1979.
started = df_active_user.started
plt.hist(started[(started > 0) & (started < 1979)])

In [None]:
# A start year of 0 means "unknown", so treat it as missing before comparing: a comparison
# with NaN is False, which keeps those climbers instead of counting them as having started
# before they were born (any birth year is greater than 0).
started_known = df_active_user['started'].replace(0, np.nan)
# Climbers are admissible unless their birth year is later than their (known) start year.
admissible = ~(df_active_user.birthyear > started_known).to_numpy()
print('Filtering ',admissible.shape[0] - admissible.sum(), 'climbers with starting year lower than their birth year.' )

df_active_user = df_active_user[admissible]

### Store cleaned data

In [None]:
%%time
# ~7 seconds
df_active_user.to_pickle("active_user.pkl")
df_ascent.to_pickle("ascent.pkl")
df_grade.to_pickle("grade.pkl")

# Analysis

### Reset kernel, and load clean data

In [None]:
# Clear the interactive namespace; this is why the libraries are imported again right below.
%reset

import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd

In [None]:
%%time
# ~3-4 seconds
df_active_user = pd.read_pickle("active_user.pkl")
df_ascent = pd.read_pickle("ascent.pkl")
df_grade = pd.read_pickle("grade.pkl")

##### Age

In [None]:
# Age as a difference of calendar years only (day and month of birth are not used).
ages = 2017 - df_active_user.birthyear #Database is from 13th Sept 2017

In [None]:
# Histogram of the birth years of the active climbers
plt.hist(df_active_user.birthyear)

In [None]:
# First rows of the reloaded ascent table
df_ascent.head()

In [None]:
# Keep only the finite ages (drops the NaN of climbers without a birth year), as a NumPy array.
ages_nonan = ages.values[np.isfinite(ages.values.astype('float64'))]
len(ages_nonan)

### Age and Age at which started climbing 

In [None]:
# Histogram of the ages of the active climbers whose age is known.
ages = ages_nonan.astype('float64')
plt.rcParams['figure.figsize'] = [12, 8]
plt.hist(ages,bins = 50)
plt.xlabel('Age of active climbers')
plt.ylabel('Number of active climbers')
# Call show(): a bare `plt.show` does nothing except display the function object as cell output.
plt.show()

# TODO: clean up the centenarians and infants

In [None]:
# Summary statistics of the known ages
pd.Series(ages_nonan).describe()

In [None]:
# Rebuild both inputs from the reloaded user table: the namespace was cleared by %reset,
# so the `startedyears` and `birthyears` of the cleaning stage no longer exist here.
startedyears = df_active_user['started'].replace(0, np.nan)  # 0 = start year unknown
birthyears = df_active_user['birthyear']
# Age at which each climber started; NaN when either year is unknown.
age_started = startedyears - birthyears
age_started_nonan = age_started.values[np.isfinite(age_started.values.astype('float64'))]

In [None]:
# Histogram of the age at which climbers started, with fixed axis limits.
plt.hist(age_started_nonan.astype('float64'), bins = 50)
plt.axis([0, 80, 0, 5000])
plt.xlabel('Age at which started climbing')
plt.ylabel('Number of active climbers')
# Call show(): a bare `plt.show` does nothing except display the function object as cell output.
plt.show()

In [None]:
# Summary statistics of the age at which climbers started
age_started.describe()

## Max grades with respect to physical characteristics

##### Maximum grades

In [None]:
# Highest grade id logged by each user (a Series indexed by user_id, despite the df_ prefix).
df_maxgrades_index = df_ascent.groupby('user_id')['grade_id'].max()
# Attach the columns of the grade table that describe each user's highest grade.
df_maxgrades = df_maxgrades_index.to_frame().merge(df_grade, left_on = 'grade_id', right_index = True)
df_maxgrades.index

###### Gender

In [None]:
# The 'sex' column of the active climbers, indexed by user id
gender = df_active_user['sex']

In [None]:
# The sum equals the number of female climbers only if 'sex' holds nothing but 0 and 1 — TODO confirm.
print('There are', gender.sum(),'active female climbers.') # female is 1

###### Weight

In [None]:
# A weight of 0 is treated as missing.
weight = df_active_user['weight'].replace(0, np.nan)
# Split by sex code: 0 = male, 1 = female
weight_male, weight_female = (weight[gender == code] for code in (0, 1))

In [None]:
# Keep only the climbers whose weight is a finite number.
weight_male_nonan = weight_male[np.isfinite(weight_male.values.astype('float64'))]
weight_female_nonan = weight_female[np.isfinite(weight_female.values.astype('float64'))]

In [None]:
# reindex instead of .loc: a climber without an entry in df_maxgrades_index (for instance
# because none of their ascents is left in df_ascent) yields NaN rather than a KeyError,
# and points with a NaN coordinate are simply not drawn by a scatter plot.
maxgrades_vs_weight_male = df_maxgrades_index.reindex(weight_male_nonan.index)
maxgrades_vs_weight_female = df_maxgrades_index.reindex(weight_female_nonan.index)

In [None]:
# Tick positions and labels for the grade axis of the graphs
usa_routes = df_grade['usa_routes']
# Non-empty USA route grades among the index labels 35 to 81 (a .loc slice includes both ends)
usa_grades = [label for label in usa_routes.loc[35:81] if label != '']
# Position of each label: the first index label that carries that USA grade
ticks = [usa_routes.eq(label).idxmax() for label in usa_grades]

In [None]:
plt.rcParams['figure.figsize'] = [15, 8]
fig, ax = plt.subplots()
# One scatter per sex, male first so that the legend entries line up with the series.
for weights, max_grades in (
    (weight_male_nonan, maxgrades_vs_weight_male),
    (weight_female_nonan, maxgrades_vs_weight_female),
):
    ax.scatter(weights.astype('float64'), max_grades.astype('float64'))
ax.set_xlabel('Weight in kg')
ax.set_ylabel('Max grades')
ax.set_ylim(ymin = 35, ymax = 85)
# Label the grade axis with USA route grades instead of raw grade ids.
ax.set_yticks(ticks)
ax.set_yticklabels(usa_grades)
ax.legend(['male', 'female'], loc = 2)

###### Height

In [None]:
# A height of 0 is treated as missing.
height = df_active_user['height'].replace(0, np.nan)
# Split by sex code: 0 = male, 1 = female
height_male, height_female = (height[gender == code] for code in (0, 1))

In [None]:
# Keep only the climbers whose height is a finite number.
height_male_nonan = height_male[np.isfinite(height_male.values.astype('float64'))]
height_female_nonan = height_female[np.isfinite(height_female.values.astype('float64'))]

In [None]:
# reindex instead of .loc: a climber without an entry in df_maxgrades_index (for instance
# because none of their ascents is left in df_ascent) yields NaN rather than a KeyError,
# and points with a NaN coordinate are simply not drawn by a scatter plot.
maxgrades_vs_height_male = df_maxgrades_index.reindex(height_male_nonan.index)
maxgrades_vs_height_female = df_maxgrades_index.reindex(height_female_nonan.index)

In [None]:
plt.rcParams['figure.figsize'] = [25, 12]
fig, ax = plt.subplots()
# One scatter per sex, male first so that the legend entries line up with the series.
for heights, max_grades in (
    (height_male_nonan, maxgrades_vs_height_male),
    (height_female_nonan, maxgrades_vs_height_female),
):
    ax.scatter(heights.astype('float64'), max_grades.astype('float64'))
ax.axis([103, 240, 0, 85])
ax.set_xlabel('Height in cm')
ax.set_ylabel('Max grades')
# Narrows the y-range set by axis() just above.
ax.set_ylim(ymin = 35, ymax = 85)
# Label the grade axis with USA route grades instead of raw grade ids.
ax.set_yticks(ticks)
ax.set_yticklabels(usa_grades)
ax.legend(['male', 'female'], loc = 2)

In [None]:
# Mean of the known heights of the male climbers
height_male_nonan.mean()

In [None]:
# Mean of the known heights of the female climbers
height_female_nonan.mean()

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
<br>
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

## Average number of years to climb a grade

In [None]:
%%time # ~20 seconds
df_ascent['send_year'] = sendyears
# Merge the df_ascent DataFrame with the startedyears Series, using the user_id as the merge key
df_tmp = pd.merge(df_ascent, pd.DataFrame(startedyears), left_on='user_id', right_index = True)
df_tmp['years_to_send'] = df_tmp['send_year'] - df_tmp['started']

In [None]:
%%time # ~3 seconds
# For each (user, grade, climb_type) find the minimum number of years from start-of-climbing
# until sending one route of that grade
grouped_climb_type = df_tmp.groupby(['user_id', 'grade_id', 'climb_type'])['years_to_send'].agg(np.nanmin)

In [None]:
# Average, per (grade, climb type), of the per-climber minimum years-to-send; NaN is skipped.
result_climb_type = grouped_climb_type.groupby(level = ['grade_id', 'climb_type']).mean()
# Split by climb type: 0 = sport, 1 = boulder
by_climb_type = result_climb_type.groupby(level = 'climb_type')
sport_climb = by_climb_type.get_group(0)
boulder = by_climb_type.get_group(1)

In [None]:
# Tick positions and labels for the boulder-grade axis of the graphs
usa_boulders = df_grade['usa_boulders']
# Non-empty USA boulder grades among the index labels 27 to 76 (a .loc slice includes both ends)
usa_grades_boulder = [label for label in usa_boulders.loc[27:76] if label != '']
# Position of each label: the first index label that carries that USA boulder grade
ticks_boulder = [usa_boulders.eq(label).idxmax() for label in usa_grades_boulder]

In [None]:
# MultiIndex of (tick, 1) pairs, one per boulder tick; 1 is the climb type code for boulder.
foo1 = [ticks_boulder, [1]]
foo1_index = pd.MultiIndex.from_product(foo1)

In [None]:
plt.rcParams['figure.figsize'] = [30, 10]
# reindex lines the averages up with the tick positions (NaN where a grade has no data).
plt.scatter(ticks_boulder, boulder.reindex(foo1_index), s = 100)
plt.legend(['Average nb of years: boulder'], loc = 2,fontsize = 'xx-large')
plt.xlim(xmin = 27, xmax = 70)
plt.xticks(ticks_boulder, usa_grades_boulder)
plt.xlabel('Boulder grade (USA)')
plt.ylabel('Average number of years since starting to climb')
# Call show(): a bare `plt.show` does nothing except display the function object as cell output.
plt.show()

In [None]:
# MultiIndex of (tick, 0) pairs, one per route tick; 0 is the climb type code for sport.
foo0 = [ticks, [0]]
foo0_index = pd.MultiIndex.from_product(foo0)

In [None]:
plt.rcParams['figure.figsize'] = [30, 10]
# reindex lines the averages up with the tick positions (NaN where a grade has no data).
plt.scatter(ticks, sport_climb.reindex(foo0_index), s = 100)
plt.legend(['Average nb of years: sport climbing'], loc = 2, fontsize = 'xx-large')
plt.xlim(xmin = 35, xmax = 85)
plt.xticks(ticks, usa_grades)
plt.xlabel('Route grade (USA)')
plt.ylabel('Average number of years since starting to climb')
# Call show(): a bare `plt.show` does nothing except display the function object as cell output.
plt.show()