The data can be downloaded from this website: <br>
    https://www.kaggle.com/dcohen21/8anu-climbing-logbook <br>
The downloaded file `database.sqlite` should be put in the same folder as this notebook.

In [None]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd

In [None]:
#import connect
from sqlite3 import connect

## Opening the databases

In [None]:
def qry(q, connection = None):
    """
    Run the SQL query `q` and return the result as a pandas DataFrame.

    Parameters
    ----------
    q : str
        SQL statement handed to `pd.read_sql_query`.
    connection : sqlite3 connection, optional
        Connection to run the query on. When omitted, a connection to
        "database.sqlite" is opened for this call only and closed afterwards.
        A connection supplied by the caller is left open.

    Returns
    -------
    pandas.DataFrame
        The rows returned by the query.
    """
    # The connection must not be created in the default argument: a default is
    # evaluated once, when the function is defined, so every call would share
    # (and, once closed, break on) the same connection object.
    owns_connection = connection is None
    if owns_connection:
        connection = connect("database.sqlite")
    try:
        return pd.read_sql_query(q, connection)
    finally:
        # Only close what this call opened.
        if owns_connection:
            connection.close()

The database consists of 4 tables: user, method, grade, ascent

In [None]:
# sqlite_master is SQLite's schema catalogue; its `name` column lists the objects stored in the database.
tables = qry("SELECT name FROM sqlite_master")
tables

In [None]:
%%time
# Load every table completely into memory (sizes as noted by the original author).
df_user = qry("SELECT * FROM USER")  # 62'593 users x 22 columns
df_grade = qry("SELECT * FROM grade")  # 83 grades x 14 columns
df_method = qry("SELECT * FROM method")  # 5 methods x 4 columns
df_ascent = qry("SELECT * FROM ascent")  # 4'111'877 ascents x 28 columns

&&&&&&&&&&&&&&&&&&&&&&&&&

In [None]:
# Column names of the user table.
df_user.columns

In [None]:
# Column names of the grade table.
df_grade.columns

In [None]:
# Column names of the ascent table.
df_ascent.columns

<b> climb type: 0 = sport, 1 = boulder </b>

In [None]:
# Show the whole method table (it is small enough to display in full).
df_method

In [None]:
%%time
# Use each table's 'id' column as its index so that rows can be looked up by id with .loc.
# The guard keeps the cell re-runnable: once 'id' has become the index it is no longer a
# column, and an unconditional set_index('id') would raise KeyError on a second run.
for frame in (df_user, df_grade, df_ascent):
    if 'id' in frame.columns:
        frame.set_index('id', inplace = True)

###### Grade ID

In [None]:
def grade_id(grade):
    """
    Given a grade (given as a string), returns its id number.

    The rows of `df_grade` are scanned from top to bottom and the index label of the
    first row containing `grade` in any of its columns is returned.

    Raises
    ------
    IndexError
        If no row of `df_grade` contains `grade`.
    """
    for row_label, row_values in zip(df_grade.index, df_grade.values):
        if grade in row_values:
            return row_label
    # Same exception type as running off the end of the table, with a readable message.
    raise IndexError(f"grade {grade!r} was not found in any column of df_grade")

### Extracting active climbers

We extract the "active" climbers, i.e. the users with recorded ascents. 

In [None]:
# A: user ids appearing in the ascent table; B: ids of the user table.
# Active users are the ids present in both.
A = set(df_ascent['user_id'])
B = set(df_user.index)
active_users = A & B

In [None]:
# A - B: user ids that log ascents but have no row in the user table.
print(f"There are {len(A.difference(B))} climbers in the ascent database which do not correspond to anyone in the user database!!")

In [None]:
# .loc needs a list-like indexer: a set is rejected by pandas >= 2.0 and has no defined
# order anyway, so the ids are passed as a sorted list (rows come out ordered by id).
df_active_user = df_user.loc[sorted(active_users)]
df_active_user.head()

In [None]:
# (number of active users, number of columns)
df_active_user.shape

### Extracting various years

In [None]:
def extract_year(Series, nullValue):
    """
    Given a Series of dates, returns the year of every admissible date and NaN for
    every occurrence of 'nullValue' (and for dates that are already missing).

    Parameters
    ----------
    Series : pandas.Series
        Dates, normally of datetime64 dtype (e.g. the output of `pd.to_datetime`).
    nullValue : object
        Sentinel standing for "no date"; entries equal to it are treated as missing.

    Returns
    -------
    pandas.Series
        Years, indexed like the input. The input Series itself is not modified.
    """
    # mask() works on a copy, so the caller's Series keeps its original values.
    cleaned = Series.mask(Series == nullValue)
    if pd.api.types.is_datetime64_any_dtype(cleaned):
        # Vectorised accessor; missing dates (NaT) come out as NaN.
        return cleaned.dt.year
    # Fallback for non-datetime dtypes holding objects with a `.year` attribute.
    return cleaned.apply(lambda date: date.year)
    
def old_extract_year(Series, nullValue):
    """
    Legacy, loop-based variant of `extract_year`; not called anywhere in the visible notebook.

    Walks over the index labels of `Series` and overwrites every entry in place: entries
    equal to `nullValue` become np.nan, every other entry is replaced by its `.year`
    attribute. The function has no return statement, so the result is only visible through
    the mutated `Series`.
    """
    for date in Series.index:
        if Series[date] == nullValue:
            Series[date] = np.nan
        else:
            S_year = Series[date].year
            Series[date] = S_year

##### Birthyears of active users

In [None]:
%%time
# Parse the 'birth' column as dates; with errors='coerce', values that cannot be parsed
# become NaT instead of raising.
birthyears = pd.to_datetime(df_active_user['birth'],errors = 'coerce') 
# No sentinel date is needed here (nullValue=None); keep only the year of each date.
birthyears = extract_year(birthyears, None)

##### Years of ascents

In [None]:
# Smallest non-zero entry of the ascent 'date' column.
# No funny things in that column: it's either a unix timestamp or 0.
df_ascent['date'].loc[lambda dates: dates != 0].min()

In [None]:
# Calendar year of the unix timestamp 315529200 (seconds since the epoch).
# NOTE(review): presumably the minimum returned by the previous cell — confirm.
pd.to_datetime(315529200, unit = 's').year

In [None]:
%%time
# A stored date of 0 means "no date": it converts to the epoch timestamp, which is used
# as the sentinel that extract_year turns into NaN.
nullDate = pd.to_datetime(0, unit='s')
# Interpret the 'date' column as seconds since the epoch.
sendyears = pd.to_datetime(df_ascent['date'], unit = 's')
sendyears = extract_year(sendyears, nullDate)

In [None]:
# Spot check: year of the ascent with id 411123 (sendyears is indexed like df_ascent).
sendyears[411123]

##### Year started climbing

Some active users have a starting year earlier than their birth year! Below we count how many such active climbers there are.

In [None]:
# Year each active user started climbing; a stored 0 is treated as "not given" (NaN).
startedyears = df_active_user['started'].replace(0, np.nan)

In [None]:
# Smallest non-zero entry of 'started'.
# Some non-sensical starting years! At least it's 0 or a year number.
df_active_user['started'].loc[lambda started: started != 0].min()

In [None]:
# A climber is admissible unless the birth year is later than the starting year.
# Comparisons with NaN are False, so climbers with a missing year count as admissible.
admissible = ~(birthyears > startedyears).values
print('There are only',admissible.shape[0] - admissible.sum(), 'climbers with starting year lower than their birth year.' )

##### Age

In [None]:
# Age in years at the time of the snapshot; NaN where the birth year is missing.
ages = 2017 - birthyears #Database is from 13th Sept 2017

In [None]:
# Keep only the finite ages (drops the NaN of users without a birth year).
ages_nonan = ages.values[np.array([np.isfinite(a) for a in ages.values], dtype = bool)]
len(ages_nonan)

### Age and Age at which started climbing 

In [None]:
# Histogram of the ages of active climbers.
# The float array is passed straight to plt.hist instead of being stored back into
# `ages`: overwriting that Series with an ndarray made the cell that builds
# `ages_nonan` fail when re-run.
plt.rcParams['figure.figsize'] = [12, 8]
plt.hist(ages_nonan.astype('float64'),bins = 50)
plt.xlabel('Age of active climbers')
plt.ylabel('Number of active climbers')
plt.show()  # the call parentheses were missing, so show() was never executed

In [None]:
# Mean age of the active climbers with a known birth year.
ages_nonan.mean()

In [None]:
# Standard deviation of those ages.
ages_nonan.std()

In [None]:
# Age at which each active user started climbing, then only the finite values of it.
age_started = startedyears - birthyears
age_started_nonan = age_started.values[np.array([np.isfinite(a) for a in age_started.values], dtype = bool)]

In [None]:
# Histogram of the age at which active climbers started, restricted to ages 0-80.
plt.hist(age_started_nonan.astype('float64'), bins = 50)
plt.axis([0, 80, 0, 5000])
plt.xlabel('Age at which started climbing')
plt.ylabel('Number of active climbers')
plt.show()  # the call parentheses were missing, so show() was never executed

In [None]:
# Mean age at which active climbers started climbing.
age_started_nonan.mean()

In [None]:
# Standard deviation of the starting age.
age_started_nonan.std()

## Max grades with respect to physical characteristics

##### Maximum grades

In [None]:
%%time
# Maximum grade id achieved by every active user, and the matching USA route grade.
# One groupby pass over df_ascent replaces a full boolean scan of the table per user.
max_grade_by_user = df_ascent.groupby('user_id')['grade_id'].max()
max_grades = max_grade_by_user.reindex(df_active_user.index).tolist()
# .loc raises KeyError for a grade id missing from df_grade, as the per-grade lookup did.
max_grades_usa = df_grade['usa_routes'].loc[max_grades].tolist()
df_maxgrades = pd.Series(max_grades, index = df_active_user.index)
df_maxgrades_usa = pd.Series(max_grades_usa, index = df_active_user.index)

###### Gender

In [None]:
# 'sex' flag of every active user (0 is used as male and 1 as female in the cells below).
gender = df_active_user['sex']

In [None]:
# Summing the flag counts the women — assumes 'sex' only takes the values 0 and 1 (TODO confirm).
print('There are', gender.sum(),'active female climbers.') # female is 1

###### Weight

In [None]:
# Weights with a stored 0 treated as "not given", split by the sex flag (0 = male, 1 = female).
weight = df_active_user['weight'].replace(0, np.nan)
weight_male, weight_female = (weight[gender == flag] for flag in (0, 1))

In [None]:
# Keep only the climbers whose weight is a finite number.
weight_male_nonan = weight_male.loc[[np.isfinite(w) for w in weight_male.values]]
weight_female_nonan = weight_female.loc[[np.isfinite(w) for w in weight_female.values]]

In [None]:
# Max grade of exactly those climbers, in the same order as their weights.
maxgrades_vs_weight_male = df_maxgrades.loc[weight_male_nonan.index]
maxgrades_vs_weight_female = df_maxgrades.loc[weight_female_nonan.index]

In [None]:
# Labels for graphs: the non-empty USA route grades of grade ids 35..81, and for each
# label the first grade id that carries it (used as the tick position).
usa_routes = df_grade['usa_routes']
usa_grades = [grade for grade in usa_routes.loc[35:81] if grade != '']
ticks = [(usa_routes == grade).idxmax() for grade in usa_grades]

In [None]:
# Scatter of max grade against weight; men are drawn first, women second, which is the
# order the legend labels rely on.
plt.rcParams['figure.figsize'] = [15, 8]
plt.scatter(weight_male_nonan.astype('float64'), maxgrades_vs_weight_male.astype('float64'))
plt.scatter(weight_female_nonan.astype('float64'), maxgrades_vs_weight_female.astype('float64'))
plt.xlabel('Weight in kg')
plt.ylabel('Max grades')
plt.ylim(ymin = 35, ymax = 85)
# Label the y axis with USA route grades placed at their grade ids.
plt.yticks(ticks, usa_grades)
plt.legend(['male', 'female'], loc = 2)

###### Height

In [None]:
# Heights with a stored 0 treated as "not given", split by the sex flag (0 = male, 1 = female).
height = df_active_user['height'].replace(0, np.nan)
height_male, height_female = (height[gender == flag] for flag in (0, 1))

In [None]:
# Keep only the climbers whose height is a finite number.
height_male_nonan = height_male.loc[[np.isfinite(h) for h in height_male.values]]
height_female_nonan = height_female.loc[[np.isfinite(h) for h in height_female.values]]

In [None]:
# Max grade of exactly those climbers, in the same order as their heights.
maxgrades_vs_height_male = df_maxgrades.loc[height_male_nonan.index]
maxgrades_vs_height_female = df_maxgrades.loc[height_female_nonan.index]

In [None]:
# Scatter of max grade against height; men are drawn first, women second, which is the
# order the legend labels rely on.
plt.rcParams['figure.figsize'] = [25, 12]
plt.scatter(height_male_nonan.astype('float64'), maxgrades_vs_height_male.astype('float64'))
plt.scatter(height_female_nonan.astype('float64'), maxgrades_vs_height_female.astype('float64'))
# Restrict x to 103-240 cm; the y range given here is replaced by the plt.ylim call below.
plt.axis([103, 240, 0, 85])
plt.xlabel('Height in cm')
plt.ylabel('Max grades')
plt.ylim(ymin = 35, ymax = 85)
# Label the y axis with USA route grades placed at their grade ids.
plt.yticks(ticks, usa_grades)
plt.legend(['male', 'female'], loc = 2)

In [None]:
# Mean height of the male climbers with a known height.
height_male_nonan.mean()

In [None]:
# Mean height of the female climbers with a known height.
height_female_nonan.mean()

###### BMI

$BMI = \frac{weight(kg)}{(height(m))^2}$

In [None]:
BMI = weight/np.sqrt(0.01*height)
BMI_male = BMI[(gender == 0)]
BMI_female = BMI[(gender == 1)]

In [None]:
BMI_male_nonan = BMI_male[[np.isfinite(h) for h in BMI_male.values]]
BMI_female_nonan = BMI_female[[np.isfinite(h) for h in BMI_female.values]]

In [None]:
plt.rcParams['figure.figsize'] = [25, 12]
plt.hist(BMI_male_nonan.astype('float64'), bins = 1000)
plt.hist(BMI_female_nonan.astype('float64'), bins = 1000)
plt.xlim(xmin = 30, xmax = 80)
plt.xlabel('BMI')
plt.ylabel('Number of active climbers')
plt.legend(['male', 'female'], loc = 2)

In [None]:
# Mean BMI of the male climbers.
BMI_male_nonan.mean()

In [None]:
# Standard deviation of the male BMI.
BMI_male_nonan.std()

In [None]:
# Median of the male BMI.
BMI_male_nonan.median()

&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
<br>
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

## Average number of years to climb a grade

In [None]:
def grade_type_sends(grade, boulder = False):
    """
    Select the ascents logged at one grade and one climbing type.

    Input:
    grade: string, looked up with grade_id.
    boulder: boolean. False is sport, True is boulder (default = False)
    Output: (rows of df_ascent matching both criteria, entries of sendyears for the same rows).
    """
    wanted_grade = df_ascent['grade_id'] == grade_id(grade)
    wanted_type = df_ascent['climb_type'] == boulder
    selection = wanted_grade & wanted_type

    return df_ascent[selection], sendyears[selection]

def achieve_grade(sample, grade, boulder = False):
    """
    Given a grade and a sample of active users, returns the average age and average number of climbing
    years after which said grade was achieved.

    For every climber of the sample, the year of the first ascent at that grade is compared
    with the birth year and with the year the climber started. Climbers without a dated
    ascent at the grade, or with a missing birth/starting year, are left out of the
    corresponding average.

    Input:
    sample: A sub-dataframe of active users.
    grade: string
    boulder: boolean. False is sport, True is boulder (default = False)
    Output: (2,) array: [average age, average number of climbing years].
    """
    relevent_sends, relevent_years = grade_type_sends(grade, boulder)
    # Year of the first ascent per climber in one pass (min skips NaN; a climber with only
    # undated ascents gets NaN) instead of one boolean scan of the sends per climber.
    first_year_by_user = relevent_years.groupby(relevent_sends['user_id']).min()
    # Align everything on the sample; climbers without a matching ascent become NaN.
    first_year = first_year_by_user.reindex(sample.index).astype('float64').values
    birth = birthyears.reindex(sample.index).astype('float64').values
    started = startedyears.reindex(sample.index).astype('float64').values
    result = np.column_stack((first_year - birth, first_year - started))

    return np.nanmean(result, axis = 0) #np.nanmean is mean ignoring the nan's

In [None]:
def achieve_grades(sample, grades, boulder = False):
    """
    Given a list of grades and a sample of active users, returns an np.array corresponding to the following:
    Every row corresponds to a grade; first column is average age; second column is average number of climbing years.

    The result always has shape (len(grades), 2): a single grade gives one row (it used to
    come back as a flat (2,) array) and an empty list gives a (0, 2) array.
    """
    rows = [achieve_grade(sample, grade, boulder) for grade in grades]
    if not rows:
        return np.empty((0, 2))

    return np.vstack(rows)

In [None]:
# Random subset of 1000 active users for quick experiments; the fixed seed makes the
# subset (and every number derived from it) identical on each run.
sample = df_active_user.sample(n=1000, random_state=0)

In [None]:
%%time
# [average age, average number of climbing years] at the first '6b+' sport ascent, on the sample.
achieve_grade(sample, '6b+')

In [None]:
%%time
# Same statistic for '7a+', computed over all active users.
achieve_grade(df_active_user, '7a+')

In [None]:
%%time
# One row per entry of usa_grades (columns: average age, average climbing years), on the sample.
average_achieve_grade = achieve_grades(sample, usa_grades)

In [None]:
%%time
# Same table computed over all active users.
fulldataset_achieve_grade = achieve_grades(df_active_user, usa_grades)

In [None]:
# Per grade: average age (first series) and average number of climbing years (second series)
# at the first ascent of that grade, over all active users.
plt.rcParams['figure.figsize'] = [30, 10]
plt.scatter(usa_grades, fulldataset_achieve_grade[:,0], s = 100)
plt.scatter(usa_grades, fulldataset_achieve_grade[:,1], s = 100)
plt.legend(['age', 'nb of years'], loc = 2)

In [None]:
# Active users who started climbing when older than 25 (users with an unknown starting
# age are excluded, since a comparison with NaN is False).
started_after_25 = df_active_user[(age_started > 25)]
started_after_25.shape

In [None]:
%%time
# Per-grade averages for the climbers who started after 25 (overwrites the earlier average_achieve_grade).
average_achieve_grade = achieve_grades(started_after_25, usa_grades)

In [None]:
# Average number of climbing years needed per grade (column 1) for the most recently
# computed average_achieve_grade.
plt.rcParams['figure.figsize'] = [20, 10]
plt.scatter(usa_grades, average_achieve_grade[:,1])

In [None]:
# Active users who started climbing when older than 30 (users with an unknown starting
# age are excluded, since a comparison with NaN is False).
started_after_30 = df_active_user[(age_started > 30)]
started_after_30.shape

In [None]:
%%time
# Per-grade averages for the climbers who started after 30 (overwrites the earlier average_achieve_grade).
average_achieve_grade = achieve_grades(started_after_30, usa_grades)

In [None]:
# Average number of climbing years needed per grade (column 1) for the most recently
# computed average_achieve_grade.
plt.rcParams['figure.figsize'] = [20, 10]
plt.scatter(usa_grades, average_achieve_grade[:,1])

In [None]:
# Grade progression of a single climber: year of every logged ascent against its grade.
# The cell used `logbook_active_users` and `df_ascent_id`, neither of which is defined
# anywhere in this notebook, so it failed on a fresh kernel.
# NOTE(review): this assumes 37502 is a user id and that df_ascent_id was df_ascent
# indexed by ascent id — confirm against the original intent.
user_ascents = df_ascent['user_id'] == 37502
plt.scatter(sendyears[user_ascents], df_ascent.loc[user_ascents, 'grade_id'])
plt.yticks(ticks, usa_grades)
plt.show()  # the call parentheses were missing, so show() was never executed

In [None]:
# Id of the first logged ascent graded '9c'.
# `df_ascent_id` is not defined anywhere in this notebook; df_ascent (indexed by ascent
# id) is used instead. NOTE(review): assumes both were meant to be the same table — confirm.
is_nine_c = df_ascent['grade_id'] == grade_id('9c')
# idxmax() on an all-False mask would silently return the first ascent, so guard for "no match".
nine_c = is_nine_c.idxmax() if is_nine_c.any() else None
nine_c

In [None]:
# Full record of the ascent with id 2407853.
# NOTE(review): presumably the id found as `nine_c` in the previous cell — confirm.
df_ascent.loc[2407853]