In [1]:
from pathlib import Path

import pandas as pd

# Display options: allow up to 500 columns and 150 characters per cell
# before pandas truncates the rendered output.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 150)

# Paths are relative to the notebook's working directory; data files live in ../data.
ROOT_PATH = Path('..')
DATA_PATH = ROOT_PATH / 'data'
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate exists() test.
DATA_PATH.mkdir(parents=True, exist_ok=True)

# 2020 Population Statistics

## External Websites
- The population of the US in 2020 was about 330 million.
    - US Women Population 2020: 167.5 million
    - US Men Population 2020: 162.4 million

- 15%: percentage of US adults who actively play chess: https://www.chess.com/news/view/how-popular-is-chess-8306
    - Thus about 49.5 million Americans (15% of ~330 million) played chess somewhat actively. Note that this applies the adult rate to the whole population.
- Share of women among FIDE players: ~10.1% overall; ~8.2% in the US:
  https://slate.com/technology/2020/12/why-are-the-best-chess-players-men.html


## 2020-12-30 FIDE Population Extraction

In [2]:
# Read the zipped 2020-12-30 FIDE standard-ratings XML export into a DataFrame.
df = pd.read_xml(
    DATA_PATH / '2020-12-30_fide-standard-ratings-xml.zip',
    compression='zip',
)

In [8]:
# Count all listed players by the value of the `sex` column ('M' / 'F').
n_w = df['sex'].eq('F').sum()
n_m = df['sex'].eq('M').sum()
# Men-to-women ratio across the whole list.
m_w_ratio = n_m / n_w
(n_m, n_w, n_m + n_w)

(324249, 38244, 362493)

In [13]:
# Players whose `title` is exactly 'GM', then split by the `sex` column.
gm_df = df.loc[df['title'].eq('GM')]
gm_df_m = gm_df.loc[gm_df['sex'].eq('M')]
gm_df_w = gm_df.loc[gm_df['sex'].eq('F')]
# Displayed as (women, men, total).
(len(gm_df_w), len(gm_df_m), len(gm_df_m) + len(gm_df_w))

(37, 1688, 1725)

In [12]:
# Players rated strictly above 2400, then split by the `sex` column.
gt2400_df = df.loc[df['rating'].gt(2400)]
gt2400_df_m = gt2400_df.loc[gt2400_df['sex'].eq('M')]
gt2400_df_w = gt2400_df.loc[gt2400_df['sex'].eq('F')]
# Displayed as (women, men, total).
(len(gt2400_df_w), len(gt2400_df_m), len(gt2400_df_m) + len(gt2400_df_w))

(77, 3035, 3112)

In [10]:
# Rows with country code 'USA' that also have a non-null rating.
us_df = df.loc[
    df['country'].eq('USA')
    & df['rating'].notna()
]
n_us_w = us_df['sex'].eq('F').sum()
n_us_m = us_df['sex'].eq('M').sum()
# Displayed as (men, women, total).
(n_us_m, n_us_w, n_us_m + n_us_w)

(6191, 499, 6690)

In [11]:
# Subset of the US frame whose `title` is exactly 'GM', counted by sex.
us_gm_df = us_df.loc[us_df['title'].eq('GM')]
n_us_gm_w = us_gm_df['sex'].eq('F').sum()
n_us_gm_m = us_gm_df['sex'].eq('M').sum()
# Displayed as (men, women, total).
(n_us_gm_m, n_us_gm_w, n_us_gm_m + n_us_gm_w)

(94, 1, 95)

In [22]:
# Every player with a non-null `title`.
titled_df = df.loc[df['title'].notna()]
titled_df_m = titled_df.loc[titled_df['sex'].eq('M')]
# For a fair comparison the titles on both sides should be on par with each
# other, so women are kept only where `title` differs from `w_title`.
# NOTE(review): presumably this drops rows whose only title is a women's
# title; confirm against the FIDE export schema.
titled_df_w = titled_df.loc[
    titled_df['sex'].eq('F')
    & titled_df['title'].ne(titled_df['w_title'])
]
# Displayed as (women, men, total).
(len(titled_df_w), len(titled_df_m), len(titled_df_m) + len(titled_df_w))

(215, 15470, 15685)

In [24]:
# The 1000 highest-rated rows (sorted by `rating`, descending), counted by sex.
# NOTE(review): the recorded output below this cell sums to 100, not 1000,
# so it looks stale relative to this code; re-run to confirm.
top1k_df = df.sort_values(by='rating', ascending=False).iloc[:1000]
n_top1k_m = top1k_df['sex'].eq('M').sum()
n_top1k_w = top1k_df['sex'].eq('F').sum()
# Displayed as (men, women, total).
(n_top1k_m, n_top1k_w, n_top1k_m + n_top1k_w)

(98, 2, 100)