# ***Data preprocessing***


## *В данном документе описан процесс предобработки исторических данных о теннисных матчах, полученных из [БД энтузиастов](https://github.com/JeffSackmann/tennis_atp "Jeff Sackmann Github repository")*


## Skills changelog:
### Это первая попытка использования Machine Learning как инструмента в принципе, и поэтому
- есть только общее понимание того, что из себя представляет инструмент Machine Learning;
- есть огромное желание попробовать ML на практике;
- используются любые руководства для начинающих;
- отсутствуют специфические знания и навыки, а именно - для подготовки данных, обучения моделей и т.п.;
- все, чем приходится пока руководствоваться - логика и здравый смысл.


## Sources:
- фыв

***Примечание:*** изначально предобработка данных была реализована на Ruby (изучение Python находилось на самом начальном этапе, а желание приступить к проекту было уже давно). Однако, для удобства использования единого формата, в данной статье реализованы все те же шаги, только с использованием Python.



# Разделы:
## [1. Отбор турниров для анализа](#section1)
## [2. Очистка данных](#section2)
## [3. Окончательная группировка данных](#section3)

## <a id='section1'></a>
## 1. Отбор турниров для анализа
### Для данного учебного проекта выбраны матчи Ассоциации теннисистов-профессионалов (ATP) (без учета квалификационных матчей - существует распространенное мнение, что в таких матчах игроки не демонстрируют весь свой потенциал, и, соответственно, данные об этих матчах могут носить дезинформирующий характер для ML-модели).

Так выглядит репозиторий, содержащий информацию о матчах разных годов и не только:
![Содержание репозитория с информацией о теннисных матчах 1](../../img/notebooks/series_1/preprocessing/tennis-repo-overview-1.jpg "Tennis repo overview")
...
![Содержание репозитория с информацией о теннисных матчах 1](../../img/notebooks/series_1/preprocessing/tennis-repo-overview-2.jpg "Tennis repo overview")

Таблицы содержат строки с информацией матча:
- турнир (id турнира, название, покрытие, дата и др.);
- игроки (id, имя, рост, возраст и др.);
- очки (эйсы, двойные ошибки, подача с первой подачи, со второй, отыгранные брейк-поинты и др.).

Вначале импортируем необходимые библиотеки и, для удобства в дальнейшем, перейдем в корневой каталог репозитория:

In [1]:
### !pip install pandas
import pandas as pd
pd.options.display.max_columns = None
import urllib
import urllib.request
import os
import glob
import csv
import re


In [4]:
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '../../'))
print('Root directory:', ROOT_DIR)
downloaded_files_path = os.path.join(ROOT_DIR, 'data', 'match_data_downloaded')

/home/vl/Repos/machine-learning-on-tennis


Так выглядит таблица с данными о матчах:

In [72]:
url = 'https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2007.csv'

def show_csv(file):
    data = pd.read_csv(file, header=0, encoding="utf-8", engine='python')
    return data

show_csv(url)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,winner_rank_points,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,loser_rank,loser_rank_points,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,2007-500,Halle,Grass,32,A,20070611,1,102318,,LL,Andrei Pavel,R,183.0,ROU,33.368925,124.0,345.0,103694,,,Olivier Rochus,R,168.0,BEL,26.392882,51.0,660.0,6-3 7-6(5),3,R32,90.0,14.0,4.0,73.0,42.0,34.0,15.0,10.0,1.0,1.0,3.0,2.0,61.0,38.0,28.0,13.0,11.0,1.0,3.0
1,2007-500,Halle,Grass,32,A,20070611,2,103813,,,Jarkko Nieminen,L,185.0,FIN,25.883641,22.0,1240.0,104019,,,Kristof Vliegen,R,193.0,BEL,24.969199,56.0,614.0,6-3 6-1,3,R32,52.0,1.0,0.0,45.0,33.0,29.0,5.0,8.0,0.0,0.0,2.0,2.0,44.0,30.0,17.0,5.0,8.0,1.0,5.0
2,2007-500,Halle,Grass,32,A,20070611,3,103794,,,Benjamin Becker,R,178.0,GER,25.984942,44.0,734.0,104559,,,Teymuraz Gabashvili,R,188.0,RUS,22.050650,82.0,511.0,6-4 7-6(1),3,R32,99.0,8.0,1.0,61.0,31.0,27.0,18.0,11.0,0.0,1.0,5.0,6.0,95.0,44.0,35.0,19.0,11.0,12.0,14.0
3,2007-500,Halle,Grass,32,A,20070611,4,102967,,,Marc Gicquel,R,188.0,FRA,30.198494,42.0,746.0,103900,7.0,,David Nalbandian,R,180.0,ARG,25.440110,25.0,1115.0,5-7 6-2 6-4,3,R32,111.0,5.0,2.0,90.0,49.0,39.0,21.0,15.0,2.0,4.0,8.0,7.0,86.0,47.0,38.0,17.0,15.0,2.0,6.0
4,2007-500,Halle,Grass,32,A,20070611,5,104607,4.0,,Tomas Berdych,R,196.0,CZE,21.730322,13.0,1875.0,103017,,WC,Nicolas Kiefer,R,183.0,GER,29.932923,,,6-4 7-6(3),3,R32,100.0,9.0,1.0,60.0,36.0,31.0,15.0,11.0,2.0,3.0,9.0,4.0,80.0,43.0,33.0,15.0,11.0,6.0,8.0
5,2007-500,Halle,Grass,32,A,20070611,6,102148,,,Fabrice Santoro,R,178.0,FRA,34.502396,53.0,621.0,103325,,Q,Wesley Moodie,R,196.0,RSA,28.320329,139.0,308.0,6-3 7-6(9),3,R32,86.0,3.0,0.0,69.0,41.0,37.0,17.0,11.0,3.0,3.0,8.0,3.0,71.0,45.0,35.0,14.0,10.0,4.0,5.0
6,2007-500,Halle,Grass,32,A,20070611,7,104217,,Q,Simon Stadler,L,183.0,GER,23.893224,227.0,167.0,104999,,WC,Mischa Zverev,L,190.0,GER,19.802875,128.0,334.0,6-4 7-6(5),3,R32,82.0,2.0,2.0,77.0,47.0,33.0,16.0,11.0,3.0,4.0,3.0,1.0,63.0,49.0,35.0,7.0,11.0,2.0,4.0
7,2007-500,Halle,Grass,32,A,20070611,8,104022,6.0,,Mikhail Youzhny,R,183.0,RUS,24.960986,14.0,1800.0,103188,,,Michael Russell,R,173.0,USA,29.111567,70.0,554.0,6-2 6-2,3,R32,60.0,4.0,2.0,39.0,20.0,19.0,13.0,8.0,0.0,0.0,0.0,1.0,42.0,27.0,19.0,3.0,8.0,3.0,7.0
8,2007-500,Halle,Grass,32,A,20070611,9,103529,,Q,Aisam Ul Haq Qureshi,R,183.0,PAK,27.233402,304.0,114.0,104755,5.0,,Richard Gasquet,R,185.0,FRA,20.980151,11.0,1900.0,7-6(8) 6-4,3,R32,98.0,12.0,6.0,78.0,47.0,38.0,15.0,11.0,6.0,7.0,6.0,2.0,64.0,34.0,25.0,20.0,11.0,1.0,3.0
9,2007-500,Halle,Grass,32,A,20070611,10,104259,,,Philipp Kohlschreiber,R,178.0,GER,23.652293,34.0,890.0,103171,,Q,Raemon Sluiter,R,185.0,NED,29.160849,167.0,263.0,7-6(5) 7-5,3,R32,95.0,9.0,2.0,77.0,42.0,35.0,21.0,12.0,2.0,2.0,12.0,5.0,71.0,42.0,34.0,16.0,12.0,0.0,1.0


Если посмотреть таблицы за разные года, то можно увидеть, что данные по очкам за матч (крайние правые столбцы) начинают стабильно появляться в файлах с 1991 года по текущее время. Отчеты за эти года и будут использованы в данном проекте.

Загрузим эти файлы в наш проект:

In [33]:
def download_raw_data_files():
    counter = 0
    for year in range(1991, 2019):
        output_template = os.path.join(downloaded_files_path, 'atp_matches_')
        extension = '.csv'
        output = str(output_template + str(year) + extension)
        print('Downloading', f"https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{year}.csv")
        urllib.request.urlretrieve(f"https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{year}.csv",
                                   output)
        counter += 1
    print(counter, 'files downloaded.')
    
download_raw_data_files()

Downloading https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_1991.csv
Downloading https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_1992.csv
Downloading https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_1993.csv
Downloading https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_1994.csv
Downloading https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_1995.csv
Downloading https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_1996.csv
Downloading https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_1997.csv
Downloading https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_1998.csv
Downloading https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_1999.csv
Downloading https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2000.csv
Downloading https://

Объединим все файлы в один CSV-файл:

In [65]:
def _get_files_in_directory(path, ext):
    all_filenames = []
    for root,dirs,files in os.walk(path):
        # create a list of all CSVs with full path (i.e. directory)
        func = lambda f: os.path.join(root, f)
        all_filenames = sorted([func(file) for file in files if re.search(r'atp_matches_....\.csv$', file)])
    return all_filenames

def _get_headers(path, file):
    file_dir = os.path.join(path, file)
    headers = ''
    with open(file_dir, 'r', encoding='utf-8-sig') as file:
        reader = csv.reader(file)
        headers = next(reader)
    return headers

def _write_headers(file, flag, headers):
    csv_merge = open(file, flag)
    csv_merge.write(",".join(headers))
    csv_merge.write('\n')
    csv_merge.close()
    
def _write_data(input_file, output_file, flag):
    with open(input_file, 'r') as input, open(output_file, flag) as output:
        reader = csv.reader(input)
        writer = csv.writer(output, quotechar=None)
        # skip first line with headers
        next(reader)
        for r in reader:
            writer.writerow(r)

def _merge_csv_files(path, input_files_list, output_name):
    full_output = os.path.join(path, output_name)
    headers = _get_headers(path, input_files_list[0])
    
    _write_headers(full_output, 'w', headers)
    for file in input_files_list:
        _write_data(file, full_output, 'a')

        
raw_match_data_files = _get_files_in_directory(downloaded_files_path, 'csv')
_merge_csv_files(downloaded_files_path,
                 raw_match_data_files,
                 'atp_matches_1991-2018.csv')

Узнаем, сначала не загружая весь файл в DataFrame, сколько итоговый файл содержит строк и столбцов:

In [5]:
raw_match_data_file = 'atp_matches_1991-2018.csv'

raw_data_full_path = os.path.join(downloaded_files_path, raw_match_data_file)


with open(raw_data_full_path, "r") as file:
    reader = csv.reader(file)
    # read first line to count features
    feature_count = len(next(reader))
    # as we have already read first line (describing columns) there only left lines with match data
    row_count = sum(1 for row in reader) - 1

raw_data_file_size = os.path.getsize(raw_data_full_path) / 1024 / 1024
print('File size:', raw_data_file_size, 'MB')
print('Number of lines:', row_count)
print('Number of features:', feature_count)

File size: 19.421184539794922 MB
Number of lines: 92019
Number of features: 49


Как видно, файл не очень велик, поэтому можем работать с ним в DataFrame: 

In [34]:
match_data = pd.read_csv(raw_data_full_path, header=0, encoding="utf-8", engine='python')
match_data

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,winner_rank_points,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,loser_rank,loser_rank_points,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,1991-475,San Marino,Clay,32.0,A,19910729.0,1.0,101649.0,1.0,,Guillermo Perez Roldan,R,178.0,ARG,21.771389,33.0,786.0,100800.0,,,Diego Perez,R,178.0,URU,29.464750,201.0,153.0,6-0 6-1,3.0,R32,59.0,0.0,0.0,41.0,34.0,25.0,4.0,7.0,1.0,1.0,0.0,2.0,40.0,22.0,9.0,6.0,6.0,7.0,12.0
1,1991-475,San Marino,Clay,32.0,A,19910729.0,2.0,101035.0,,Q,Menno Oosting,R,180.0,NED,27.197810,441.0,36.0,101216.0,,WC,Olli Rahnasto,R,183.0,FIN,25.582478,548.0,22.0,6-0 6-1,3.0,R32,55.0,2.0,1.0,49.0,43.0,30.0,3.0,7.0,1.0,1.0,1.0,3.0,34.0,17.0,6.0,6.0,6.0,1.0,6.0
2,1991-475,San Marino,Clay,32.0,A,19910729.0,3.0,101534.0,,,Libor Nemecek,R,168.0,CZE,22.754278,208.0,143.0,100536.0,,,Fernando Luna,R,175.0,ESP,33.262149,223.0,119.0,6-2 6-1,3.0,R32,75.0,0.0,0.0,49.0,40.0,21.0,6.0,7.0,3.0,5.0,0.0,0.0,54.0,42.0,15.0,4.0,8.0,6.0,13.0
3,1991-475,San Marino,Clay,32.0,A,19910729.0,4.0,101475.0,,,Carlos Costa,R,183.0,ESP,23.266256,142.0,249.0,101460.0,8.0,,Jose Francisco Altur,L,190.0,ESP,23.345654,121.0,315.0,6-4 7-6(5),3.0,R32,100.0,1.0,2.0,74.0,44.0,31.0,12.0,11.0,4.0,7.0,0.0,4.0,69.0,46.0,29.0,9.0,11.0,3.0,7.0
4,1991-475,San Marino,Clay,32.0,A,19910729.0,5.0,101686.0,4.0,,Franco Davin,L,173.0,ARG,21.544148,47.0,632.0,101876.0,,Q,Massimo Ardinghi,R,175.0,ITA,20.396988,257.0,92.0,6-1 6-1,3.0,R32,55.0,0.0,2.0,43.0,29.0,23.0,7.0,7.0,1.0,1.0,0.0,2.0,44.0,18.0,6.0,11.0,7.0,2.0,7.0
5,1991-475,San Marino,Clay,32.0,A,19910729.0,6.0,101124.0,,,Paolo Cane,R,180.0,ITA,26.302533,124.0,300.0,101416.0,,,Joao Cunha Silva,R,173.0,POR,23.668720,141.0,253.0,7-6(5) 6-3,3.0,R32,112.0,6.0,2.0,77.0,29.0,19.0,27.0,11.0,4.0,7.0,1.0,4.0,67.0,35.0,24.0,11.0,10.0,2.0,6.0
6,1991-475,San Marino,Clay,32.0,A,19910729.0,7.0,101370.0,,,Paolo Pambianco,R,190.0,ITA,24.049281,189.0,163.0,101792.0,,,Slava Dosedel,R,183.0,CZE,20.955510,140.0,259.0,7-6(2) 4-6 6-2,3.0,R32,160.0,3.0,10.0,106.0,52.0,39.0,22.0,15.0,6.0,10.0,3.0,10.0,102.0,60.0,38.0,18.0,15.0,4.0,9.0
7,1991-475,San Marino,Clay,32.0,A,19910729.0,8.0,101746.0,5.0,,Renzo Furlan,R,175.0,ITA,21.199179,51.0,594.0,101368.0,,,Christian Miniussi,R,185.0,ARG,24.065708,161.0,206.0,6-3 6-1,3.0,R32,76.0,1.0,1.0,42.0,27.0,21.0,6.0,8.0,0.0,2.0,1.0,6.0,59.0,25.0,13.0,12.0,8.0,4.0,10.0
8,1991-475,San Marino,Clay,32.0,A,19910729.0,9.0,101243.0,6.0,,Roberto Azar,L,185.0,ARG,25.355236,108.0,349.0,101031.0,,,Massimo Cierro,R,173.0,ITA,27.225188,197.0,158.0,6-3 6-1,3.0,R32,45.0,1.0,0.0,38.0,32.0,23.0,5.0,8.0,0.0,1.0,0.0,2.0,49.0,21.0,12.0,11.0,8.0,1.0,6.0
9,1991-475,San Marino,Clay,32.0,A,19910729.0,10.0,101490.0,,,Daniel Orsanic,L,183.0,ARG,23.129363,145.0,238.0,101420.0,,,Vaclav Roubicek,R,185.0,CZE,23.624914,159.0,215.0,6-2 6-2,3.0,R32,65.0,4.0,0.0,49.0,32.0,27.0,6.0,8.0,2.0,3.0,1.0,6.0,57.0,37.0,18.0,7.0,8.0,4.0,9.0


Отсортируем данные так, чтобы матчи шли от старых вначале к новым в конце.
Сделать это можно упорядочив данные по:
- дате турнира;
- id турнира;
- стадии турнира (от первых матчей в турнирной сетке до финала).

Если с первыми двумя все понятно, то по третьему пункту нужно посмотреть, какие стадии вообще есть в таблице:

In [105]:
match_data['round'].unique()

array(['R32', 'R16', 'QF', 'SF', 'F', 'R64', 'R128', 'RR', 'BR', nan],
      dtype=object)

In [None]:
Видно, что в каких-то записях отсутствует информация о стадии турнира. Посмотрим, что с этими матчами не так:

In [35]:
match_data.loc[match_data['round'].isnull()]

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,winner_rank_points,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,loser_rank,loser_rank_points,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
85604,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
85606,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
85608,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
85610,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
85612,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
85614,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
85616,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
85618,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
85620,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
85622,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
match_data.loc[match_data['round'].isnull()].shape[0]

Как видим, в этих строках отсутствует какая-либо информация - можем их удалить из таблицы:

In [42]:
# use drop=True to avoid old index being added as a column
#match_data_clean = match_data.dropna(how='all').reset_index(drop=True)
#print('rows:', match_data_clean.shape[0])
#match_data_clean
match_data = match_data.dropna(how='all').reset_index(drop=True)
print('rows:', match_data.shape[0])
match_data

rows: 91957


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,winner_rank_points,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,loser_rank,loser_rank_points,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,1991-475,San Marino,Clay,32.0,A,19910729.0,1.0,101649.0,1.0,,Guillermo Perez Roldan,R,178.0,ARG,21.771389,33.0,786.0,100800.0,,,Diego Perez,R,178.0,URU,29.464750,201.0,153.0,6-0 6-1,3.0,R32,59.0,0.0,0.0,41.0,34.0,25.0,4.0,7.0,1.0,1.0,0.0,2.0,40.0,22.0,9.0,6.0,6.0,7.0,12.0
1,1991-475,San Marino,Clay,32.0,A,19910729.0,2.0,101035.0,,Q,Menno Oosting,R,180.0,NED,27.197810,441.0,36.0,101216.0,,WC,Olli Rahnasto,R,183.0,FIN,25.582478,548.0,22.0,6-0 6-1,3.0,R32,55.0,2.0,1.0,49.0,43.0,30.0,3.0,7.0,1.0,1.0,1.0,3.0,34.0,17.0,6.0,6.0,6.0,1.0,6.0
2,1991-475,San Marino,Clay,32.0,A,19910729.0,3.0,101534.0,,,Libor Nemecek,R,168.0,CZE,22.754278,208.0,143.0,100536.0,,,Fernando Luna,R,175.0,ESP,33.262149,223.0,119.0,6-2 6-1,3.0,R32,75.0,0.0,0.0,49.0,40.0,21.0,6.0,7.0,3.0,5.0,0.0,0.0,54.0,42.0,15.0,4.0,8.0,6.0,13.0
3,1991-475,San Marino,Clay,32.0,A,19910729.0,4.0,101475.0,,,Carlos Costa,R,183.0,ESP,23.266256,142.0,249.0,101460.0,8.0,,Jose Francisco Altur,L,190.0,ESP,23.345654,121.0,315.0,6-4 7-6(5),3.0,R32,100.0,1.0,2.0,74.0,44.0,31.0,12.0,11.0,4.0,7.0,0.0,4.0,69.0,46.0,29.0,9.0,11.0,3.0,7.0
4,1991-475,San Marino,Clay,32.0,A,19910729.0,5.0,101686.0,4.0,,Franco Davin,L,173.0,ARG,21.544148,47.0,632.0,101876.0,,Q,Massimo Ardinghi,R,175.0,ITA,20.396988,257.0,92.0,6-1 6-1,3.0,R32,55.0,0.0,2.0,43.0,29.0,23.0,7.0,7.0,1.0,1.0,0.0,2.0,44.0,18.0,6.0,11.0,7.0,2.0,7.0
5,1991-475,San Marino,Clay,32.0,A,19910729.0,6.0,101124.0,,,Paolo Cane,R,180.0,ITA,26.302533,124.0,300.0,101416.0,,,Joao Cunha Silva,R,173.0,POR,23.668720,141.0,253.0,7-6(5) 6-3,3.0,R32,112.0,6.0,2.0,77.0,29.0,19.0,27.0,11.0,4.0,7.0,1.0,4.0,67.0,35.0,24.0,11.0,10.0,2.0,6.0
6,1991-475,San Marino,Clay,32.0,A,19910729.0,7.0,101370.0,,,Paolo Pambianco,R,190.0,ITA,24.049281,189.0,163.0,101792.0,,,Slava Dosedel,R,183.0,CZE,20.955510,140.0,259.0,7-6(2) 4-6 6-2,3.0,R32,160.0,3.0,10.0,106.0,52.0,39.0,22.0,15.0,6.0,10.0,3.0,10.0,102.0,60.0,38.0,18.0,15.0,4.0,9.0
7,1991-475,San Marino,Clay,32.0,A,19910729.0,8.0,101746.0,5.0,,Renzo Furlan,R,175.0,ITA,21.199179,51.0,594.0,101368.0,,,Christian Miniussi,R,185.0,ARG,24.065708,161.0,206.0,6-3 6-1,3.0,R32,76.0,1.0,1.0,42.0,27.0,21.0,6.0,8.0,0.0,2.0,1.0,6.0,59.0,25.0,13.0,12.0,8.0,4.0,10.0
8,1991-475,San Marino,Clay,32.0,A,19910729.0,9.0,101243.0,6.0,,Roberto Azar,L,185.0,ARG,25.355236,108.0,349.0,101031.0,,,Massimo Cierro,R,173.0,ITA,27.225188,197.0,158.0,6-3 6-1,3.0,R32,45.0,1.0,0.0,38.0,32.0,23.0,5.0,8.0,0.0,1.0,0.0,2.0,49.0,21.0,12.0,11.0,8.0,1.0,6.0
9,1991-475,San Marino,Clay,32.0,A,19910729.0,10.0,101490.0,,,Daniel Orsanic,L,183.0,ARG,23.129363,145.0,238.0,101420.0,,,Vaclav Roubicek,R,185.0,CZE,23.624914,159.0,215.0,6-2 6-2,3.0,R32,65.0,4.0,0.0,49.0,32.0,27.0,6.0,8.0,2.0,3.0,1.0,6.0,57.0,37.0,18.0,7.0,8.0,4.0,9.0


Теперь убедимся, что со стадиями турнира все в порядке:

In [43]:
#match_data_clean['round'].unique()
match_data['round'].unique()

array(['R32', 'R16', 'QF', 'SF', 'F', 'R64', 'R128', 'RR', 'BR'],
      dtype=object)

Проверим оставшиеся записи по колонкам "дата турнира" и "id турнира" на отсутствие нулевых записей:

In [44]:
match_data.loc[match_data['tourney_date'].isnull()]

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,winner_rank_points,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,loser_rank,loser_rank_points,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced


In [46]:
match_data.loc[match_data['tourney_id'].isnull()]

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,winner_rank_points,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,loser_rank,loser_rank_points,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced


Удалим записи о матчах групповых этапов турниров (RR) (вновь используя опцию drop=True):

In [47]:
match_data_clean = match_data[match_data['round'] != 'RR'].reset_index(drop=True)
match_data_clean

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,winner_rank_points,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,loser_rank,loser_rank_points,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,1991-475,San Marino,Clay,32.0,A,19910729.0,1.0,101649.0,1.0,,Guillermo Perez Roldan,R,178.0,ARG,21.771389,33.0,786.0,100800.0,,,Diego Perez,R,178.0,URU,29.464750,201.0,153.0,6-0 6-1,3.0,R32,59.0,0.0,0.0,41.0,34.0,25.0,4.0,7.0,1.0,1.0,0.0,2.0,40.0,22.0,9.0,6.0,6.0,7.0,12.0
1,1991-475,San Marino,Clay,32.0,A,19910729.0,2.0,101035.0,,Q,Menno Oosting,R,180.0,NED,27.197810,441.0,36.0,101216.0,,WC,Olli Rahnasto,R,183.0,FIN,25.582478,548.0,22.0,6-0 6-1,3.0,R32,55.0,2.0,1.0,49.0,43.0,30.0,3.0,7.0,1.0,1.0,1.0,3.0,34.0,17.0,6.0,6.0,6.0,1.0,6.0
2,1991-475,San Marino,Clay,32.0,A,19910729.0,3.0,101534.0,,,Libor Nemecek,R,168.0,CZE,22.754278,208.0,143.0,100536.0,,,Fernando Luna,R,175.0,ESP,33.262149,223.0,119.0,6-2 6-1,3.0,R32,75.0,0.0,0.0,49.0,40.0,21.0,6.0,7.0,3.0,5.0,0.0,0.0,54.0,42.0,15.0,4.0,8.0,6.0,13.0
3,1991-475,San Marino,Clay,32.0,A,19910729.0,4.0,101475.0,,,Carlos Costa,R,183.0,ESP,23.266256,142.0,249.0,101460.0,8.0,,Jose Francisco Altur,L,190.0,ESP,23.345654,121.0,315.0,6-4 7-6(5),3.0,R32,100.0,1.0,2.0,74.0,44.0,31.0,12.0,11.0,4.0,7.0,0.0,4.0,69.0,46.0,29.0,9.0,11.0,3.0,7.0
4,1991-475,San Marino,Clay,32.0,A,19910729.0,5.0,101686.0,4.0,,Franco Davin,L,173.0,ARG,21.544148,47.0,632.0,101876.0,,Q,Massimo Ardinghi,R,175.0,ITA,20.396988,257.0,92.0,6-1 6-1,3.0,R32,55.0,0.0,2.0,43.0,29.0,23.0,7.0,7.0,1.0,1.0,0.0,2.0,44.0,18.0,6.0,11.0,7.0,2.0,7.0
5,1991-475,San Marino,Clay,32.0,A,19910729.0,6.0,101124.0,,,Paolo Cane,R,180.0,ITA,26.302533,124.0,300.0,101416.0,,,Joao Cunha Silva,R,173.0,POR,23.668720,141.0,253.0,7-6(5) 6-3,3.0,R32,112.0,6.0,2.0,77.0,29.0,19.0,27.0,11.0,4.0,7.0,1.0,4.0,67.0,35.0,24.0,11.0,10.0,2.0,6.0
6,1991-475,San Marino,Clay,32.0,A,19910729.0,7.0,101370.0,,,Paolo Pambianco,R,190.0,ITA,24.049281,189.0,163.0,101792.0,,,Slava Dosedel,R,183.0,CZE,20.955510,140.0,259.0,7-6(2) 4-6 6-2,3.0,R32,160.0,3.0,10.0,106.0,52.0,39.0,22.0,15.0,6.0,10.0,3.0,10.0,102.0,60.0,38.0,18.0,15.0,4.0,9.0
7,1991-475,San Marino,Clay,32.0,A,19910729.0,8.0,101746.0,5.0,,Renzo Furlan,R,175.0,ITA,21.199179,51.0,594.0,101368.0,,,Christian Miniussi,R,185.0,ARG,24.065708,161.0,206.0,6-3 6-1,3.0,R32,76.0,1.0,1.0,42.0,27.0,21.0,6.0,8.0,0.0,2.0,1.0,6.0,59.0,25.0,13.0,12.0,8.0,4.0,10.0
8,1991-475,San Marino,Clay,32.0,A,19910729.0,9.0,101243.0,6.0,,Roberto Azar,L,185.0,ARG,25.355236,108.0,349.0,101031.0,,,Massimo Cierro,R,173.0,ITA,27.225188,197.0,158.0,6-3 6-1,3.0,R32,45.0,1.0,0.0,38.0,32.0,23.0,5.0,8.0,0.0,1.0,0.0,2.0,49.0,21.0,12.0,11.0,8.0,1.0,6.0
9,1991-475,San Marino,Clay,32.0,A,19910729.0,10.0,101490.0,,,Daniel Orsanic,L,183.0,ARG,23.129363,145.0,238.0,101420.0,,,Vaclav Roubicek,R,185.0,CZE,23.624914,159.0,215.0,6-2 6-2,3.0,R32,65.0,4.0,0.0,49.0,32.0,27.0,6.0,8.0,2.0,3.0,1.0,6.0,57.0,37.0,18.0,7.0,8.0,4.0,9.0


Теперь необходимо отсортировать стадии турнира в соответствии с реальной последовательностью для дальнейшей
сортировки всего DataFrame:

In [38]:
# List of tourney rounds sequence (BR - match for the third place)
rounds = ['RR', 'R128', 'R64', 'R32', 'R16', 'QF', 'SF', 'F', 'BR']

Отсортируем весь DataFrame в соответствии с выбранными критериями: