In [10]:
import glob
from natsort import natsorted
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from typing import Final
from __future__ import annotations

In [11]:
YEARLY_RESULTS_HEADER: Final[tuple[str]] = (
    '選手名','年齢','性別','身長','体重','級班','脚質','登録番号',
    '年度','1着','2着','3着','4着〜','出走回数','優勝回数','勝率','2連対率','3連対率'
)
YEARLY_RESULTS_HEADER_RENAME: Final[tuple[str]] = (
    '選手名','性別','級班','脚質','登録番号',
    '年度','1着','2着','3着','4着〜','出走回数','優勝回数',
    '年齢','身長','体重','勝率','2連対率','3連対率'
)
YEARLY_RESULTS_CONV_HEADER: Final[dict(str, int)] = {
    '年齢': 1,
    '身長': 3,
    '体重': 4,
    '勝率': 15,
    '2連対率': 16,
    '3連対率': 17,
}
PLAYER_BASIC_DATA_COLS: Final[list[str]] = [
    '選手名','性別','級班','脚質','登録番号','年齢','身長','体重'
]
DROP_COLS: Final[list[str]] = [
    '性別','級班','脚質','登録番号','年齢','身長','体重'
]
CUMSUM_MEAN_SELECT_COLS: Final[list(tuple[str])] = [
    ('1着','sum'), ('2着', 'sum'), ('3着', 'sum'), ('4着〜', 'sum'),
    ('出走回数', 'sum'), ('優勝回数', 'sum'), ('勝率', 'mean'),
    ('2連対率', 'mean'), ('3連対率', 'mean')
]
CUMSUM_MEAN_AFTER_COLS: Final[tuple[str]] = (
    '1着', '2着', '3着', '4着～', '出走回数', '優勝回数', '勝率', '2連対率', '3連対率'
)
LABELENCODER_COLUMNS: Final[list[str]] = ['性別', '級班', '脚質']

In [12]:
def get_df() -> pd.DataFrame:
    files = natsorted(glob.glob('player_data/yearly_results*.csv'))

    yearly_results_list = []
    for file in files:
        try:
            yearly_results_list.append(pd.read_csv(file, encoding='utf-8', header=0, names=YEARLY_RESULTS_HEADER))
        except UnicodeDecodeError:
            yearly_results_list.append(pd.read_csv(file, encoding='cp932', header=0, names=YEARLY_RESULTS_HEADER))
    df_yearly_results = pd.concat(yearly_results_list)
    
    return df_yearly_results

In [13]:
def char_exclusion(df: pd.DataFrame) -> pd.DataFrame:
    for column_name, column_num in YEARLY_RESULTS_CONV_HEADER.items():
        if column_num == 1:
            df[column_name + '(数字)'] = df.iloc[:, column_num].str.extract('(\d+)')
        elif column_num == 3:
            df[column_name + '(数字)'] = df.iloc[:, column_num].str.rstrip('cm')
        elif column_num == 4:
            df[column_name + '(数字)'] = df.iloc[:, column_num].str.rstrip('kg')
        else:
            df[column_name + '(数字)'] = df.iloc[:, column_num].str.rstrip('%')
    return df

In [14]:
def df_drop(df: pd.DataFrame, column_names: list[str]) -> pd.DataFrame:
    for column_name in column_names:
        df.drop(column_name, axis=1, inplace=True)
    return df

In [15]:
def labelencoder(df: pd.DataFrame) -> pd.DataFrame:
    for column in LABELENCODER_COLUMNS:
        le = LabelEncoder()
        le.fit(df[column])
        df[column] = le.transform(df[column])

    return df

In [16]:
def cum_sum_mean(df: pd.DataFrame) -> pd.DataFrame:
    df_list = []
    df_key = df['選手名'].drop_duplicates()
    
    for player_name in df_key:
        df_player = df.query('選手名==@player_name')
        for i in range(len(df_player)):
            df_temp = df_player.iloc[i:,:].groupby('選手名').agg(['sum', 'mean'])
            df_temp = df_temp[CUMSUM_MEAN_SELECT_COLS]
            df_temp.columns = CUMSUM_MEAN_AFTER_COLS
            df_temp = df_temp.reset_index()
            df_temp.insert(1, '年度', df_player.iloc[i, 1])
            df_list.append(df_temp)
    
    return pd.concat(df_list)

In [17]:
df_yearly_results: pd.DataFrame = get_df()
df_yearly_results = char_exclusion(df_yearly_results)
df_yearly_results = df_drop(df_yearly_results, list(YEARLY_RESULTS_CONV_HEADER.keys()))
# df_yearly_results = labelencoder(df_yearly_results)
df_yearly_results.columns = YEARLY_RESULTS_HEADER_RENAME
df_yearly_results.set_index('選手名')
df_player_basic_data = df_yearly_results[PLAYER_BASIC_DATA_COLS]
df_player_basic_data.drop_duplicates(inplace=True)
df_yearly_results.drop(DROP_COLS, axis=1, inplace=True)
df_yearly_results['勝率'] = df_yearly_results['勝率'].astype('float')
df_yearly_results['2連対率'] = df_yearly_results['2連対率'].astype('float')
df_yearly_results['3連対率'] = df_yearly_results['3連対率'].astype('float')
df_yearly_results = cum_sum_mean(df_yearly_results)
pd.to_pickle(df_yearly_results, 'yearly_results.pkl')
df_yearly_results = pd.read_pickle('yearly_results.pkl')
pd.to_pickle(df_player_basic_data, 'player_basic_data.pkl')
df_player_basic_data = pd.read_pickle('player_basic_data.pkl')
display(df_yearly_results)
display(df_player_basic_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_basic_data.drop_duplicates(inplace=True)


Unnamed: 0,選手名,年度,1着,2着,3着,4着～,出走回数,優勝回数,勝率,2連対率,3連対率
0,相川永伍,2021,149,91,67,385,692,9,20.500000,33.211111,43.133333
0,相川永伍,2020,139,86,59,359,643,9,20.512500,33.537500,42.662500
0,相川永伍,2019,131,81,53,320,585,9,21.471429,35.128571,44.071429
0,相川永伍,2018,108,65,45,273,491,6,20.966667,34.066667,43.083333
0,相川永伍,2017,101,63,41,206,411,6,23.400000,38.620000,48.440000
...,...,...,...,...,...,...,...,...,...,...,...
0,鰐渕正利,2017,17,38,55,262,372,0,4.440000,14.280000,28.900000
0,鰐渕正利,2016,16,36,47,212,311,0,5.150000,16.625000,31.625000
0,鰐渕正利,2015,12,30,37,154,233,0,5.166667,17.900000,33.633333
0,鰐渕正利,2014,8,20,26,100,154,0,5.200000,18.000000,34.650000


Unnamed: 0,選手名,性別,級班,脚質,登録番号,年齢,身長,体重
0,相川永伍,男,S級2班,逃,14448,37,183.0,93.0
9,相川巧,男,A級3班,逃,14697,31,166.0,73.0
18,相笠翔太,男,S級2班,両,14502,31,169.0,70.0
27,愛敬博之,男,S級2班,追,14390,37,175.0,77.0
36,相澤恵一,男,A級3班,追,14432,34,172.0,72.0
...,...,...,...,...,...,...,...,...
0,和田誠寿,男,A級1班,両,14953,30,170.7,67.3
9,和田真久留,男,S級1班,逃,14744,30,172.0,69.7
18,和田禎嗣,男,A級1班,両,14732,36,172.0,81.0
27,鰐淵圭佑,男,A級2班,追,14065,37,174.0,80.0
