### 강의에서 사용된 파이썬 주요 기능

- 데이터 불러오기
  - pandas.read_csv: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
  - pandas.read_pickle: https://pandas.pydata.org/docs/reference/api/pandas.read_pickle.html

- 중복 제거를 활용한 데이터 구성 파악
  - pandas.Series.unique: https://pandas.pydata.org/docs/reference/api/pandas.Series.unique.html
  - pandas.DataFrame.drop_duplicates: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html
  - pandas.DataFrame.sort_values: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html
  - set: https://wikidocs.net/22262

### 데이터 불러오기

In [1]:
import pandas as pd

##### (1) 경기 정보 불러오기

In [2]:
# Name of the competition folder under data/refined_events/.
dataset_name = 'World_Cup'

# Load the match table. index_col=0 uses the first CSV column as the row
# index; 'utf-8-sig' also accepts a file that begins with a UTF-8 BOM.
match_df = pd.read_csv(
    f'data/refined_events/{dataset_name}/matches.csv',
    encoding='utf-8-sig',
    index_col=0,
)
match_df

Unnamed: 0_level_0,gameweek,datetime,venue,team1_id,team1_name,team1_goals,team2_id,team2_name,team2_goals,duration
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2057954,1,2018-06-14 15:00:00,Olimpiyskiy stadion Luzhniki,14358,Russia,5,16521,Saudi Arabia,0,Regular
2057955,1,2018-06-15 12:00:00,Stadion Central'nyj,16129,Egypt,0,15670,Uruguay,1,Regular
2057961,1,2018-06-15 15:00:00,Stadion Krestovskyi,16216,Morocco,0,10840,Iran,1,Regular
2057960,1,2018-06-15 18:00:00,Olimpiyskiy Stadion Fisht,9905,Portugal,3,1598,Spain,3,Regular
2057966,1,2018-06-16 10:00:00,Kazan' Arena,4418,France,2,8493,Australia,1,Regular
...,...,...,...,...,...,...,...,...,...,...
2058012,0,2018-07-07 18:00:00,Olimpiyskiy Stadion Fisht,14358,Russia,2,9598,Croatia,2,Penalties
2058014,0,2018-07-10 18:00:00,Stadion Krestovskyi,4418,France,1,5629,Belgium,0,Regular
2058015,0,2018-07-11 18:00:00,Olimpiyskiy stadion Luzhniki,9598,Croatia,2,2413,England,1,ExtraTime
2058016,0,2018-07-14 14:00:00,Stadion Krestovskyi,5629,Belgium,2,2413,England,0,Regular


##### (2) 경기 정보 필터링

In [3]:
# Keep only the matches in which 'Korea Republic' is listed as team1 or team2.
match_df[match_df[['team1_name', 'team2_name']].eq('Korea Republic').any(axis=1)]

Unnamed: 0_level_0,gameweek,datetime,venue,team1_id,team1_name,team1_goals,team2_id,team2_name,team2_goals,duration
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2057985,1,2018-06-18 12:00:00,Stadion Nizhny Novgorod,7047,Sweden,1,14855,Korea Republic,0,Regular
2057987,2,2018-06-23 15:00:00,Rostov Arena,14855,Korea Republic,1,15473,Mexico,2,Regular
2057988,3,2018-06-27 14:00:00,Kazan' Arena,14855,Korea Republic,2,3148,Germany,0,Regular


##### (3) 경기 이벤트 데이터 불러오기

- 3차전 한국 vs 독일 이벤트 데이터를 DataFrame 형태로 불러오기

In [4]:
# ID of the match to load (Korea Republic vs Germany in the match table above).
match_id = 2057988
# NOTE(review): unpickling can execute arbitrary code, so only load .pkl files
# that come from a trusted source.
match_events = pd.read_pickle(f'data/refined_events/{dataset_name}/{match_id}.pkl')
match_events

Unnamed: 0,match_id,event_id,period,time,team_id,team_name,player_id,player_name,event_type,sub_event_type,tags,start_x,start_y,end_x,end_y
0,2057988,260662125,1H,1.157,3148,Germany,3319,M. Özil,Pass,Simple pass,[Accurate],52.00,34.68,39.52,36.04
1,2057988,260662126,1H,2.669,3148,Germany,14723,T. Kroos,Pass,Simple pass,[Accurate],39.52,36.04,46.80,32.64
2,2057988,260662127,1H,4.965,3148,Germany,3319,M. Özil,Pass,Simple pass,[Accurate],46.80,32.64,32.24,25.84
3,2057988,260662128,1H,8.096,3148,Germany,134383,N. Süle,Pass,Simple pass,[Accurate],32.24,25.84,38.48,32.64
4,2057988,260662129,1H,9.505,3148,Germany,14723,T. Kroos,Pass,Simple pass,[Accurate],38.48,32.64,27.04,24.48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1659,2057988,260664023,2H,3213.652,3148,Germany,134383,N. Süle,Others on the ball,Touch,[],85.28,31.96,72.80,40.12
1660,2057988,260664025,2H,3216.522,3148,Germany,14723,T. Kroos,Pass,Simple pass,[Accurate],72.80,40.12,90.48,51.68
1661,2057988,260664027,2H,3218.856,3148,Germany,14732,T. Müller,Pass,Cross,"[Key pass, Right foot, High, Accurate]",90.48,51.68,96.72,36.72
1662,2057988,260664031,2H,3219.998,3148,Germany,14795,M. Hummels,Shot,Shot,"[Head/body, Opportunity, Position: Out high le...",96.72,36.72,104.00,34.00


### 이벤트 데이터 인덱싱·슬라이싱·필터링

##### (1) 열 인덱싱(column indexing)

In [5]:
# Select a single column by its label.
match_events['player_name']

0            M. Özil
1           T. Kroos
2            M. Özil
3            N. Süle
4           T. Kroos
            ...     
1659         N. Süle
1660        T. Kroos
1661       T. Müller
1662      M. Hummels
1663    Hyun-Woo Cho
Name: player_name, Length: 1664, dtype: object

##### (2) 행 인덱싱(row indexing)

In [6]:
# Select the row whose index label is 0.
match_events.loc[0]

match_id              2057988
event_id            260662125
period                     1H
time                    1.157
team_id                  3148
team_name             Germany
player_id                3319
player_name           M. Özil
event_type               Pass
sub_event_type    Simple pass
tags               [Accurate]
start_x                  52.0
start_y                 34.68
end_x                   39.52
end_y                   36.04
Name: 0, dtype: object

##### (3) 슬라이싱(slicing)

In [7]:
# Take the first ten rows with a slice (the end position 10 is excluded).
match_events[0:10]

Unnamed: 0,match_id,event_id,period,time,team_id,team_name,player_id,player_name,event_type,sub_event_type,tags,start_x,start_y,end_x,end_y
0,2057988,260662125,1H,1.157,3148,Germany,3319,M. Özil,Pass,Simple pass,[Accurate],52.0,34.68,39.52,36.04
1,2057988,260662126,1H,2.669,3148,Germany,14723,T. Kroos,Pass,Simple pass,[Accurate],39.52,36.04,46.8,32.64
2,2057988,260662127,1H,4.965,3148,Germany,3319,M. Özil,Pass,Simple pass,[Accurate],46.8,32.64,32.24,25.84
3,2057988,260662128,1H,8.096,3148,Germany,134383,N. Süle,Pass,Simple pass,[Accurate],32.24,25.84,38.48,32.64
4,2057988,260662129,1H,9.505,3148,Germany,14723,T. Kroos,Pass,Simple pass,[Accurate],38.48,32.64,27.04,24.48
5,2057988,260662130,1H,11.939,3148,Germany,134383,N. Süle,Pass,Simple pass,[Accurate],27.04,24.48,9.36,33.32
6,2057988,260662132,1H,16.057,3148,Germany,14712,M. Neuer,Pass,Simple pass,[Accurate],9.36,33.32,28.08,35.36
7,2057988,260662133,1H,17.709,3148,Germany,14723,T. Kroos,Pass,Simple pass,[Accurate],28.08,35.36,31.2,27.88
8,2057988,260662134,1H,19.595,3148,Germany,3318,S. Khedira,Pass,Simple pass,[Accurate],31.2,27.88,33.28,51.0
9,2057988,260662135,1H,24.925,3148,Germany,14795,M. Hummels,Pass,Simple pass,[Accurate],33.28,51.0,30.16,43.52


##### (4) 필터링(filtering) 또는 불린 인덱싱(Boolean indexing)

In [8]:
# Boolean mask keeps the rows whose event_type is 'Shot'; head() then shows
# the first five of them.
match_events.loc[match_events['event_type'].eq('Shot')].head()

Unnamed: 0,match_id,event_id,period,time,team_id,team_name,player_id,player_name,event_type,sub_event_type,tags,start_x,start_y,end_x,end_y
250,2057988,260662420,1H,825.666,3148,Germany,3318,S. Khedira,Shot,Shot,"[Head/body, Opportunity, Position: Goal high r...",93.6,32.64,104.0,34.0
296,2057988,260839866,1H,1144.624,14855,Korea Republic,107529,Yong Lee,Shot,Shot,"[Right foot, Position: Out high right, Not acc...",73.84,24.48,104.0,34.0
402,2057988,260662698,1H,1442.867,14855,Korea Republic,14911,Son Heung-Min,Shot,Shot,"[Right foot, Opportunity, Position: Out high r...",94.64,25.16,104.0,34.0
522,2057988,260662734,1H,1927.182,3148,Germany,15024,M. Reus,Shot,Shot,"[Left foot, Blocked, Not accurate]",82.16,33.32,89.44,34.68
642,2057988,260662853,1H,2288.665,3148,Germany,173214,T. Werner,Shot,Shot,"[Right foot, Blocked, Opportunity, Not accurate]",94.64,22.44,92.56,21.76


In [9]:
# Boolean mask keeps the rows recorded for 'Son Heung-Min'; head() then shows
# the first five of them.
match_events.loc[match_events['player_name'].eq('Son Heung-Min')].head()

Unnamed: 0,match_id,event_id,period,time,team_id,team_name,player_id,player_name,event_type,sub_event_type,tags,start_x,start_y,end_x,end_y
33,2057988,260662167,1H,96.836,14855,Korea Republic,14911,Son Heung-Min,Duel,Ground attacking duel,"[Anticipation, Lost, Not accurate]",93.6,27.2,93.6,34.0
67,2057988,260662209,1H,182.528,14855,Korea Republic,14911,Son Heung-Min,Duel,Ground loose ball duel,"[Lost, Not accurate]",45.76,20.4,57.2,22.44
116,2057988,260662264,1H,363.448,14855,Korea Republic,14911,Son Heung-Min,Duel,Ground attacking duel,"[Free space left, Neutral, Accurate]",73.84,22.44,74.88,27.2
118,2057988,260662267,1H,365.364,14855,Korea Republic,14911,Son Heung-Min,Duel,Ground loose ball duel,"[Won, Accurate]",74.88,27.2,78.0,31.96
222,2057988,260662406,1H,733.162,14855,Korea Republic,14911,Son Heung-Min,Duel,Ground loose ball duel,"[Lost, Not accurate]",72.8,23.12,71.76,19.72


### 중복 제거를 활용한 데이터 구성 파악

##### (1) Series 원소 중복 제거

- 전후반 구분

In [10]:
# Distinct values of the 'period' column for this match.
match_events['period'].unique()

array(['1H', '2H'], dtype=object)

- 팀 구분

In [11]:
# Distinct team names that appear in the event rows.
match_events['team_name'].unique()

array(['Germany', 'Korea Republic'], dtype=object)

- 경기 출전 선수

In [12]:
# Distinct player names that appear in the event rows.
match_events['player_name'].unique()

array(['M. Özil', 'T. Kroos', 'N. Süle', 'M. Neuer', 'S. Khedira',
       'M. Hummels', 'Seon-Min Moon', 'Ja-Cheol Koo', 'Young-Gwon Kim',
       'J. Hector', 'Jae-Sung Lee', 'Yong Lee', 'Son Heung-Min',
       'Woo-Young Jung', 'Young-Sun Yun', 'Chul Hong', 'J. Kimmich',
       'M. Reus', 'T. Werner', 'Hyun-Woo Cho', 'Hyun-Soo Jang',
       'L. Goretzka', 'Hwang Hee-Chan', 'M. Gómez', 'T. Müller',
       'Se-Jong Ju', 'J. Brandt', 'Yo-Han Go'], dtype=object)

##### (2) DataFrame 행 중복 제거

- 팀과 선수의 ID 및 이름

In [13]:
# One row per distinct (team, player) combination, ordered by team ID.
(
    match_events[['team_id', 'team_name', 'player_id', 'player_name']]
    .drop_duplicates()
    .sort_values('team_id')
)

Unnamed: 0,team_id,team_name,player_id,player_name
0,3148,Germany,3319,M. Özil
99,3148,Germany,173214,T. Werner
1361,3148,Germany,146583,J. Brandt
163,3148,Germany,134387,L. Goretzka
23,3148,Germany,18669,J. Hector
48,3148,Germany,224593,J. Kimmich
1086,3148,Germany,14732,T. Müller
1001,3148,Germany,14731,M. Gómez
9,3148,Germany,14795,M. Hummels
8,3148,Germany,3318,S. Khedira


- 이벤트 유형의 종류

In [14]:
# One row per distinct (event_type, sub_event_type) pair, sorted by event type
# and then by sub-event type.
(
    match_events[['event_type', 'sub_event_type']]
    .drop_duplicates()
    .sort_values(['event_type', 'sub_event_type'])
)

Unnamed: 0,event_type,sub_event_type
94,Duel,Air duel
16,Duel,Ground attacking duel
15,Duel,Ground defending duel
17,Duel,Ground loose ball duel
21,Foul,Foul
525,Foul,Hand foul
1135,Foul,Simulation
155,Foul,Violent Foul
244,Free kick,Corner
22,Free kick,Free kick


##### (3) List 원소 중복 제거

- 추가 태그의 종류

In [15]:
# Flatten the per-event tag lists into one list, keeping duplicates and order.
# A comprehension is used instead of Series.sum(): adding lists together builds
# a new list at every step, so the cost grows quadratically with the number of
# events, while this form visits each tag once.
[tag for event_tags in match_events['tags'] for tag in event_tags]

['Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Free space right',
 'Won',
 'Accurate',
 'Free space left',
 'Neutral',
 'Accurate',
 'Lost',
 'Not accurate',
 'Neutral',
 'Accurate',
 'Lost',
 'Not accurate',
 'Not accurate',
 'Accurate',
 'Accurate',
 'Neutral',
 'Accurate',
 'Neutral',
 'Accurate',
 'Not accurate',
 'Accurate',
 'Free space left',
 'Lost',
 'Not accurate',
 'Free space right',
 'Won',
 'Accurate',
 'Won',
 'Accurate',
 'Accurate',
 'Anticipation',
 'Lost',
 'Not accurate',
 'Anticipated',
 'Won',
 'Accurate',
 'Not accurate',
 'Accurate',
 'Neutral',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Not accurate',
 'Neutral',
 'Accurate',
 'Neutral',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Accurate',
 'Free space left',
 'Won',
 'Accurate',
 'Free space right',
 ...]

In [16]:
# Distinct tag values across all events. A set comprehension collects them in
# one pass instead of first concatenating every tag list with sum(), which
# copies the growing list at each step.
{tag for event_tags in match_events['tags'] for tag in event_tags}

{134387,
 146583,
 14731,
 14732,
 15781,
 18669,
 209567,
 3318,
 349687,
 51625,
 'Accurate',
 'Anticipated',
 'Anticipation',
 'Assist',
 'Blocked',
 'Counter attack',
 'Dangerous ball lost',
 'Direct',
 'Fairplay',
 'Free space left',
 'Free space right',
 'Goal',
 'Head/body',
 'High',
 'Interception',
 'Key pass',
 'Left foot',
 'Lost',
 'Missed ball',
 'Neutral',
 'Not accurate',
 'Opportunity',
 'Position: Goal center',
 'Position: Goal center left',
 'Position: Goal center right',
 'Position: Goal high center',
 'Position: Goal high right',
 'Position: Goal low center',
 'Position: Goal low left',
 'Position: Out center left',
 'Position: Out high left',
 'Position: Out high right',
 'Position: Out low left',
 'Right foot',
 'Sliding tackle',
 'Take on left',
 'Take on right',
 'Through',
 'Won',
 'Yellow card'}

- 교체(Substitution) 이벤트의 tags에는 태그 문자열 대신 교체 상대 선수의 ID(정수)가 들어 있어, 위 집합에 숫자가 섞여 나온다.

In [17]:
# Boolean mask keeps the rows whose event_type is 'Substitution'.
match_events.loc[match_events['event_type'].eq('Substitution')]

Unnamed: 0,match_id,event_id,period,time,team_id,team_name,player_id,player_name,event_type,sub_event_type,tags,start_x,start_y,end_x,end_y
977,2057988,0,2H,600.0,14855,Korea Republic,349687,Hwang Hee-Chan,Substitution,Player in,[15781],,,,
978,2057988,0,2H,600.0,14855,Korea Republic,15781,Ja-Cheol Koo,Substitution,Player out,[349687],,,,
1001,2057988,0,2H,720.0,3148,Germany,14731,M. Gómez,Substitution,Player in,[3318],,,,
1002,2057988,0,2H,720.0,3148,Germany,3318,S. Khedira,Substitution,Player out,[14731],,,,
1086,2057988,0,2H,1020.0,3148,Germany,14732,T. Müller,Substitution,Player in,[134387],,,,
1087,2057988,0,2H,1020.0,3148,Germany,134387,L. Goretzka,Substitution,Player out,[14732],,,,
1207,2057988,0,2H,1380.0,14855,Korea Republic,209567,Se-Jong Ju,Substitution,Player in,[51625],,,,
1208,2057988,0,2H,1380.0,14855,Korea Republic,51625,Seon-Min Moon,Substitution,Player out,[209567],,,,
1361,2057988,0,2H,1920.0,3148,Germany,146583,J. Brandt,Substitution,Player in,[18669],,,,
1362,2057988,0,2H,1920.0,3148,Germany,18669,J. Hector,Substitution,Player out,[146583],,,,
