In [1]:
# move to project root
# `%cd ..` changes the kernel's working directory to the parent of the current
# one, and `%pwd` echoes the resulting directory so it can be checked.
# NOTE: the move is relative, so running this cell again climbs one more level;
# run it only once per kernel session.
%cd ..
%pwd

/Users/heste/workspace/soccernet/sn-script


'/Users/heste/workspace/soccernet/sn-script'

In [13]:
import os

import pandas as pd
import numpy as np
from dataclasses import dataclass
from pathlib import Path

from sn_script.config import binary_category_name
from sn_script.csv_utils import gametime_to_seconds, seconds_to_gametime

In [3]:
@dataclass
class Config:
    """Settings shared by the cells of this notebook.

    Attributes:
        csv_path: Path of the dataset CSV that the notebook reads. It is a
            relative path, so it is resolved against the kernel's current
            working directory.
    """

    # The type annotation is what makes this a real dataclass field; without it
    # @dataclass ignores the attribute and Config(csv_path=...) is rejected.
    csv_path: Path = Path('database/stable/scbi-v2.csv')

In [4]:
# Read the dataset CSV configured on Config into the frame used by every later cell.
scbi_df = pd.read_csv(filepath_or_buffer=Config.csv_path)

In [5]:
# Display the column labels of the loaded frame.
scbi_df.columns

Index(['id', 'game', 'half', 'start', 'end', 'text', '小分類', '備考', '付加的情報か'], dtype='object')

In [15]:
# Sanity check of the data: preview the first rows of a single game
target_game = "england_epl/2015-2016/2015-08-29 - 17-00 Chelsea 1 - 2 Crystal Palace"

# Row mask selecting the chosen game, and the columns worth eyeballing
is_target_game = scbi_df['game'] == target_game
preview_columns = ["start", "end", "text", binary_category_name]

scbi_df.loc[is_target_game, preview_columns].head(20)

Unnamed: 0,start,end,text,付加的情報か
7810,0.89,7.84,Chelsea won this fixture 1-0 at Stamford Bridg...,1
7811,9.82,13.7,Jose Mourinho's team had arguably done their b...,1
7812,14.9,19.73,A 14-game unbeaten run which took them six poi...,1
7813,24.6,30.18,he's punching Zaha now to provide the width,0
7814,31.82,32.9,Put behind by Ivanovic.,0
7815,33.0,35.76,It is a first-minute corner for Crystal Palace.,1
7816,36.46,39.7,He had a chat with Alan Poggi down in the medi...,0
7817,39.74,41.42,He said they were going to take the game to Ch...,0
7818,41.46,42.24,"Hope they do, Tony.",0
7819,42.32,44.7,It will prove to be a good one.,0


In [32]:
# Video statistics
# Number of distinct games
n_games = scbi_df['game'].nunique()
print(f"Number of games: {n_games}")

# Total duration: for every video, i.e. every (game, half) pair, measure the
# span from the earliest utterance start to the latest utterance end, then sum.
group_per_video = scbi_df.groupby(['game', 'half'])
video_spans = group_per_video.agg(
    first_start=('start', 'min'),
    last_end=('end', 'max'),
)
durations = video_spans['last_end'] - video_spans['first_start']
# Label each video's duration by its game only (the half level is dropped)
durations.index = group_per_video['game'].first()
total_duration = durations.sum()
print(f"Total duration: {total_duration}")


Number of games: 339
Total duration: 1863010.56


In [24]:
# Commentary statistics
# Total # of utterances
total_num_utterances = scbi_df.shape[0]
print(f"Total # of utterances: {total_num_utterances}")

# avg. # of utterances per video, where one video is one (game, half) pair.
# The variable keeps its historical name so other cells that read it still
# work, but the value is per half, so the printed label says "per video".
avg_num_utterances_per_game = scbi_df.groupby(['game', 'half']).size().mean()
print(f"avg. # of utterances per video (game, half): {avg_num_utterances_per_game:.2f}")

# avg. # of characters per utterance (rows without text are skipped)
avg_chars_per_utterance = scbi_df['text'].dropna().apply(len).mean()
print(f"avg. # of characters per utterance: {avg_chars_per_utterance:.2f}")

# avg. length of an utterance (sec)
# Held in its own variable so avg_chars_per_utterance keeps the character average.
avg_utterance_duration = (scbi_df['end'] - scbi_df['start']).mean()
print(f"avg. length of an utterance: {avg_utterance_duration:.2f}")

# avg. length of silence (sec)
# The previous utterance is looked up inside the same (game, half) video, so
# the first utterance of a video is never compared with the end of another
# video; it gets NaN instead.
# NOTE(review): assumes rows are in chronological order within each video — confirm.
scbi_df['prev_end'] = scbi_df.groupby(['game', 'half'])['end'].shift(1)
scbi_df['silence'] = (scbi_df['start'] - scbi_df['prev_end'])
# A negative gap means the utterance started before the previous one ended;
# that is an overlap, not silence, so it is excluded from the average.
scbi_df.loc[scbi_df['silence'] < 0, 'silence'] = None
silence_mean = scbi_df['silence'].mean(skipna=True)
print(f"avg. length of silence: {silence_mean:.2f}")


Total # of utterances: 338026
avg. # of utterances per game: 498.56
avg. # of characters per utterance: 57.19
avg. length of an utterance: 3.38
avg. length of silence: 2.14


In [14]:
# Analysis
# Share of each label over the whole dataset
label_values = scbi_df[binary_category_name]
label_proportions = label_values.value_counts(normalize=True)
print(label_proportions)
# Number of utterances per label
label_counts = label_values.value_counts()
print(label_counts)

# Result: about 18% of the utterances were labelled as additional information,
# i.e. they make up a share that cannot be ignored.

付加的情報か
 0    0.814582
 1    0.185394
 2    0.000018
-1    0.000006
Name: proportion, dtype: float64
付加的情報か
 0    275350
 1     62668
 2         6
-1         2
Name: count, dtype: int64


In [None]:
# Analysis of when the label increases (1)
# Label proportion per time slot (binned in 5-minute intervals)
# TODO: not implemented yet


In [None]:
# Analysis of when the label increases (2)
# Label proportion around events
# TODO: not implemented yet


In [None]:
# Analysis of when the label increases (3)
# Relationship between silence duration and label proportion
# TODO: not implemented yet
