In [1]:
# move to project root
%cd ..
%pwd

/Users/heste/workspace/soccernet/sn-script


'/Users/heste/workspace/soccernet/sn-script'

In [2]:
import os
from dataclasses import dataclass
from pathlib import Path

import matplotlib
import numpy as np
import pandas as pd
from matplotlib import pylab as plt_lab
from matplotlib import pyplot as plt
from sn_script.config import binary_category_name
from sn_script.csv_utils import gametime_to_seconds, seconds_to_gametime
from tqdm import tqdm

# matplotlibのデフォルトフォントをTakaoGothicに設定
font = {"family": "IPAexGothic"}
matplotlib.rc("font", **font)
plt_lab.rcParams.update({"font.size": 22})

In [3]:
@dataclass
class Config:
    csv_path = Path('database/stable/scbi-v2.csv')
    sncaption_path = Path('database/misc/soccernet_labels.csv')

In [4]:
scbi_df = pd.read_csv(Config.csv_path)
sncaption_df = pd.read_csv(Config.sncaption_path)

In [None]:
scbi_df.columns

In [18]:
# データの確認
target_game = "england_epl/2015-2016/2015-08-29 - 17-00 Chelsea 1 - 2 Crystal Palace"
target_game = "england_epl/2014-2015/2015-02-21 - 18-00 Chelsea 1 - 1 Burnley"

scbi_df.loc[
    scbi_df['game'] == target_game,
    [ "start", "end", "text", binary_category_name]
].head(20)

Unnamed: 0,start,end,text,付加的情報か
0,0.01,2.55,"The duel has already begun, Barley is in control.",0
1,2.57,9.72,They were also measured in the first match of ...,1
2,9.78,10.86,Barley advanced.,1
3,11.3,14.47,It must be remembered that he advanced in that...,1
4,14.91,17.85,without too many difficulties.,0
5,17.87,19.22,"Exactly, that's what I was going to say.",0
6,19.24,20.2,I remember that game very well.,0
7,20.22,22.46,"Barley started well, scored a goal, but Chelse...",0
8,22.48,26.26,"That's why today shouldn't have any problems, ...",0
9,33.6,37.62,"He started with the ball to Cuadrado, left it ...",0


In [None]:
# 映像の統計情報
# 試合数
n_games = scbi_df['game'].nunique()
print(f"Number of games: {n_games}")
# 総合時間
group_per_video = scbi_df.groupby(['game', 'half'])
durations = group_per_video['end'].max() - group_per_video['start'].min()
durations.index = group_per_video['game'].first()
total_duration = durations.sum()
print(f"Total duration: {total_duration}")


In [None]:
# コメントの統計情報
# Total # of utterances
total_num_utterances = scbi_df.shape[0]
print(f"Total # of utterances: {total_num_utterances}")
# avg. # of utterances video (game, half)
avg_num_utterances_per_game = scbi_df.groupby(['game', 'half']).size().mean()
print(f"avg. # of utterances per game: {avg_num_utterances_per_game:.2f}")
# avg. # of characters per utterance
avg_chars_per_utterance = scbi_df['text'].dropna().apply(len).mean()
print(f"avg. # of characters per utterance: {avg_chars_per_utterance:.2f}")
# avg. length of an utterance (sec)
avg_chars_per_utterance = (scbi_df['end'] - scbi_df['start']).mean()
print(f"avg. length of an utterance: {avg_chars_per_utterance:.2f}")
# avg. length of silence (sec)
scbi_df['prev_end'] = scbi_df['end'].shift(1)
scbi_df['silence'] = (scbi_df['start'] - scbi_df['prev_end'])
scbi_df.loc[scbi_df['silence'] < 0, 'silence'] = None
silence_mean = scbi_df['silence'].mean(skipna=True)
print(f"avg. length of silence: {silence_mean:.2f}")


In [None]:
# 分析
# 全体におけるラベルの割合
print(scbi_df[binary_category_name].value_counts(normalize=True))
# ラベルの数
print(scbi_df[binary_category_name].value_counts())

# 結果、付加的情報が18%あった
# つまり、無視できない割合存在する

In [None]:
# 増加タイミングの分析1
# 時間帯(5分でbinning)ごとのラベルの割合
def plot_label_ratio_by_time(df: pd.DataFrame):
    fig, ax = plt.subplots(1, 2, figsize=(20, 10))

    df["start_bin"] = (
        df["start"] // 300
    )

    df_half_1 = df.loc[df["half"] == 1]
    piv_1_df = df_half_1.pivot_table(
        index="start_bin", columns=binary_category_name, aggfunc="count", values="text"
    )
    piv_1_df.index = [seconds_to_gametime(i * 300) + "~" for i in piv_1_df.index]
    (piv_1_df[1.0] / piv_1_df.sum(axis=1)).plot(
        xlabel="試合時間",
        ylabel="割合",
        title="前半",
        ylim=(0, 0.30),
        ax=ax[0]
    )
    # 後半
    half_offset = 2700
    df_half_2 = df.loc[df["half"] == 2]

    piv_2_df = df_half_2.pivot_table(
        index="start_bin", columns=binary_category_name, aggfunc="count", values="text"
    )
    piv_2_df.index = [seconds_to_gametime(i * 300 + half_offset) + "~" for i in piv_2_df.index]
    (piv_2_df[1.0] / piv_2_df.sum(axis=1)).plot(
        xlabel="試合時間",
        ylabel="割合",
        title="後半",
        ylim=(0, 0.30),
        ax=ax[1]
    )
    # 保存
    fig.savefig("database/images/scbi_v2-label_ratio_by_time.png")

plot_label_ratio_by_time(scbi_df)

In [None]:
# 増加タイミングの分析2
# イベント周辺でのラベルの割合
def label_ratio_around_event(df: pd.DataFrame, action_df: pd.DataFrame = sncaption_df, window_size=5):
    """
    event_df (sn-captionの速報テキストデータ) の importantなコメントの前後5秒に含まれる， df の実況コメントのラベルの割合を調べる手順
    1. dfの書くコメントの is_important を False で初期化
    2. soccernet_df を game でグループ化する
    2. game ごとに，
        - soccernet_dfの各行について df の start が action_df の各アクションの start, end に含まれるかを計算する
        - 含まれるなら， df のカラム refer_$action_label を追加し，Trueにする
    3. df[refer_$action_label] = True の付加的情報かの割合を計算する
    """

    if action_df["time"].dtype == "O":
        action_df["time"] = action_df["time"].apply(gametime_to_seconds)
    action_df["start"] = action_df["time"] - 5
    action_df["end"] = action_df["time"] + 5

    # game, half, time でソート
    action_df = action_df.sort_values(by=["game", "half", "start"])

    # イベントラベルのリスト
    action_grouped = action_df.groupby("label")
    action_labels = action_grouped.size().index
    print("Unique labels:")
    print(*action_labels, sep="\n") # nanを除く

    # 各アクションのフラグ列を一時的に保持するための辞書を作成
    new_columns = {}

    # イベント周辺のコメントにフラグを立てる
    for action_label, action_df_subset in tqdm(action_grouped):
        if f"refer_{action_label}" in df.columns:
            print(f"refer_{action_label} is already in df.columns")
            continue
        new_columns[f"refer_{action_label}"] = np.zeros(len(df), dtype=bool)
        tqdm.write(f"Processing {action_label}")
        for _, action_row in tqdm(action_df_subset.iterrows()):
            # dfを直接更新するために、適切なインデックスを取得
            indices = df[
                (df["game"] == action_row["game"])
                & (df["half"] == action_row["half"])
                & (df["start"] >= action_row["start"])
                & (df["start"] <= action_row["end"])
            ].index
            new_columns[f"refer_{action_label}"][indices] = True

    # 新しい列を追加
    df = pd.concat([df, pd.DataFrame(new_columns)], axis=1)

    # イベント周辺のコメントの割合を計算
    for action_label in action_labels:
        if f"refer_{action_label}" not in df.columns:
            print(f"refer_{action_label} is not in df.columns")
            continue
        refer_count = df.loc[:, f"refer_{action_label}"].sum() # noqa
        additional_info_refer_count = df.loc[
            (df[f"refer_{action_label}"] == True) # noqa
            & (df[binary_category_name] == 1),
            f"refer_{action_label}",
        ].sum()


        print(f"推定の{action_label}言及コメント数: {refer_count}")
        print(f"推定の{action_label}言及付加的情報コメント数: {additional_info_refer_count}")
        if refer_count == 0:
            continue
        else:
            print(
                f"推定の{action_label}言及コメントの付加的情報の割合 {additional_info_refer_count / refer_count * 100}%"
            )

label_ratio_around_event(scbi_df, sncaption_df)

In [None]:
# 増加タイミングの分析3
# 沈黙時間とラベルの割合の関係
def label_ratio_by_silence(df: pd.DataFrame):
    # intervalカラムを追加
    df["prev_end"] = df["end"].shift(1)
    df["interval"] = df["start"] - df["prev_end"]
    df.loc[df["interval"] < 0, "interval"] = None

    # 文字の長さを確認
    df['word_count'] = df['text'].str.split().str.len()


    # インターバルが30秒以上のものを表示
    for interval_length in [0.01, 0.1, 0.5, 1, 2, 4, 10, 20, 30, 40, 50, 60]:
        df[f"interval_over_{interval_length}"] = (
            df["interval"] > interval_length
        )
        # 全コメント　interval_length秒以上の沈黙の後の発話テキストのラベル
        rate1 = df[f"interval_over_{interval_length}"].sum() / len(df)

        # 付加的情報のみ　interval_length秒以上の沈黙の後の発話テキストのラベル
        rate2 = df[df[binary_category_name] == 1][
            f"interval_over_{interval_length}"
        ].sum() / len(df[df[binary_category_name] == 1])

        print(
            "全コメント{}秒インターバル後の発話数:{}\n付加的情報のみの{}秒インターバル後の発話数:{}".format(
                interval_length,
                df[f"interval_over_{interval_length}"].sum(),
                interval_length,
                df[df[binary_category_name] == 1][
                    f"interval_over_{interval_length}"
                ].sum(),
            )
        )

        print(f"rate1: {rate1:%}\nrate2: {rate2:%}")

        # word_countの平均
        avg_word_count = df.loc[
            df[f"interval_over_{interval_length}"] == True, # noqa
            'word_count'
        ].mean()
        print(f"word_countの平均: {avg_word_count}")


label_ratio_by_silence(scbi_df)

# scbi-v1と真逆だった最悪

In [None]:
#データの品質チェック
# オーバーラップの確認
def overlap_check(df: pd.DataFrame):
    df['overlap'] = df["prev_end"] - df["start"]
    rate = len(df[df['overlap'] > 0]) / len(df)
    print(f"Overlapping rate: {rate:.2f}")

overlap_check(scbi_df)

In [None]:
# ラベルの特徴
# word_countとラベルの関係 箱ひげ図で可視化
def label_by_column(column: str, df: pd.DataFrame):
    fig, ax = plt.subplots(figsize=(20, 10))
    filtered_df = df.loc[df[binary_category_name].isin([0, 1])]

    filtered_df.boxplot(column=column, by=binary_category_name, ax=ax)
    ax.set_ylim(0, 60)
    ax.set_xlabel("ラベル")
    ax.set_ylabel(column)
    ax.set_title(f"{column}とラベルの関係")
    fig.savefig(f"database/images/scbi_v2-label_by_{column}.png")

label_by_column("word_count", scbi_df)

In [None]:
scbi_df['duration'] = scbi_df['end'] - scbi_df['start']
label_by_column("duration", scbi_df)

In [None]:
# word_countと binary_category_nameの割合 の相関
def plot_label_ratio_by_column(column: str, df: pd.DataFrame):
    fig, ax = plt.subplots(figsize=(10, 10))
    filtered_df = df.loc[
        (df[binary_category_name].isin([0, 1])) &
        (df[column] > 0)
    ]

    # 5で割って丸めてグルーピング
    filtered_df[column] = filtered_df[column].apply(lambda x: int(x) // 5 * 5)

    # xlimを決めるために、word_countの3シグマを計算
    mu = df[column].mean()
    sigma = df[column].std()
    xlim = (max(mu - 3 * sigma, 0), round(mu + 3 * sigma))
    print(f" mu: {mu}, sigma: {sigma}")
    print(f" xlim: {xlim}")

    # xlimの範囲でデータをフィルタリング
    filtered_df = filtered_df.loc[
        (filtered_df[column] >= xlim[0]) & (filtered_df[column] <= xlim[1])
    ]


    piv_df = filtered_df.pivot_table(
        index=column, columns=binary_category_name, aggfunc="count", values="text"
    )
    piv_df = piv_df.fillna(0)
    # 割合に変換?
    piv_df = piv_df.div(piv_df.sum(axis=1), axis=0)

    piv_df.plot(
        kind="bar",
        stacked=True,
        ax=ax,
    )

    ax.set_xlabel(column)
    ax.set_ylabel("割合")
    ax.set_title(f"{column}とラベルの割合の関係")

    fig.savefig(f"database/images/scbi_v2-label_ratio_by_{column}.png")


plot_label_ratio_by_column("word_count", scbi_df)
# 文字数が多いほど付加的情報の割合が高い という当たり前のことを示した

In [None]:
# sn-echoes avarage unique word ratio
# (unique word count divided by the the number of total words)
def avg_unique_word_ratio(df: pd.DataFrame):
    # unique_word_ratio を video (game, half) ごとに計算して平均を取る
    filtered_df = df.loc[df['text'].notnull()].copy()

    group_per_video = filtered_df.groupby(['game', 'half'])
    text_chunk = group_per_video['text'].apply(lambda x: ' '.join(x))
    unique_word_ratio = text_chunk.apply(
        lambda x: len(set(x.split())) / len(x.split())
    )

    # 平均を取る
    avg_unique_word_ratio = unique_word_ratio.mean()
    print(f"Average unique word ratio: {avg_unique_word_ratio:.3f}")
avg_unique_word_ratio(scbi_df)

# 結果、Average unique word ratio: 0.292
# sn-echoesは large-v3で0.370だった
# 低い->書き起こしの品質が悪いと、sn-echoesでは仮定していたけど、そもそもそれは間違いだと考えるべき

In [20]:
# sn-echoes にある、述語目的語の割合 円グラフを scbi-v2 に適用してみる
from collections import Counter

import nltk
import plotly.express as px
from nltk import pos_tag, word_tokenize
from nltk.corpus import (
    stopwords,  # Import the stop word list
    wordnet,
)

# 必要なデータをダウンロード
nltk.download('all')

nltk.data.path.append('/Users/heste/nltk_data')

stops = set(stopwords.words("english"))
stemmer = nltk.WordNetLemmatizer()

# 品詞タグをWordNetの形式に変換する関数
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None


# 動詞-目的語ペアを抽出する関数
def extract_vn_nltk(text):
    """
    1. 文章をトークン化
    2. 品詞タグ付け
    3. stopwordを除外 3文字以上 動詞を抽出
    4. 動詞を見出し語化
    5. 動詞の後に名詞がある場合、動詞-名詞ペアを抽出
    """
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)

    vn_pairs = []
    for i, word_pos in enumerate(tagged):
        word, pos = word_pos

        #  動詞を探す stopwordを除外 3文字以上
        if pos.startswith('VB') and (word.lower() not in stops) and (len(word) >= 3):
            # 目的語となる名詞 (名詞が動詞の後にくる場合を探す)
            if i + 1 < len(tagged) and tagged[i + 1][1] in ['NN', 'NNS']:
                noun = tagged[i + 1][0].lower()
                # 動詞のみ レンマタイズ
                word = stemmer.lemmatize(word.lower(), pos=get_wordnet_pos(pos))
                vn_pairs.append((word, noun))

    return vn_pairs


def plot_vn_ratio(df: pd.DataFrame, only_bi: bool = False):
    # サンプルデータフレームのテキスト列から主語-動詞ペアを抽出
    filtered_df = df.loc[df['text'].notnull()].copy()

    if only_bi:
        # only background information
        filtered_df = filtered_df.loc[filtered_df[binary_category_name] == 1]

    # pandasのprogress_applyを使うために必要
    tqdm.pandas()

    filtered_df['vn_pairs'] = filtered_df['text'].progress_apply(extract_vn_nltk)

    print(filtered_df['vn_pairs'].str.len().describe())

    # 動詞ごとの集計
    vn_list = [
        (v, n)
        for pairs in filtered_df['vn_pairs']
        for v, n in pairs
        if len(pairs) > 0
    ]

    # 動詞の上位20個を取得
    v_list = [v for v, _ in vn_list]
    top_20_verbs = [v for v, _ in Counter(v_list).most_common(20)]

    print(f"{top_20_verbs=}")

    # 動詞ごとの目的語の上位を制限して取得
    filtered_pairs = []
    for verb in top_20_verbs:
        nouns = []
        for v, n in vn_list:
            if v == verb:
                nouns.append(n)

        noun_counts = Counter(nouns)
        print(f"{noun_counts=}")
        # 動詞の順位に応じた目的語の表示数を設定
        rank = top_20_verbs.index(verb) + 1
        if rank <= 3:
            top_nouns = noun_counts.most_common(4)  # 上位3位までの動詞には4個の名詞
        elif rank <= 5:
            top_nouns = noun_counts.most_common(3)  # 上位5位までの動詞には3個の名詞
        else:
            top_nouns = noun_counts.most_common(1)  # それ以外の動詞には1個の名詞

        print(f"{verb=}, {top_nouns=}")
        # 結果をリストに追加
        for noun, count in top_nouns:
            filtered_pairs.append((verb, noun, count))

    sunburst_df = pd.DataFrame(filtered_pairs, columns=['verb', 'noun', 'count'])

    # サンバーストチャートの作成
    fig = px.sunburst(
        sunburst_df,
        path=['verb', 'noun'],   # 動詞をメインセグメント、目的語をサブセグメントとして設定
        values='count',          # 頻度データ
        color='verb',            # 動詞ごとに色分け
        title="Verb-Noun 共起関係",
    )
    fig.show()

# plot_vn_ratio(scbi_df)

# take advantageが以上に多い
# 偏りがありそうなことがわかった


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/heste/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/heste/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/heste/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/heste/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/heste/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_dat

In [21]:
plot_vn_ratio(scbi_df, only_bi=True)

100%|██████████| 62668/62668 [00:24<00:00, 2603.19it/s]


count    62668.000000
mean         0.062520
std          0.264526
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          5.000000
Name: vn_pairs, dtype: float64
top_20_verbs=['take', 'get', 'make', 'score', 'give', 'win', 'lose', 'play', 'start', 'say', 'see', 'leave', 'put', 'create', 'attack', 'find', 'set', 'keep', 'repeat', 'change']
noun_counts=Counter({'advantage': 187, 'care': 28, 'place': 26, 'part': 14, 'charge': 12, 'something': 8, 'control': 7, 'risks': 6, 'time': 6, 'possession': 5, 'goal': 4, 'speed': 4, 'shape': 2, 'anything': 2, 'penalty': 2, 'hold': 2, 'confidence': 2, 'air': 2, 'height': 2, 'crosses': 1, 'corners': 1, 'shots': 1, 'nothing': 1, 'delaney': 1, 'office': 1, 'penalties': 1, 'exception': 1, 'sides': 1, 'chances': 1, 'numbers': 1, 'eye': 1, 'videos': 1, 'positions': 1, 'options': 1, 'refuge': 1, 'everything': 1, '5-0': 1, 'support': 1, 'knowledge': 1, 'home': 1, 'position': 1, 'momentum': 1, 'notes': 1, 'minutes':

In [None]:
# TODO
# scbiをリッチにする
