In [25]:
import os
import glob
import json
import pandas
from tqdm import tqdm 
from collections import defaultdict
from src.models.video.labels import EVENT_DICTIONARY_V2
from src.preprocessing.video import(
    json_reader,
    round_down_to_minute,
    round_down_to_minute_v2,
    normalize_path,
    round_down_to_minute_half
)

In [26]:
# Map every event name to an integer class index; the index is the position of
# the key in EVENT_DICTIONARY_V2.
EVENT_DICT = {key: index for index, key in enumerate(EVENT_DICTIONARY_V2.keys())}

# Root folder that holds one sub-folder per game.
basedir = "F:/video_classification"

# Keep only real sub-directories (a stray file in the root would break the
# per-game processing later) and sort them so the run order is reproducible.
video_path = sorted(
    name for name in os.listdir(basedir)
    if os.path.isdir(f"{basedir}/{name}")
)
video_paths = [f"{basedir}/{name}" for name in video_path]

# First JSON file found in each game folder, in the same order as `video_path`.
labels = []
for name in video_path:
    json_matches = glob.glob(f"{basedir}/{name}/*.json")
    if not json_matches:
        # Fail with the offending folder in the message instead of a bare IndexError.
        raise FileNotFoundError(f"No annotation JSON found in {basedir}/{name}")
    labels.append(normalize_path(json_matches[0]))

In [27]:
# Labels that could be excluded from the dataset. Currently empty and not
# consulted by the code below.
NO_NEEDED_CLASSES = []


def _group_labels_by_minute(annotations):
    """Group annotation labels by the minute key of their game clock.

    Exact duplicate annotations are removed first (every annotation value must
    be hashable for that). Each annotation's ``gameTime`` is split on ``' - '``
    and the last part is handed to ``round_down_to_minute`` when ``gameTime``
    starts with ``"1"``, otherwise to ``round_down_to_minute_v2``.
    NOTE(review): ``gameTime`` looks like ``"<half> - MM:SS"`` and the ``_v2``
    helper presumably maps second-half clocks onto the full-match timeline;
    confirm in ``src.preprocessing.video``.

    Returns a dict ``{minute_key: [label, ...]}`` without the keys that start
    with ``"-"``.
    """
    unique_annotations = sorted(
        (dict(pairs) for pairs in {tuple(anno.items()) for anno in annotations}),
        key=lambda anno: anno['gameTime'],
    )
    grouped = defaultdict(list)
    for anno in unique_annotations:
        clock = anno['gameTime'].split(' - ')[-1]
        if anno['gameTime'].startswith("1"):
            minute_key = round_down_to_minute(clock)
        else:
            minute_key = round_down_to_minute_v2(clock)
        grouped[minute_key].append(anno['label'])
    # Keys starting with "-" are discarded (presumably negative clock values; confirm).
    return {key: found for key, found in grouped.items() if not key.startswith("-")}


def _fill_missing_minutes(grouped):
    """Return ``grouped`` ordered by minute, with gaps between minutes filled.

    Only minutes that contain at least one event are present in ``grouped``.
    Every minute strictly between two consecutive event minutes is inserted as
    ``"MM:00"`` with the placeholder label ``"Event not recognized"``. Minutes
    before the first event minute or after the last one are NOT added.
    """
    ordered_keys = sorted(grouped, key=lambda key: int(key.split(":")[0]))
    filled = {}
    previous_minute = None
    for key in ordered_keys:
        current_minute = int(key.split(":")[0])
        if previous_minute is not None:
            # The range is empty when the two minutes are adjacent or equal.
            for missing_minute in range(previous_minute + 1, current_minute):
                filled[f"{missing_minute:02}:00"] = ["Event not recognized"]
        filled[key] = grouped[key]
        previous_minute = current_minute
    return filled


def _encode_labels(label_list):
    """Turn a list of label names into a ``", "``-joined string of class indices.

    Indices come out in ``EVENT_DICT`` order. An empty list falls back to the
    last index of ``EVENT_DICT``. Names that are not keys of ``EVENT_DICT`` are
    silently dropped, so a non-empty list of unknown names gives ``""``.
    NOTE(review): confirm that "Event not recognized" is a key of EVENT_DICT.
    """
    if not label_list:
        return str(list(EVENT_DICT.values())[-1])
    return ", ".join(
        str(index) for name, index in EVENT_DICT.items() if name in label_list
    )


def _chunk_files(row, extension):
    """List the ``*.<extension>`` files in the chunk folder for a row's minute."""
    minute = int(row['timestamp'].split(':')[0])
    return glob.glob(f"{row['video_base_dir']}/chunk_{minute}/*.{extension}")


def get_video_path(row):
    """Return the list of ``.mp4`` files found for the row's chunk (may be empty)."""
    return _chunk_files(row, "mp4")


def get_audio_path(row):
    """Return the list of ``.mp3`` files found for the row's chunk (may be empty)."""
    return _chunk_files(row, "mp3")


# Build one CSV per game, written as <game folder>/<game folder name>.csv.
for games in tqdm(video_paths):
    anno_path = normalize_path(os.path.join(games, 'Labels-v2.json'))
    annos = json_reader(anno_path)['annotations']
    # Annotations are not filtered by visibility or by NO_NEEDED_CLASSES here.

    # Multi-label: keep each label once per minute. Sorting (instead of a bare
    # list(set(...))) keeps the written `labels` column identical between runs.
    result = {
        minute_key: sorted(set(found))
        for minute_key, found in _fill_missing_minutes(
            _group_labels_by_minute(annos)
        ).items()
    }

    df = pandas.DataFrame({
        'chunk': [int(key.split(":")[0]) for key in result],
        'video_base_dir': games,
        "timestamp": list(result),
        "labels": list(result.values()),
    })
    df['labels_encoder'] = df['labels'].apply(_encode_labels)
    # Both path columns hold the raw glob result, i.e. a list per row.
    df['video_path'] = df.apply(get_video_path, axis=1)
    df['audio_path'] = df.apply(get_audio_path, axis=1)

    df.to_csv(f"{games}/{games.split('/')[-1]}.csv", index=False)
    

# basedir = "F:/video_classification"
# csv_files = [f"{basedir}/{i}/{i}.csv" for i in os.listdir(basedir)]

# import pandas as pd
# import glob

# # Read and concatenate all CSVs
# df2 = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
# # df2 = df2[~df2['video_path'].apply(lambda x: x == [])]

# video_ids = []
# for i in df2['video_path']:
#     ites = i.replace('[', '').replace(']', '')
#     ites = ites.replace("'", '').replace("'", '')
#     video_ids.append(normalize_path(ites))
# df2['video_id'] = video_ids

# audio_ids = []
# for i in df2['audio_path']:
#     ites = i.replace('[', '').replace(']', '')
#     ites = ites.replace("'", '').replace("'", '')
#     audio_ids.append(normalize_path(ites))
# df2['audio_id'] = audio_ids

# df2 = df2[['chunk', 'video_base_dir', 'video_id', 'audio_id', 'labels_encoder']].rename(columns={'labels_encoder': 'labels'})
# # Save the merged CSV
# df2.to_csv("dataset.csv", index=False, sep = ";")

# print("CSV files concatenated successfully!")

100%|██████████| 91/91 [00:01<00:00, 58.80it/s]


In [28]:
import os
import pandas
import csv
import shutil
from src.preprocessing.video import normalize_path
# Reconcile, for every game, the chunk folders on disk with the rows of its CSV.
# WARNING: this cell deletes folders and rewrites CSVs in place.
basedir = "F:/video_classification"

# Games whose chunk count is at or below this value are removed entirely.
MIN_CHUNKS = 20

for vdo in tqdm(os.listdir(basedir)):
    vdo_path = normalize_path(os.path.join(basedir, vdo))
    if not os.path.isdir(vdo_path):
        # Stray files in the base folder are not games; skip them.
        continue

    csv_path = f"{vdo_path}/{vdo}.csv"
    # Read once and reuse for both the count and the optional trim below.
    game_df = pandas.read_csv(csv_path, sep=",")

    # Both counts are reduced by one before they are compared.
    # NOTE(review): this treats them as "last index" values; confirm that chunk
    # folders are numbered from 0 and that rows and chunks line up.
    count_chunk = sum(
        os.path.isdir(f"{vdo_path}/{entry}") for entry in os.listdir(vdo_path)
    ) - 1
    count_csv = len(game_df) - 1
    print(count_chunk, count_csv, vdo)

    if count_chunk <= MIN_CHUNKS:
        # Too few chunks: drop the whole game folder.
        shutil.rmtree(vdo_path)
    elif count_chunk > count_csv:
        # More chunks than rows: remove chunk_<count_csv> .. chunk_<count_chunk>.
        for chunk_index in range(count_csv, count_chunk + 1):
            chunk_dir = f"{vdo_path}/chunk_{chunk_index}"
            if os.path.exists(chunk_dir):
                shutil.rmtree(chunk_dir)
    elif count_chunk < count_csv:
        # More rows than chunks: cut the surplus rows off the end of the CSV.
        surplus_rows = count_csv - count_chunk
        game_df.iloc[:-surplus_rows].to_csv(csv_path, index=False)

 13%|█▎        | 12/91 [00:00<00:00, 117.62it/s]

91 91 2014-11-04 - 20-00 Zenit Petersburg 1 - 2 Bayer Leverkusen
89 89 2015-02-14 - 20-00 Real Madrid 2 - 0 Dep. La Coruna
92 93 2015-02-24 - 22-45 Manchester City 1 - 2 Barcelona
92 93 2015-03-10 - 22-45 Real Madrid 3 - 4 Schalke
89 89 2015-03-17 - 22-45 Monaco 0 - 2 Arsenal
86 87 2015-04-15 - 21-45 FC Porto 3 - 1 Bayern Munich
91 92 2015-04-18 - 21-00 Real Madrid 3 - 1 Malaga
93 94 2015-04-22 - 21-45 Real Madrid 1 - 0 Atl. Madrid
91 91 2015-04-25 - 17-00 Espanyol 0 - 2 Barcelona
89 90 2015-04-29 - 21-00 Real Madrid 3 - 0 Almeria
89 90 2015-04-29 - 21-45 Juventus 3 - 2 Fiorentina
87 88 2015-05-02 - 17-00 Cordoba 0 - 8 Barcelona
92 92 2015-05-05 - 21-45 Juventus 2 - 1 Real Madrid
91 92 2015-05-09 - 16-30 Bayern Munich 0 - 1 FC Augsburg
91 92 2015-05-09 - 19-00 Barcelona 2 - 0 Real Sociedad
90 91 2015-08-16 - 18-00 Manchester City 3 - 0 Chelsea
89 89 2015-08-23 - 15-30 West Brom 2 - 3 Chelsea
95 96 2015-08-29 - 17-00 Liverpool 0 - 3 West Ham
89 89 2015-08-29 - 19-30 Bayern Munich 3 - 0 

 43%|████▎     | 39/91 [00:00<00:00, 125.37it/s]

97 97 2015-09-26 - 19-30 Newcastle Utd 2 - 2 Chelsea
90 90 2015-09-27 - 21-45 Inter 1 - 4 Fiorentina
89 89 2015-09-29 - 21-45 Bayern Munich 5 - 0 D. Zagreb
91 91 2015-10-03 - 19-30 Chelsea 1 - 3 Southampton
86 87 2015-10-24 - 16-30 Bayern Munich 4 - 0 FC Koln
91 92 2015-11-03 - 22-45 Real Madrid 1 - 0 Paris SG
90 90 2015-11-03 - 22-45 Sevilla 1 - 3 Manchester City
89 90 2015-11-07 - 20-30 Stoke City 1 - 0 Chelsea
93 93 2015-11-08 - 17-30 Dortmund 3 - 2 Schalke
90 91 2015-11-08 - 19-00 Arsenal 1 - 1 Tottenham
89 89 2015-11-25 - 22-45 Shakhtar Donetsk 3 - 4 Real Madrid
88 89 2016-02-03 - 22-45 Watford 0 - 0 Chelsea
87 88 2016-03-01 - 22-45 Norwich 1 - 2 Chelsea
90 90 2016-04-05 - 21-45 Bayern Munich 1 - 0 Benfica
89 89 2016-05-08 - 18-00 Real Madrid 3 - 2 Valencia
89 89 2016-05-14 - 18-00 Dep. La Coruna 0 - 2 Real Madrid
92 93 2016-08-27 - 14-30 Tottenham 1 - 1 Liverpool
93 93 2016-08-27 - 21-45 Napoli 4 - 2 AC Milan
90 91 2016-08-28 - 21-45 Monaco 3 - 1 Paris SG
89 89 2016-09-10 - 19-30

 75%|███████▍  | 68/91 [00:00<00:00, 127.53it/s]

90 90 2016-10-01 - 19-30 Bayer Leverkusen 2 - 0 Dortmund
89 90 2016-10-02 - 21-45 AS Roma 2 - 1 Inter
89 89 2016-10-15 - 14-30 Chelsea 3 - 0 Leicester
87 88 2016-11-01 - 20-45 Besiktas 1 - 1 Napoli
89 89 2016-11-01 - 22-45 Manchester City 3 - 1 Barcelona
89 89 2016-11-05 - 17-30 Hamburger SV 2 - 5 Dortmund
89 90 2016-11-19 - 20-30 Dortmund 1 - 0 Bayern Munich
92 93 2016-11-20 - 17-00 Atalanta 2 - 1 AS Roma
88 89 2016-11-23 - 22-45 Arsenal 2 - 2 Paris SG
87 88 2016-11-26 - 18-15 Real Madrid 2 - 1 Gijon
89 89 2016-11-26 - 22-45 Empoli 1 - 4 AC Milan
89 89 2016-11-30 - 23-00 Paris SG 2 - 0 Angers
94 94 2016-12-04 - 17-00 Lazio 0 - 2 AS Roma
92 92 2016-12-16 - 22-30 Hoffenheim 2 - 2 Dortmund
89 89 2016-12-18 - 22-45 Barcelona 4 - 1 Espanyol
89 89 2017-01-08 - 17-00 Genoa 0 - 1 AS Roma
94 94 2017-01-21 - 15-30 Liverpool 2 - 3 Swansea
89 89 2017-01-21 - 17-30 SV Werder Bremen 1 - 2 Dortmund
90 91 2017-01-29 - 17-00 Sampdoria 3 - 2 AS Roma
89 89 2017-01-29 - 19-30 1. FSV Mainz 05 1 - 1 Dortmu

100%|██████████| 91/91 [00:00<00:00, 128.33it/s]

93 94 2017-03-08 - 22-45 Barcelona 6 - 1 Paris SG
89 89 2017-03-12 - 22-45 Real Madrid 2 - 1 Betis
89 89 2017-04-01 - 21-45 AS Roma 2 - 0 Empoli
89 89 2017-04-02 - 17-15 Real Madrid 3 - 0 Alaves
88 89 2017-04-08 - 21-45 Malaga 2 - 0 Barcelona
90 91 2017-04-12 - 21-45 Bayern Munich 1 - 2 Real Madrid
89 89 2017-04-26 - 20-30 Barcelona 7 - 1 Osasuna
89 89 2017-04-29 - 16-30 Dortmund 0 - 0 FC Koln
89 89 2017-04-30 - 21-45 Inter 0 - 1 Napoli
89 89 2017-05-02 - 21-45 Real Madrid 3 - 0 Atl. Madrid
92 93 2017-05-06 - 17-00 Leicester 3 - 0 Watford
86 87 2017-05-20 - 21-45 Napoli 4 - 1 Fiorentina





In [30]:

import glob
import pandas as pd

basedir = "F:/video_classification"
csv_files = [f"{basedir}/{game}/{game}.csv" for game in os.listdir(basedir)]

# Stack every per-game CSV into one frame with a fresh 0..n-1 index.
df2 = pd.concat([pd.read_csv(csv_file) for csv_file in csv_files], ignore_index=True)


def _listing_to_path(cell):
    """Strip ``[``, ``]`` and ``'`` from the text of ``cell`` and normalise it.

    The path columns were written to CSV as the text of a Python list, so this
    removes the list punctuation and passes the remainder to ``normalize_path``.
    """
    stripped = str(cell).translate(str.maketrans('', '', "[]'"))
    return normalize_path(stripped)


# Rows whose listing is empty are kept; they are not filtered out here.
video_ids = [_listing_to_path(cell) for cell in df2['video_path']]
audio_ids = [_listing_to_path(cell) for cell in df2['audio_path']]
df2['video_id'] = video_ids
df2['audio_id'] = audio_ids

# Keep only the final columns; the encoded labels become `labels`.
final_columns = ['chunk', 'video_base_dir', 'video_id', 'audio_id', 'labels_encoder']
df2 = df2[final_columns].rename(columns={'labels_encoder': 'labels'})

# Write the merged dataset, semicolon-separated.
df2.to_csv("dataset2.csv", index=False, sep=";")

# print("CSV files concatenated successfully!")
# df2

In [None]:
# Rebuild the merged frame from the per-game CSVs instead of reusing `df2`.
# Once `df2` has been reduced to the final five columns it no longer has
# `video_path` / `audio_path`, so reading them from it raises KeyError.
basedir = "F:/video_classification"
csv_files = [f"{basedir}/{game}/{game}.csv" for game in os.listdir(basedir)]
merged = pandas.concat(
    [pandas.read_csv(csv_file) for csv_file in csv_files],
    ignore_index=True,
)

# The path columns hold the text of a Python list; delete the list punctuation
# ("[", "]" and "'") and normalise what is left.
list_punctuation = str.maketrans('', '', "[]'")

video_ids = [
    normalize_path(str(cell).translate(list_punctuation))
    for cell in merged['video_path']
]
merged['video_id'] = video_ids

audio_ids = [
    normalize_path(str(cell).translate(list_punctuation))
    for cell in merged['audio_path']
]
merged['audio_id'] = audio_ids

# Keep only the final columns; the encoded labels become `labels`.
df2 = merged[['chunk', 'video_base_dir', 'video_id', 'audio_id', 'labels_encoder']].rename(columns={'labels_encoder': 'labels'})
# Save the merged CSV
df2.to_csv("dataset2.csv", index=False, sep = ";")

print("CSV files concatenated successfully!")