In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import json

# 读取数据集
file_name = 'data/pbpstats_2000.csv'
df = pd.read_csv(file_name)

# 反转DataFrame
df = df.iloc[::-1]

# 转换时间格式并调整时间
df['ENDTIME'] = pd.to_datetime(df['ENDTIME'], format='%M:%S')
df['STARTTIME'] = pd.to_datetime(df['STARTTIME'], format='%M:%S')
df.loc[df['PERIOD'] == 1, ['ENDTIME', 'STARTTIME']] += pd.to_timedelta('36 minutes')
df.loc[df['PERIOD'] == 2, ['ENDTIME', 'STARTTIME']] += pd.to_timedelta('24 minutes')
df.loc[df['PERIOD'] == 3, ['ENDTIME', 'STARTTIME']] += pd.to_timedelta('12 minutes')
df['ENDTIME'] = df['ENDTIME'].dt.strftime('%M:%S')
df['STARTTIME'] = df['STARTTIME'].dt.strftime('%M:%S')

# 在时间调整之后对数据进行排序
df['ENDTIME'] = pd.to_datetime(df['ENDTIME'], format='%M:%S')
df['STARTTIME'] = pd.to_datetime(df['STARTTIME'], format='%M:%S')

# 对每场比赛的事件进行排序
df.sort_values(by=['GAMEID', 'STARTTIME'], ascending=[True, False], inplace=True)

# 将时间转换回字符串格式，以便之后的处理
df['ENDTIME'] = df['ENDTIME'].dt.strftime('%M:%S')
df['STARTTIME'] = df['STARTTIME'].dt.strftime('%M:%S')

# 识别每场比赛中的两支球队并统一分差视角
for gameid in df['GAMEID'].unique():
    game_data = df[df['GAMEID'] == gameid]

    # 确定两支球队（假设每场比赛只有两支球队参与）
    teams = game_data['OPPONENT'].unique()
    if len(teams) != 2:
        raise ValueError(f"Unexpected number of teams in game {gameid}")

    # 选择球队1作为参考
    team1 = teams[0]

    # 调整 STARTSCOREDIFFERENTIAL，使其始终从球队1的视角表示
    for index, row in game_data.iterrows():
        if row['OPPONENT'] != team1:
            df.at[index, 'STARTSCOREDIFFERENTIAL'] = -row['STARTSCOREDIFFERENTIAL']

# 创建 ABSOLUTESCOREDIFFERENTIAL 列
df['ABSOLUTESCOREDIFFERENTIAL'] = df['STARTSCOREDIFFERENTIAL']

In [2]:
#### identify a single testgame
testgame = df[df['GAMEID']==20001109]
testgame.head(40)

Unnamed: 0,ENDTIME,EVENTS,FG2A,FG2M,FG3A,FG3M,GAMEDATE,GAMEID,NONSHOOTINGFOULSTHATRESULTEDINFTS,OFFENSIVEREBOUNDS,OPPONENT,PERIOD,SHOOTINGFOULSDRAWN,STARTSCOREDIFFERENTIAL,STARTTIME,STARTTYPE,TURNOVERS,DESCRIPTION,URL,ABSOLUTESCOREDIFFERENTIAL
47766,47:49,MISS Webber 18' Jump Shot\nSabonis REBOUND (Of...,1,0,0,0,2001-04-08,20001109,0,0,POR,1,0,0,48:00,Off Dead Ball,0,Sabonis REBOUND (Off:0 Def:1),,0
47765,47:49,MISS Webber 18' Jump Shot\nSabonis REBOUND (Of...,1,0,0,0,2001-04-08,20001109,0,0,POR,1,0,0,48:00,Off Dead Ball,0,MISS Webber 18' Jump Shot,,0
47964,47:25,MISS Sabonis 8' Hook Shot\nDivac REBOUND (Off:...,1,0,0,0,2001-04-08,20001109,0,0,SAC,1,0,0,47:49,Off Long Mid-Range Miss,0,Divac REBOUND (Off:0 Def:1),,0
47963,47:25,MISS Sabonis 8' Hook Shot\nDivac REBOUND (Off:...,1,0,0,0,2001-04-08,20001109,0,0,SAC,1,0,0,47:49,Off Long Mid-Range Miss,0,MISS Sabonis 8' Hook Shot,,0
47764,47:14,MISS Divac 17' Jump Shot\nStojakovic REBOUND (...,2,1,0,0,2001-04-08,20001109,0,1,POR,1,0,0,47:25,Off Short Mid-Range Miss,0,Stojakovic Reverse Layup (2 PTS),,0
47763,47:14,MISS Divac 17' Jump Shot\nStojakovic REBOUND (...,2,1,0,0,2001-04-08,20001109,0,1,POR,1,0,0,47:25,Off Short Mid-Range Miss,0,Stojakovic REBOUND (Off:1 Def:0),,0
47762,47:14,MISS Divac 17' Jump Shot\nStojakovic REBOUND (...,2,1,0,0,2001-04-08,20001109,0,1,POR,1,0,0,47:25,Off Short Mid-Range Miss,0,MISS Divac 17' Jump Shot,,0
47962,46:57,Divac STEAL (1 STL): Wallace Lost Ball Turnove...,0,0,0,0,2001-04-08,20001109,0,0,SAC,1,0,2,47:14,Off At Rim Make,1,Divac STEAL (1 STL): Wallace Lost Ball Turnove...,,2
47761,46:56,Divac Lost Ball Turnover (P1.T1)\n,0,0,0,0,2001-04-08,20001109,0,0,POR,1,0,2,46:57,Off Steal,1,Divac Lost Ball Turnover (P1.T1),,2
47961,46:41,MISS Wallace 20' Jump Shot\nWebber REBOUND (Of...,1,0,0,0,2001-04-08,20001109,0,0,SAC,1,0,2,46:56,Off Dead Ball,0,Webber REBOUND (Off:0 Def:1),,2


In [3]:
# 函数：将时间字符串转换为总秒数
def time_str_to_seconds(time_str):
    minutes, seconds = map(int, time_str.split(':'))
    return minutes * 60 + seconds

# 初始化字典以存储所有比赛的连续领先情况
all_games_leads = {}

# 处理每场比赛
for gameid in df['GAMEID'].unique():
    game_data = df[df['GAMEID'] == gameid].sort_values(by='STARTTIME')
    current_state = None
    consecutive_leads = []

    for index, row in game_data.iterrows():
        margin = row['ABSOLUTESCOREDIFFERENTIAL']

        # 检查领先状态是否改变
        if margin > 0:
            state = 'positive'
        elif margin < 0:
            state = 'negative'
        else:
            state = 'neutral'

        if current_state != state:
            if current_state is not None and consecutive_leads:
                # 结束上一个领先段落，只有当列表不为空时
                consecutive_leads[-1]['end_time'] = row['STARTTIME']
            if state != 'neutral':
                # 开始新的领先段落
                consecutive_leads.append({'start_time': row['STARTTIME'],
                                          'end_time': None,
                                          'state': state,
                                          'maxlead': abs(margin),
                                          'maxlead_time': row['STARTTIME']})

        current_state = state

        # 更新连续领先的最大领先分差及其时间
        if consecutive_leads:
            if abs(margin) > consecutive_leads[-1]['maxlead']:
                consecutive_leads[-1]['maxlead'] = abs(margin)
                consecutive_leads[-1]['maxlead_time'] = row['STARTTIME']

    # 确保最后一个领先段落有结束时间
    if consecutive_leads and consecutive_leads[-1]['end_time'] is None:
        consecutive_leads[-1]['end_time'] = game_data.iloc[-1]['ENDTIME']

    all_games_leads[gameid] = consecutive_leads

In [4]:
# 重新格式化结果以包括每次领先的分界线时间和计算建立和消失时间
formatted_results = {}
for gameid, leads in all_games_leads.items():
    formatted_leads = []
    for lead in leads:
        start_time_seconds = time_str_to_seconds(lead['start_time'])
        maxlead_time_seconds = time_str_to_seconds(lead['maxlead_time'])
        end_time_seconds = time_str_to_seconds(lead['end_time'])

        # 计算领先建立和消失的时间
        build_time = maxlead_time_seconds - start_time_seconds if maxlead_time_seconds > start_time_seconds else 0
        disappear_time = end_time_seconds - maxlead_time_seconds if end_time_seconds > maxlead_time_seconds else 0

        formatted_lead = {
            'id': gameid,
            'maxlead': lead['maxlead'],
            'dividing_line': lead['maxlead_time'],
            'starttime': lead['start_time'],
            'endtime': lead['end_time'],
            'state': lead['state'],
            'buildtime': build_time,
            'disappeartime': disappear_time
        }
        formatted_leads.append(formatted_lead)

    formatted_results[gameid] = formatted_leads

In [5]:
# 打印第一场比赛的连续领先情况（作为示例）
first_game_id = next(iter(formatted_results))
print("Formatted Leads - Game ID:", first_game_id)
for lead in formatted_results[first_game_id]:
    print(lead)

Formatted Leads - Game ID: 20000001
{'id': 20000001, 'maxlead': 32, 'dividing_line': '10:25', 'starttime': '00:07', 'endtime': '40:33', 'state': 'negative', 'buildtime': 618, 'disappeartime': 1808}
{'id': 20000001, 'maxlead': 2, 'dividing_line': '40:33', 'starttime': '40:33', 'endtime': '41:55', 'state': 'positive', 'buildtime': 0, 'disappeartime': 82}
{'id': 20000001, 'maxlead': 4, 'dividing_line': '42:31', 'starttime': '41:55', 'endtime': '43:57', 'state': 'negative', 'buildtime': 36, 'disappeartime': 86}
{'id': 20000001, 'maxlead': 2, 'dividing_line': '43:57', 'starttime': '43:57', 'endtime': '44:31', 'state': 'negative', 'buildtime': 0, 'disappeartime': 34}
{'id': 20000001, 'maxlead': 3, 'dividing_line': '44:31', 'starttime': '44:31', 'endtime': '45:11', 'state': 'negative', 'buildtime': 0, 'disappeartime': 40}
{'id': 20000001, 'maxlead': 1, 'dividing_line': '45:11', 'starttime': '45:11', 'endtime': '46:08', 'state': 'positive', 'buildtime': 0, 'disappeartime': 57}
{'id': 20000001,

In [6]:
import numpy as np

def convert_numpy_int(obj):
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, dict):
        return {key: convert_numpy_int(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_numpy_int(element) for element in obj]
    else:
        return obj

# 转换 formatted_results 字典中的所有 NumPy int64 类型的值
converted_formatted_results = convert_numpy_int(formatted_results)

# 转换键为字符串并保存为 JSON 文件
converted_all_games_leads = {str(gameid): leads for gameid, leads in converted_formatted_results.items()}
output_file = 'data_handled/2000.json'
with open(output_file, 'w') as file:
    json.dump(converted_all_games_leads, file)