In [9]:
import json
import time

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from pydantic import BaseModel
from typing import List, Optional

from selenium import webdriver

from supabase import create_client, Client

In [10]:
# Start a Chrome browser session controlled through Selenium WebDriver.
driver = webdriver.Chrome()

In [11]:
# URL of the WhoScored live match-centre page that the notebook scrapes.
whoscored_url = "https://www.whoscored.com/Matches/1729226/Live/England-Premier-League-2023-2024-Manchester-United-Manchester-City"

In [12]:
# Navigate the browser session to the match page.
driver.get(whoscored_url)

In [19]:
# Parse the HTML currently held by the browser using Python's built-in 'html.parser'.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [20]:
# Pick the first <script> tag whose text contains "matchCentreData".
# select_one returns None when nothing matches.
element = soup.select_one('script:-soup-contains("matchCentreData")')

In [21]:
# Extract the JSON object assigned to `matchCentreData` inside the inline script.
if element is None:
    raise ValueError("No <script> tag containing 'matchCentreData' was found in the page source")

match_centre_marker = "matchCentreData: "
marker_pos = element.text.find(match_centre_marker)
if marker_pos == -1:
    raise ValueError("'matchCentreData: ' assignment not found in the script text")

# raw_decode reads exactly one JSON document and ignores whatever follows it,
# so the result does not depend on how the script line happens to be terminated.
match_centre_payload = element.text[marker_pos + len(match_centre_marker):].lstrip()
matchdict, _payload_end = json.JSONDecoder().raw_decode(match_centre_payload)

In [22]:
# List the top-level keys of the parsed match-centre payload.
matchdict.keys()

dict_keys(['playerIdNameDictionary', 'periodMinuteLimits', 'timeStamp', 'attendance', 'venueName', 'referee', 'weatherCode', 'elapsed', 'startTime', 'startDate', 'score', 'htScore', 'ftScore', 'etScore', 'pkScore', 'statusCode', 'periodCode', 'home', 'away', 'maxMinute', 'minuteExpanded', 'maxPeriod', 'expandedMinutes', 'expandedMaxMinute', 'periodEndMinutes', 'commonEvents', 'events', 'timeoutInSeconds'])

In [23]:
# Inspect one event record (index 55) to see the shape of a single entry.
matchdict['events'][55]

{'id': 2610512675.0,
 'eventId': 32,
 'minute': 3,
 'second': 6,
 'teamId': 167,
 'playerId': 303139,
 'x': 57.4,
 'y': 26.2,
 'expandedMinute': 3,
 'period': {'value': 1, 'displayName': 'FirstHalf'},
 'type': {'value': 1, 'displayName': 'Pass'},
 'outcomeType': {'value': 1, 'displayName': 'Successful'},
 'qualifiers': [{'type': {'value': 178, 'displayName': 'StandingSave'}},
  {'type': {'value': 156, 'displayName': 'LayOff'}},
  {'type': {'value': 140, 'displayName': 'PassEndX'}, 'value': '43.7'},
  {'type': {'value': 56, 'displayName': 'Zone'}, 'value': 'Back'},
  {'type': {'value': 212, 'displayName': 'Length'}, 'value': '15.2'},
  {'type': {'value': 141, 'displayName': 'PassEndY'}, 'value': '18.8'},
  {'type': {'value': 213, 'displayName': 'Angle'}, 'value': '3.48'}],
 'satisfiedEventsTypes': [91, 117, 30, 35, 38, 216, 218],
 'isTouch': True,
 'endX': 43.7,
 'endY': 18.8}

In [24]:
# Keep a direct reference to the list stored under the 'events' key.
match_events = matchdict['events']

In [25]:
# Build a DataFrame from the event records; keys absent from a record become missing values.
df = pd.DataFrame(match_events)

In [26]:
# Preview the first five events.
df.head()

Unnamed: 0,id,eventId,minute,second,teamId,x,y,expandedMinute,period,type,...,endY,relatedEventId,relatedPlayerId,blockedX,blockedY,goalMouthZ,goalMouthY,isShot,cardType,isGoal
0,2610509000.0,2,0,0.0,167,0.0,0.0,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 32, 'displayName': 'Start'}",...,,,,,,,,,,
1,2610509000.0,2,0,0.0,32,0.0,0.0,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 32, 'displayName': 'Start'}",...,,,,,,,,,,
2,2610509000.0,3,0,0.0,32,50.2,49.9,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}",...,46.3,,,,,,,,,
3,2610510000.0,4,0,1.0,32,34.8,47.9,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}",...,76.1,,,,,,,,,
4,2610510000.0,5,0,3.0,32,32.5,76.1,0,"{'value': 1, 'displayName': 'FirstHalf'}","{'value': 1, 'displayName': 'Pass'}",...,94.4,,,,,,,,,


In [27]:
# Discard events that are not attributed to a player (missing 'playerId').
df = df.dropna(subset=['playerId'])

In [28]:
# Replace every missing value with Python None (presumably so the rows serialise
# cleanly downstream; confirm against the upload step).
# The frame is cast to object first: on recent pandas versions `where(..., None)`
# leaves NaN in float columns, because None is coerced back to NaN for numeric dtypes.
df = df.astype(object).where(df.notna(), None)

In [29]:
# Show the column labels before they are renamed.
df.columns

Index(['id', 'eventId', 'minute', 'second', 'teamId', 'x', 'y',
       'expandedMinute', 'period', 'type', 'outcomeType', 'qualifiers',
       'satisfiedEventsTypes', 'isTouch', 'playerId', 'endX', 'endY',
       'relatedEventId', 'relatedPlayerId', 'blockedX', 'blockedY',
       'goalMouthZ', 'goalMouthY', 'isShot', 'cardType', 'isGoal'],
      dtype='object')

In [30]:
# Translate the camelCase event fields into snake_case column labels.
# Columns not listed here keep their original names.
snake_case_names = {
    'eventId': 'event_id',
    'teamId': 'team_id',
    'playerId': 'player_id',
    'expandedMinute': 'expanded_minute',
    'outcomeType': 'outcome_type',
    'isTouch': 'is_touch',
    'isShot': 'is_shot',
    'isGoal': 'is_goal',
    'cardType': 'card_type',
    'endX': 'end_x',
    'endY': 'end_y',
    'blockedX': 'blocked_x',
    'blockedY': 'blocked_y',
    'goalMouthZ': 'goal_mouth_z',
    'goalMouthY': 'goal_mouth_y',
}
df = df.rename(columns=snake_case_names)

In [31]:
def _display_name(labelled):
    """Return the 'displayName' entry of a {'value': ..., 'displayName': ...} mapping."""
    return labelled['displayName']


# Flatten the nested period / type / outcome_type dicts into plain label columns.
_display_name_sources = {
    'period_display_name': 'period',
    'type_display_name': 'type',
    'outcome_type_display_name': 'outcome_type',
}
for target_col, source_col in _display_name_sources.items():
    df[target_col] = df[source_col].map(_display_name)

In [32]:
# Peek at the nested 'type' column: each cell is a dict with 'value' and 'displayName'.
df['type'].head()

2    {'value': 1, 'displayName': 'Pass'}
3    {'value': 1, 'displayName': 'Pass'}
4    {'value': 1, 'displayName': 'Pass'}
5    {'value': 1, 'displayName': 'Pass'}
6    {'value': 1, 'displayName': 'Pass'}
Name: type, dtype: object

In [33]:
# The nested dict columns are no longer needed once their display names are extracted.
nested_columns = ["period", "type", "outcome_type"]
df = df.drop(columns=nested_columns)

In [34]:
# Keep only the columns listed below, in this order; every other column is dropped.
event_columns = [
    'id', 'event_id', 'minute', 'second', 'team_id', 'player_id',
    'x', 'y', 'end_x', 'end_y',
    'expanded_minute', 'qualifiers', 'is_touch',
    'blocked_x', 'blocked_y', 'goal_mouth_z', 'goal_mouth_y',
    'is_shot', 'card_type', 'is_goal',
    'type_display_name', 'outcome_type_display_name', 'period_display_name',
]
df = df[event_columns]