In [5]:
!pip install bs4
!pip install lxml

Collecting lxml
  Downloading lxml-6.0.1-cp312-cp312-win_amd64.whl.metadata (3.9 kB)
Downloading lxml-6.0.1-cp312-cp312-win_amd64.whl (4.0 MB)
   ---------------------------------------- 0.0/4.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/4.0 MB ? eta -:--:--
   ------------------ --------------------- 1.8/4.0 MB 8.4 MB/s eta 0:00:01
   ------------------------------------ --- 3.7/4.0 MB 8.4 MB/s eta 0:00:01
   ---------------------------------------- 4.0/4.0 MB 7.7 MB/s eta 0:00:00
Installing collected packages: lxml
Successfully installed lxml-6.0.1


In [14]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import re

# 1. URL and Setup
# Understat player page to scrape; change the trailing id to target another player.
url = 'https://understat.com/player/1250' # Mohamed Salah

# 2. Fetch and Parse the HTML
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (404, 5xx, ...) right here instead of
# letting an error page fall through to a misleading "Could not find shotsData.".
res = requests.get(url, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'html.parser')

# 3. Find and Extract the Shot Data JSON
# Use the markup of the first <script> tag that mentions 'shotsData'; default to
# an empty string so the regex below simply finds no match.
scripts = soup.find_all('script')
string_with_json = next(
    (markup for markup in map(str, scripts) if 'shotsData' in markup),
    '',
)

# Capture the single-quoted argument of the first JSON.parse('...') call.
json_data_match = re.search(r"JSON\.parse\('([^']+)'\)", string_with_json)
if json_data_match:
    encoded_json = json_data_match.group(1)
    # The captured text is a JavaScript string literal (presumably backslash-escaped,
    # which is why it goes through 'unicode_escape').  That codec reads its input
    # bytes as Latin-1, so encoding the text as UTF-8 first would turn any raw
    # non-ASCII character into mojibake ('é' -> 'Ã©').  Encoding as Latin-1
    # round-trips U+0080..U+00FF unchanged, and 'backslashreplace' rewrites anything
    # above U+00FF as a \uXXXX escape that unicode_escape decodes back losslessly.
    decoded_json = encoded_json.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    data = json.loads(decoded_json)
else:
    print("Could not find shotsData.")
    data = []

# 4. Build the shots DataFrame and coerce the numeric fields
if not data:
    print("No shot data was found to create a DataFrame.")
else:
    numeric_cols = ['X', 'Y', 'xG', 'minute', 'season', 'match_id']
    df = pd.DataFrame(data)
    # Values that cannot be parsed as numbers become NaN (errors='coerce').
    df = df.assign(
        **{col: pd.to_numeric(df[col], errors='coerce') for col in numeric_cols}
    )

    if df.empty:
        print("Data was found, but the DataFrame is empty.")
    else:
        # 5. Take the player name from the first record of the data itself
        #    rather than scraping it from the page header.
        player_name = df['player'].iloc[0]
        filename_name = player_name.replace(' ', '_')

        print(f"Successfully created DataFrame for {player_name}. Here's a preview:")
        print(df.head())

        # 6. Write the shots to a CSV file named after the player
        csv_filename = f"{filename_name}_shots.csv"
        df.to_csv(csv_filename, index=False)
        print(f"\nDataFrame successfully saved to '{csv_filename}'")

Successfully created DataFrame for Mohamed Salah. Here's a preview:
      id  minute       result      X      Y        xG         player h_a  \
0  15217      92  MissedShots  0.823  0.369  0.062365  Mohamed Salah   h   
1  20034      84    SavedShot  0.920  0.532  0.439938  Mohamed Salah   a   
2  46908      87  BlockedShot  0.886  0.566  0.096760  Mohamed Salah   h   
3  47192       0    SavedShot  0.964  0.688  0.049131  Mohamed Salah   a   
4  47197      16  MissedShots  0.900  0.677  0.060320  Mohamed Salah   a   

  player_id situation  season  shotType  match_id      h_team      a_team  \
0      1250  OpenPlay    2014  LeftFoot      4720     Chelsea     Swansea   
1      1250  OpenPlay    2014  LeftFoot      4560   Tottenham     Chelsea   
2      1250  OpenPlay    2014  LeftFoot      4929  Fiorentina    Atalanta   
3      1250  OpenPlay    2014  LeftFoot      4937    Sassuolo  Fiorentina   
4      1250  OpenPlay    2014  LeftFoot      4937    Sassuolo  Fiorentina   

  h_goals a_