# Retrieve the stats of all the players
using the `player_urls.csv` file

In [33]:
import pandas as pd
import requests
import re

In [34]:
# Browser-like request headers passed to the requests.get calls in this notebook.
_ACCEPT_TYPES = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
_BROWSER_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
headers = {"Accept": _ACCEPT_TYPES, "User-Agent": _BROWSER_USER_AGENT}

In [35]:
# Load the player list from disk.
raw_player_urls = pd.read_csv('../data/player_urls.csv')
# Discard the first column (presumably an index saved with the CSV - verify)
# and keep every remaining column.
leading_column = raw_player_urls.columns[0]
player_urls = raw_player_urls.drop(columns=[leading_column])
# Show the resulting dataframe as the cell output.
player_urls

#### Add overall stats and image to the player_urls dataframe

In [36]:
# Regex used on the fetched player page: matches text of the form
# `var hand = 'X';` and captures the single word character X.
hand_pattern =  r"var hand = \'(\w)\';"

### Get the hand of the player

In [37]:
# Fetch one player's page and extract the hand value embedded in its text.
url = "https://www.tennisabstract.com/cgi-bin/player.cgi?p=StefanosTsitsipas"
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, headers=headers, timeout=10)
match = re.search(hand_pattern, response.text)
if match is None:
    print("Hand value not found.")
else:
    # Group 1 is the single character captured by hand_pattern.
    hand_value = match.group(1)
    print("Hand:", hand_value)

### Get the image of the player

In [38]:
img = "https://www.tennisabstract.com/photos/stefanos_tsitsipas-sirobi.jpg"
# Check whether the player's image URL can be fetched.
# `headers` must be passed by keyword: the second positional argument of
# requests.get is `params`, so passing it positionally would send the header
# values as query-string parameters instead of as HTTP headers.
response = requests.get(img, headers=headers, timeout=10)
# Test membership explicitly; `status_code == 200 or 304` is always truthy
# because the bare constant 304 is evaluated on its own.
if response.status_code in (200, 304):
    print("Image found")
else:
    print("Image not found")

### Example for 1 player

In [39]:
from bs4 import BeautifulSoup

# Download one player's fragment and collect the <tr> rows of its
# `recent-results` table into `rows` for the next cell.
url = "https://www.minorleaguesplits.com/tennisabstract/cgi-bin/frags/CasperRuud.js"
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')
# Locate the table with id recent-results and its body.
table = soup.find('table', {'id': 'recent-results'})
table_body = table.find('tbody') if table is not None else None
if table_body is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError(f"No 'recent-results' table body found at {url}")
rows = table_body.find_all('tr')

In [40]:
# Build a dataframe of one player's matches from the scraped `rows`.

# Meaning of the first 16 <td> cells of a result row, in page order.
# Dominance Ratio: % of return points won divided by % of serve points lost.
ruud_cell_columns = [
    'Date', 'Tournament', 'Surface', 'Round in Tournament',
    'Ranking at that time', 'Opponent Ranking at that time', 'Result',
    'Set Scores', 'Dominance Ratio', 'Ace Ratio', 'Double Fault Ratio',
    'First Serve Percentage', 'First Serve Points Won',
    'Second Serve Points Won', 'Break Points Saved', 'Time',
]
# Output column order: Winner and Loser are inserted right after Result.
ruud_columns = ruud_cell_columns[:7] + ['Winner', 'Loser'] + ruud_cell_columns[7:]

# Collect plain dicts and build the dataframe once at the end; calling
# pd.concat for every row copies the whole frame each time (quadratic).
ruud_records = []
for row in rows:
    cell_texts = [cell.text.strip() for cell in row.find_all('td')]
    # Rows without the 16 expected cells cannot be mapped to the columns.
    if len(cell_texts) < len(ruud_cell_columns):
        continue
    record = dict(zip(ruud_cell_columns, cell_texts))
    # The result reads "<winner> d. <loser>"; split only on the first "d."
    # so the loser text is kept whole.
    result_parts = record['Result'].split("d.", 1)
    if len(result_parts) < 2:
        # No "d." separator (e.g. a match that has not been played yet):
        # winner and loser cannot be determined, so the row is left out.
        continue
    # Remove everything between [] and (), plus all whitespace.
    record['Winner'] = re.sub(r'\[.*?\]|\(.*?\)|\s', '', result_parts[0])
    record['Loser'] = re.sub(r'\[.*?\]|\(.*?\)|\s', '', result_parts[1])
    ruud_records.append(record)

ruud_df = pd.DataFrame(ruud_records, columns=ruud_columns)

ruud_df

### Get the stats for all the players

In [41]:
def clean_player_name(name):
    """Return *name* without bracketed tags, parenthesised tags or whitespace.

    Every ``[...]`` and ``(...)`` group is removed, and so is every whitespace
    character (including the spaces between name parts), so
    ``"(1)Casper Ruud [NOR]"`` becomes ``"CasperRuud"``.
    """
    stripped_of_tags_and_spaces = re.sub(r'\[.*?\]|\(.*?\)|\s', '', name)
    return stripped_of_tags_and_spaces.strip()

In [42]:
# Scrape the recent-results table of every player and split the rows into
# played matches (all_matches_df) and rows without set scores (upcoming_df).
BASE_URL = "https://www.minorleaguesplits.com/tennisabstract/cgi-bin/frags/"
MAX_PLAYERS = 500  # only the first 500 rows of player_urls are scraped
REQUEST_TIMEOUT = 10  # seconds; keeps one dead connection from stalling the run

# Meaning of the first 16 <td> cells of a result row, in page order.
# Dominance Ratio: % of return points won divided by % of serve points lost.
MATCH_CELL_COLUMNS = [
    'Date', 'Tournament', 'Surface', 'Round in Tournament',
    'Ranking at that time', 'Opponent Ranking at that time', 'Result',
    'Set Scores', 'Dominance Ratio', 'Ace Ratio', 'Double Fault Ratio',
    'First Serve Percentage', 'First Serve Points Won',
    'Second Serve Points Won', 'Break Points Saved', 'Time',
]
# Output column order: Winner and Loser are inserted right after Result.
MATCH_COLUMNS = MATCH_CELL_COLUMNS[:7] + ['Winner', 'Loser'] + MATCH_CELL_COLUMNS[7:]
UPCOMING_COLUMNS = ['Date', 'Tournament', 'Surface', 'Results']

# Collect plain dicts and build each dataframe once at the end; calling
# pd.concat for every row copies the whole frame each time (quadratic).
upcoming_records = []
match_records = []

# head() stops the iteration at the limit instead of looping over (and
# printing) every remaining player without doing any work.
for x, (_, player) in enumerate(player_urls.head(MAX_PLAYERS).iterrows()):
    print(f"Player {x}")
    # The fragment URL uses the player's name with all whitespace removed.
    player_name = re.sub(r'\s', '', player['name'])
    url = BASE_URL + player_name + ".js"
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        # One unreachable page should not abort the whole scrape.
        print(f"Skipping {player_name}: request failed ({error})")
        continue
    soup = BeautifulSoup(response.text, 'html.parser')
    # Locate the table with id recent-results and its body.
    table = soup.find('table', {'id': 'recent-results'})
    table_body = table.find('tbody') if table is not None else None
    if table_body is None:
        print(f"Skipping {player_name}: no recent-results table found")
        continue

    for row in table_body.find_all('tr'):
        cell_texts = [cell.text.strip() for cell in row.find_all('td')]
        # Date, tournament, surface, result and set scores need 8 cells.
        if len(cell_texts) < 8:
            continue
        result = cell_texts[6]
        set_scores = cell_texts[7]
        # A row without set scores is recorded as an upcoming match.
        if set_scores == "":
            upcoming_records.append({
                'Date': cell_texts[0],
                'Tournament': cell_texts[1],
                'Surface': cell_texts[2],
                'Results': result,
            })
            continue
        # Played matches need all 16 cells to fill the stat columns.
        if len(cell_texts) < len(MATCH_CELL_COLUMNS):
            continue
        # The result reads "<winner> d. <loser>"; split only on the first
        # "d." so the loser text is kept whole.
        result_parts = result.split("d.", 1)
        if len(result_parts) < 2:
            print(f"Skipping row for {player_name}: cannot parse result {result!r}")
            continue
        record = dict(zip(MATCH_CELL_COLUMNS, cell_texts))
        record['Winner'] = clean_player_name(result_parts[0])
        record['Loser'] = clean_player_name(result_parts[1])
        match_records.append(record)

upcoming_df = pd.DataFrame(upcoming_records, columns=UPCOMING_COLUMNS)
all_matches_df = pd.DataFrame(match_records, columns=MATCH_COLUMNS)
all_matches_df

In [43]:
# Write every scraped match row to the raw CSV. With to_csv's default
# settings the dataframe index is written as the first column.
all_matches_df.to_csv("../data/all_matches_data_raw.csv")

In [44]:
# Display the combined matches dataframe as the cell output.
all_matches_df

In [45]:
# Number of match rows collected across all scraped players.
len(all_matches_df)

In [46]:
# Display the rows that had no set scores (recorded as upcoming matches).
upcoming_df

In [47]:
# Write the upcoming-match rows to CSV. With to_csv's default settings the
# dataframe index is written as the first column.
upcoming_df.to_csv("../data/upcoming_matches_info.csv")