# Player Data Analysis from Transfermarkt
This notebook constructs a graph of players and their most played teammates using data from Transfermarkt. 
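The aim is to analyze relationships between players using graph analysis techniques, uncovering key insights such as how many connections each player has and the shortest chain of shared teammates that links any two players.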

## 1. Importing Necessary Libraries

In this section, we import the libraries required for:
- **Data manipulation** (`pandas`)
- **Graph construction and analysis** (`networkx`)
- **Interactive visualization** (`plotly`)
- **Web scraping** (`selenium` and `BeautifulSoup`)
- **Utility Libraries** (`time` and `re`)

In [21]:
# 1. Data Manipulation
import pandas as pd  # For handling and analyzing structured data

# 2. Graph Construction
import networkx as nx  # For creating and analyzing network graphs

# 3. Data Visualization
import plotly.graph_objs as go  # For interactive graph visualization

# 4. Web Scraping
from selenium import webdriver  # Automates browser interaction
from selenium.webdriver.chrome.service import Service  # For managing the Chrome service
from bs4 import BeautifulSoup  # For parsing HTML data

# 5. Utility Libraries
import time  # For adding delays in scraping
import re  # For regular expressions, used to clean and extract data


## 2. Setting Up WebDriver Service

In this section, we set up the Chrome WebDriver using Selenium. This allows us to interact with web pages for scraping data from Transfermarkt.
Steps:
1. **Configure Chrome Options**: Set any preferences such as headless mode if needed.
2. **Specify the WebDriver Path**: Provide the path to the Chrome WebDriver executable.
3. **Initialize the WebDriver**: Launch Chrome with the configured options.

In [22]:
# 2. Setting up the WebDriver
# The names defined in this cell are module-level; `wd` is the browser handle
# used by every scraping cell further down.

# Configuring Chrome options (e.g., headless mode can be set here if needed).
# No option is set here, so Chrome starts with its defaults.
chrome_options = webdriver.chrome.options.Options()

# Path to the Chrome WebDriver executable
# NOTE(review): machine-specific absolute Windows path; adjust it before running elsewhere.
chrome_driver = "C:\\Users\\Lisandra\\Documents\\webdriver\\chromedriver-win64\\chromedriver.exe"

# Initializing the WebDriver service with the executable path
service_to_pass = Service(executable_path=chrome_driver)

# Launching the WebDriver with the service and options
wd = webdriver.Chrome(service=service_to_pass, options=chrome_options)


In [23]:
# Open the Transfermarkt ranking page that the scraping cells below start from
# (the "most valuable players" listing parsed by scrape_page).
url = 'https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop?land_id=0&ausrichtung=alle&spielerposition_id=alle&altersklasse=alle&jahrgang=0&kontinent_id=0&plus=1'
wd.get(url)

In [24]:
def scrape_page(html):
    """
    Collect the name and profile link of every player listed on a page of most valuable players.

    Each player found is appended to the module-level ``players_data`` list as a
    dict with the keys ``'Name'`` and ``'Player Page'``. Only these two fields
    are collected; other columns of the row (rank, age, club, market value)
    are ignored.

    Args:
        html (str): the page source that will be scraped.

    Returns:
        None.
    """
    parsed = BeautifulSoup(html, 'html.parser')

    # Player data lives in the table rows carrying the "odd" or "even" class
    for row in parsed.find_all('tr', class_=['odd', 'even']):
        # The second cell holds the player's picture; its alt text is the name
        cells = row.find_all('td')
        portrait = cells[1].find('img')
        name = portrait.get('alt')

        # The "hauptlink" cell wraps the site-relative link to the profile
        anchor = row.find('td', class_='hauptlink').find('a')
        profile_url = 'https://www.transfermarkt.com' + anchor['href']

        players_data.append({'Name': name, 'Player Page': profile_url})

In [25]:
def scrape_teammates(html):
    """
    Extract the teammates listed on a player's "most played teammates" page.

    Every teammate found is also appended to the module-level ``players_data``
    list as a dict with the keys ``'Name'`` and ``'Player Page'``.

    Args:
        html (str): the page source that will be scraped.

    Returns:
        DataFrame: one row per teammate found on the page, with the columns
        ``'Name'`` and ``'Player Page'``. It has no rows when the page holds no
        teammates table or when no row could be parsed.
    """
    columns = ['Name', 'Player Page']
    soup = BeautifulSoup(html, 'html.parser')

    # Find the table that contains the teammate data
    teammate_table = soup.find('table', {'class': 'items'})
    if teammate_table is None:
        # Report the missing table instead of failing with an opaque AttributeError
        print("No teammates table was found on the page")
        return pd.DataFrame(columns=columns)

    teamates_vet = []
    # Skip the header row, then read every third row.
    # NOTE(review): presumably each teammate spans three <tr> elements
    # (a nested table inside the row) - confirm against the page markup.
    for tm in teammate_table.find_all('tr')[1::3]:
        try:
            teammate_name = tm.find('img').get('alt')
            player_link = 'https://www.transfermarkt.com' + tm.find('td', class_='hauptlink').find('a')['href']
        except (AttributeError, TypeError, KeyError):
            # A missing tag gives None (AttributeError/TypeError); a missing href gives KeyError
            print("There was a problem gathering data from the row below")
            print(tm)
            continue

        # Two separate dicts, so the global list and the returned frame share no state
        players_data.append({
            'Name': teammate_name,
            'Player Page': player_link,
        })
        teamates_vet.append({
            'Name': teammate_name,
            'Player Page': player_link,
        })

    return pd.DataFrame(teamates_vet, columns=columns)
    
    

In [26]:
def change_page(html):
    """
    Find the address of the next page of a paginated Transfermarkt listing.

    Args:
        html (str): the page source that will be inspected.

    Returns:
        str: the absolute URL that the "next page" pagination button links to.

    Raises:
        AttributeError: if the page has no "next page" pagination item.
        TypeError: if that item contains no link.
    """
    next_item = BeautifulSoup(html, 'html.parser').find(
        'li',
        class_='tm-pagination__list-item tm-pagination__list-item--icon-next-page',
    )
    # The href is site-relative, so the domain is prepended
    relative_href = next_item.find('a')['href']
    return 'https://www.transfermarkt.com' + relative_href

In [27]:
players_data = []
# Pagination: continue scraping ranking pages until at least 250 players are gathered
players_count = 0

while players_count < 250:
    time.sleep(2)  # Wait for page to fully load

    # Scrape data from the current page, then move to the next one
    try:
        scrape_page(wd.page_source)
        players_count = len(players_data)

        url = change_page(wd.page_source)
        wd.get(url)
    except Exception as error:
        # `Exception` (not a bare except) so Ctrl+C can still interrupt the crawl;
        # the reason is printed so a parsing bug is not mistaken for "no more pages"
        print('There was a problem accessing the data or no more players were found')
        print(f'Reason: {error!r}')
        break


In [28]:
# Number of player records collected so far
len(players_data)

250

In [29]:
# Tabulate the scraped records: one row per player, one column per dict key
df_players = pd.DataFrame(players_data)

In [30]:
# Display the table of scraped players (columns 'Name' and 'Player Page')
df_players

Unnamed: 0,Name,Player Page
0,Jude Bellingham,https://www.transfermarkt.com/jude-bellingham/...
1,Erling Haaland,https://www.transfermarkt.com/erling-haaland/p...
2,Vinicius Junior,https://www.transfermarkt.com/vinicius-junior/...
3,Kylian Mbappé,https://www.transfermarkt.com/kylian-mbappe/pr...
4,Phil Foden,https://www.transfermarkt.com/phil-foden/profi...
...,...,...
245,Ilya Zabarnyi,https://www.transfermarkt.com/ilya-zabarnyi/pr...
246,Max Kilman,https://www.transfermarkt.com/max-kilman/profi...
247,Pedro Gonçalves,https://www.transfermarkt.com/pedro-goncalves/...
248,Lutsharel Geertruida,https://www.transfermarkt.com/lutsharel-geertr...


In [31]:
def teamates_page(df_row):
    """
    Build the URL of a player's "most played teammates" page from the profile URL.

    Args:
        df_row (Series or dict): a record whose 'Player Page' entry is the
            player's profile URL.

    Returns:
        str: the profile URL with its '/profil/' path segment replaced by
        '/gemeinsameSpiele/'.
    """
    # Replace the path segment only: the bare text "profil" can also appear
    # inside a player's name slug, which must stay untouched
    return df_row['Player Page'].replace("/profil/", "/gemeinsameSpiele/")

In [32]:
# Add a column with each player's "most played teammates" URL, computed row by row with teamates_page
df_players['Teammates Page'] = df_players.apply(teamates_page, axis=1)

In [33]:
def add_to_graph(player, teammates):
    """
    Add a player, the player's teammates and the links between them to the global graph ``G``.

    Nodes are keyed by the 'Player Page' URL and carry the whole record in
    their ``data`` attribute. Nodes and edges that already exist are left
    untouched.

    Args:
        player (Series or dict): record of the player, with a 'Player Page' entry.
        teammates (DataFrame): one row per teammate, with a 'Player Page' column.

    Returns:
        None.
    """
    source = player['Player Page']
    if source not in G:
        G.add_node(source, data=player)

    for _, mate in teammates.iterrows():
        target = mate['Player Page']
        if target not in G:
            G.add_node(target, data=mate)

        # Look the edge up in both directions before adding it
        already_linked = G.has_edge(source, target) or G.has_edge(target, source)
        if not already_linked:
            G.add_edge(source, target)

In [34]:
# Build the teammates graph: one node per player page, one edge per
# (player, teammate) pair found on the player's "most played teammates" pages.
MAX_TEAMMATES = 25  # Teammates to collect for each player

G = nx.Graph()
for index, row in df_players.iterrows():
    wd.get(row['Teammates Page'])
    # 'Teammates Page' is only needed for navigation; keep it out of the node data
    player = row.drop(labels='Teammates Page')

    pages = []
    teammate_count = 0
    while teammate_count < MAX_TEAMMATES:
        try:
            page_teammates = scrape_teammates(wd.page_source)
        except Exception as error:
            # One unreadable page must not abort the crawl of the remaining players
            print(f"Could not read the teammates of {player['Name']}: {error!r}")
            break
        pages.append(page_teammates)
        teammate_count += len(page_teammates)
        if teammate_count >= MAX_TEAMMATES:
            break  # Enough teammates: no need to load another page

        try:
            wd.get(change_page(wd.page_source))
        except Exception:
            print('There was a problem accessing the data or no more players were found')
            break

    if pages:
        # Fresh 0..n-1 index (no duplicated labels), capped at the requested amount
        teammates = pd.concat(pages, ignore_index=True).head(MAX_TEAMMATES)
    else:
        teammates = pd.DataFrame(columns=['Name', 'Player Page'])

    print(teammates)
    print(player)
    add_to_graph(player, teammates)

                Name                                        Player Page
0       Mats Hummels  https://www.transfermarkt.com/mats-hummels/pro...
1         Marco Reus  https://www.transfermarkt.com/marco-reus/profi...
2      Julian Brandt  https://www.transfermarkt.com/julian-brandt/pr...
3  Raphaël Guerreiro  https://www.transfermarkt.com/raphael-guerreir...
4           Emre Can  https://www.transfermarkt.com/emre-can/profil/...
Name                                             Jude Bellingham
Player Page    https://www.transfermarkt.com/jude-bellingham/...
Name: 0, dtype: object
             Name                                        Player Page
0   Manuel Akanji  https://www.transfermarkt.com/manuel-akanji/pr...
1           Rodri  https://www.transfermarkt.com/rodri/profil/spi...
2  Bernardo Silva  https://www.transfermarkt.com/bernardo-silva/p...
3         Ederson  https://www.transfermarkt.com/ederson/profil/s...
4      Phil Foden  https://www.transfermarkt.com/phil-foden/profi...
N

In [71]:
# Rebuild the table from every record gathered so far (ranking players and
# scraped teammates) and keep a single row per distinct record
df_players = pd.DataFrame(players_data).drop_duplicates()

In [73]:
# Save the player table to 'out.csv' in the working directory, without the index column
df_players.to_csv('out.csv', index=False)  

In [39]:
def extract_player_name(url):
    """
    Turn a Transfermarkt player URL into a readable player name.

    The first path segment after ``transfermarkt.com/`` is taken as the name:
    its hyphens become spaces and every word is capitalized.

    Args:
        url (str): the player's URL.

    Returns:
        str: the formatted name, or ``url`` unchanged when it does not look
        like a Transfermarkt player URL.
    """
    found = re.search(r'transfermarkt\.com/([^/]+)/', url)
    if found is None:
        return url

    words = found.group(1).split('-')
    return ' '.join(words).title()

In [68]:
# Total number of records gathered (ranking players plus every scraped teammate, duplicates included)
len(players_data)

6500

In [40]:
# Lay the graph out in 2D; pos maps every node to its (x, y) coordinates
pos = nx.spring_layout(G)

# Edge coordinates: every edge adds its two endpoints followed by None,
# which separates consecutive edges in the line trace
edge_x = []
edge_y = []
for source, target in G.edges():
    x0, y0 = pos[source]
    x1, y1 = pos[target]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

# Create the edge trace (lines)
edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=1, color='black'),
    hoverinfo='none',
    mode='lines')

# Node coordinates, in the graph's node order
node_x = [pos[node][0] for node in G.nodes()]
node_y = [pos[node][1] for node in G.nodes()]

# Number of connections of every node (drives the colour) and the matching hover text
node_adjacencies = []
node_text = []
for node, adjacencies in G.adjacency():
    num_connections = len(adjacencies)
    node_adjacencies.append(num_connections)
    node_text.append(f"Node {extract_player_name(node)} has {num_connections} connections")

# Create the node trace (points)
node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    hoverinfo='text',
    text=node_text,
    marker=dict(
        showscale=True,
        colorscale='viridis',
        size=10,
        color=node_adjacencies,
        colorbar=dict(
            thickness=15,
            # Nested title dict replaces the legacy `titleside` attribute
            title=dict(text='Node Connections', side='right'),
            xanchor='left',
        ),
    ))

# Create the plot
fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    # Nested title dict replaces the legacy `titlefont_size` attribute
                    title=dict(text='Interactive Network Graph', font=dict(size=16)),
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=0, l=0, r=0, t=40),
                    annotations=[dict(
                        text="Network visualization using NetworkX and Plotly",
                        showarrow=False,
                        xref="paper", yref="paper",
                        x=0.005, y=-0.002)],
                    xaxis=dict(showgrid=False, zeroline=False),
                    yaxis=dict(showgrid=False, zeroline=False))
                )

fig.show()


In [49]:
def name_to_url(player_name):
    """
    Build the beginning of a player's Transfermarkt profile URL from the player's name.

    The name is lowercased, its accents are removed (the URL slugs are
    accent-free, e.g. "Kylian Mbappé" -> "kylian-mbappe") and its words are
    joined with hyphens; leading, trailing and repeated whitespace is ignored.
    The result is a URL prefix: callers test whether it is contained in a
    full profile URL.

    Args:
        player_name (str): the player's name as typed by the user.

    Returns:
        str: ``https://www.transfermarkt.com/<slug>/profil/spieler/``.
    """
    import unicodedata  # Local import: only this helper needs it

    # Split accented letters into base letter + combining mark, then drop the marks.
    # Letters without such a decomposition (e.g. "ø") are kept as they are.
    decomposed = unicodedata.normalize("NFKD", player_name)
    unaccented = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Lowercase and join the words with hyphens
    formatted_name = "-".join(unaccented.lower().split())

    # Construct the URL
    return f"https://www.transfermarkt.com/{formatted_name}/profil/spieler/"

In [59]:
def check_players_in_graph(graph, player1, player2):
    """
    Tell whether both players have a node in the graph, reporting whichever is missing.

    A player counts as present when the URL built by ``name_to_url`` is
    contained in at least one node of the graph.

    Args:
        graph: graph whose nodes are player page URLs.
        player1 (str): name of the first player.
        player2 (str): name of the second player.

    Returns:
        bool: True when both players are found; False otherwise, after
        printing which player(s) could not be found.
    """
    def _in_graph(player_name):
        return any(name_to_url(player_name) in node for node in graph.nodes)

    first_found = _in_graph(player1)
    second_found = _in_graph(player2)

    if first_found and second_found:
        return True

    if first_found:
        print(f"Player '{player1}' exists, but player '{player2}' does not exist in the graph.")
    elif second_found:
        print(f"Player '{player2}' exists, but player '{player1}' does not exist in the graph.")
    else:
        print(f"Neither player '{player1}' nor player '{player2}' exists in the graph.")
    return False

In [65]:
# Ask for two players and show the shortest chain of teammates that connects them
player1 = input("Enter the first player's name: ")
player2 = input("Enter the second player's name: ")
if check_players_in_graph(G, player1, player2):
    # check_players_in_graph found a matching node for each name, so next() cannot be exhausted
    source = next(node for node in G.nodes if name_to_url(player1) in node)
    target = next(node for node in G.nodes if name_to_url(player2) in node)
    try:
        path = nx.shortest_path(G, source=source, target=target)
    except nx.NetworkXNoPath:
        # Only "no route" is expected here; any other error should surface
        print(f'There is no known path between {player1} and {player2}')
    else:
        # len(path) counts the players on the path; the connections between them are one fewer
        print(f"The path links {len(path)} players ({len(path) - 1} connections) and it is formed by the following players")
        print(" - ".join(extract_player_name(node) for node in path))

The path has a 7 length and it is formed by the following players
Lukasz Fabianski - Declan Rice - Jarrod Bowen - Lucas Paqueta - Everton Ribeiro - Joao Gomes - Matheuzinho - 

In [66]:
# Close the browser and end the WebDriver session now that scraping is finished
wd.quit()