In [None]:
from bs4 import BeautifulSoup
import re
import csv
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Links

In [None]:
# Sky Sports article whose HTML is scraped for the World Cup 2022 squad lists.
WC_SQUAD_LIST_URL = "https://www.skysports.com/football/news/12098/12741629/world-cup-2022-squad-lists-england-brazil-argentina-france-germany-spain-and-more"

In [None]:
class Player:
    """A squad member together with club details and per-player counters.

    Only the name is supplied at construction time. Text fields start as the
    placeholder string "Unknown" and numeric counters start at 0; both are
    meant to be filled in afterwards through the setter methods.
    """

    def __init__(self, name):
        """Create a player called ``name`` with placeholder values elsewhere."""
        placeholder = "Unknown"
        self.name = name
        self.country, self.position = placeholder, placeholder
        self.goals_scored, self.goals_assisted = 0, 0
        self.tackles, self.saves = 0, 0
        self.club_name, self.league, self.league_website = (
            placeholder,
            placeholder,
            placeholder,
        )

    # --- identity -----------------------------------------------------------

    def get_name(self):
        """Return the player's name."""
        return self.name

    def set_name(self, name):
        """Replace the player's name."""
        self.name = name

    def get_country(self):
        """Return the country the player is listed under."""
        return self.country

    def set_country(self, country):
        """Replace the player's country."""
        self.country = country

    def get_position(self):
        """Return the player's position value."""
        return self.position

    def set_position(self, position):
        """Replace the player's position value."""
        self.position = position

    # --- counters -----------------------------------------------------------

    def get_goals_scored(self):
        """Return the stored goals-scored counter."""
        return self.goals_scored

    def set_goals_scored(self, goals):
        """Replace the goals-scored counter."""
        self.goals_scored = goals

    def get_goals_assisted(self):
        """Return the stored assists counter."""
        return self.goals_assisted

    def set_goals_assisted(self, assists):
        """Replace the assists counter."""
        self.goals_assisted = assists

    def get_tackles_won(self):
        """Return the stored tackles counter."""
        return self.tackles

    def set_tackles_won(self, tackles):
        """Replace the tackles counter."""
        self.tackles = tackles

    def get_saves(self):
        """Return the stored saves counter."""
        return self.saves

    def set_saves(self, saves):
        """Replace the saves counter."""
        self.saves = saves

    # --- club / league ------------------------------------------------------

    def get_club_name(self):
        """Return the name of the player's club."""
        return self.club_name

    def set_club_name(self, club_name):
        """Replace the name of the player's club."""
        self.club_name = club_name

    def get_club_league(self):
        """Return the league recorded for the player's club."""
        return self.league

    def set_club_league(self, league):
        """Replace the league recorded for the player's club."""
        self.league = league

    def get_league_website(self):
        """Return the website recorded for the club's league."""
        return self.league_website

    def set_league_website(self, league_website):
        """Replace the website recorded for the club's league."""
        self.league_website = league_website

    def get_attributes(self):
        """Return all stored fields as a flat list.

        The order is: name, country, position, goals scored, goals assisted,
        tackles, saves, club name, league, league website.
        """
        ordered_fields = (
            "name",
            "country",
            "position",
            "goals_scored",
            "goals_assisted",
            "tackles",
            "saves",
            "club_name",
            "league",
            "league_website",
        )
        return [getattr(self, field) for field in ordered_fields]

In [None]:
class Country:
    """A national team: a name plus the list of players attached to it."""

    def __init__(self, name):
        """Create a country called ``name`` with an empty player list."""
        self.name = name
        self.players = []

    def get_name(self):
        """Return the country's name."""
        return self.name

    def get_players(self):
        """Return the internal list of players (not a copy)."""
        return self.players

    def add_player(self, player):
        """Append a single player to the squad."""
        self.players.append(player)

    def add_players(self, players):
        """Append every player from the iterable ``players`` to the squad."""
        self.players.extend(players)

    def remove_player(self, player):
        """Remove the first occurrence of ``player`` from the squad.

        Raises:
            ValueError: if ``player`` is not in the squad (``list.remove``).
        """
        # Must go through ``self``: a bare ``players`` is not defined in this
        # scope and would resolve to an unrelated module-level name, if any.
        self.players.remove(player)

    # The statistics below are placeholders that are not implemented yet.
    # They take ``self`` so they can be called on an instance; each one
    # currently returns None.

    def count_win_rate(self):
        """Placeholder; currently always returns None."""
        return None

    def count_clean_sheets(self):
        """Placeholder; currently always returns None."""
        return None

    def count_total_goals_scored(self):
        """Placeholder; currently always returns None."""
        return None

    def count_total_goals_conceeded(self):
        """Placeholder; currently always returns None."""
        return None

    def count_passes_completed(self):
        """Placeholder; currently always returns None."""
        return None

In [None]:
def create_players_from_list(player_list, position, country):
    """Build ``Player`` objects from raw ``"Name (Club"`` squad entries.

    Args:
        player_list: iterable of strings, each holding a player name followed
            by the club in parentheses (the closing ")" may be missing).
        position: position value stored on every created player.
        country: country value stored on every created player.

    Returns:
        A list of ``Player`` objects, one per entry that contains a "(".
        Entries without a "(" are printed as ``entry,position,country`` and
        skipped.
    """
    players = []
    for entry in player_list:
        raw_name, opening, remainder = entry.partition("(")
        if not opening:
            # No club part to parse: report the entry and move on, rather than
            # relying on a blanket ``except`` that would also hide real bugs.
            print(entry + "," + position + "," + country)
            continue

        # The club is the text between the first "(" and the next "(" (if
        # any), with closing parentheses removed.
        club_name = remainder.split("(")[0].replace(")", "").strip()

        player = Player(raw_name.strip())
        player.set_position(position)
        player.set_country(country)
        player.set_club_name(club_name)
        players.append(player)
    return players

In [None]:
def _squad_entries(paragraph, label, alternative=None):
    """Return the raw ``"Name (Club"`` entries listed after ``label:`` in a paragraph.

    ``alternative`` is another spelling of the heading used on the page; it is
    rewritten to ``label`` before matching.
    """
    text = paragraph.get_text()
    if alternative is not None:
        text = text.replace(alternative, label)
    # The trailing "." in the pattern consumes the last character of the line,
    # so it is dropped from the captured listing.
    listing = re.findall(label + ": " + "(.*)" + ".", text)[0]
    return listing.split("), ")


def find_players_from_url(squad_url, max_squads=32, timeout=30):
    """Scrape a squad-list page and return one ``Player`` per listed squad member.

    The page is expected to contain, for each squad, ``<strong>`` headings for
    goalkeepers, defenders, midfielders and forwards/strikers inside ``<p>``
    elements, with the country name in an ``<h3>`` that precedes the
    goalkeeper paragraph as a sibling. The n-th heading of each kind is
    treated as belonging to the n-th squad.

    Args:
        squad_url: URL of the page to scrape.
        max_squads: maximum number of squads to read (default 32).
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``Player`` objects. Failures are printed rather than raised:
        a failed download yields an empty list, and a squad that cannot be
        parsed is skipped without discarding the squads after it.
    """
    players = []
    try:
        response = requests.get(squad_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(e)
        return players

    soup = BeautifulSoup(response.content, "html.parser")

    # Scan the document once per heading kind instead of once per squad.
    def headings(pattern):
        return soup.find_all('strong', string = re.compile(pattern))[:max_squads]

    squads = zip(
        headings(r"Goalkeeper|Goalkepeepers"),
        headings(r"Defender"),
        headings(r"Midfielder"),
        headings(r"Forward|Striker"),
    )

    for gk_heading, df_heading, mf_heading, fw_heading in squads:
        try:
            gk_tag = gk_heading.find_parent("p")
            goalkeepers = _squad_entries(gk_tag, "Goalkeepers", "Goalkepeepers")
            defenders = _squad_entries(df_heading.find_parent("p"), "Defenders")
            midfielders = _squad_entries(mf_heading.find_parent("p"), "Midfielders")
            forwards = _squad_entries(fw_heading.find_parent("p"), "Forwards", "Strikers")
            country = gk_tag.find_previous_sibling("h3").get_text()
        except (AttributeError, IndexError) as e:
            # A missing <p>/<h3> (None) or a line that does not match the
            # expected "Label: ..." shape: report it and keep going.
            print(e)
            continue

        players.extend(create_players_from_list(goalkeepers, "GK", country))
        players.extend(create_players_from_list(defenders, "DF", country))
        players.extend(create_players_from_list(midfielders, "MF", country))
        players.extend(create_players_from_list(forwards, "FW", country))

    return players

In [None]:
def create_players_file(players_list, path='players.csv'):
    """Write the players to a CSV file with a header row.

    Args:
        players_list: iterable of objects exposing ``get_attributes()``, which
            must return the row values in header order.
        path: destination file; defaults to ``players.csv`` in the current
            working directory. An existing file is overwritten.
    """
    player_header = ["name", "country", "position", "goals", "assists", "tackles", "saves", "club", "league", "league_website"]
    # newline='' lets the csv module control line endings (otherwise Windows
    # gets a blank line after every row); an explicit UTF-8 encoding keeps
    # non-ASCII names writable regardless of the platform default.
    with open(path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(player_header)
        writer.writerows(player.get_attributes() for player in players_list)

In [None]:
# Scrape the squad-list page and collect the resulting Player objects
players = find_players_from_url(WC_SQUAD_LIST_URL)

In [None]:
# Write the scraped players to players.csv (header row plus one row per player)
create_players_file(players)

## Assessing the data

In [None]:
# Load the players CSV into a DataFrame for inspection
df = pd.read_csv('players.csv')

In [None]:
# Preview the first 10 rows
df.head(10)

In [None]:
# Get the number of players per country (pc_pc); the Series is indexed by country name
pc_pc = df["country"].value_counts()

In [None]:
# Get the number of distinct countries found in the scraped squad lists
countries = len(pc_pc.keys())
countries

In [None]:
# Get countries that have fewer than 23 players or more than 26 players (lt_gt).
# pd.concat replaces Series.append (removed in pandas 2.0), and both masks are
# applied to pc_pc itself; the undersized squads are listed first.
lt_gt = pd.concat([pc_pc[pc_pc < 23], pc_pc[pc_pc > 26]])
lt_gt

In [None]:
def find_league(club_name, timeout=30):
    """Look up a club's league from its English Wikipedia page.

    The page ``https://en.wikipedia.org/wiki/<club_name>`` is fetched and the
    text of the first ``<td>`` following a ``<th>`` whose text contains
    "League" is returned, with surrounding whitespace removed.

    Args:
        club_name: club name, used verbatim as the Wikipedia page title.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The league text, or "Unknown" if the page cannot be fetched or has no
        matching table row.
    """
    try:
        res = requests.get("https://en.wikipedia.org/wiki/" + club_name, timeout=timeout)
        soup = BeautifulSoup(res.content, "html.parser")
        league = soup.find('th', string = re.compile('League'))
        return league.find_next_siblings("td")[0].get_text().strip()
    except Exception:
        # Best-effort lookup: any failure maps to "Unknown". Catching
        # Exception rather than using a bare ``except`` lets KeyboardInterrupt
        # through, so a long scraping loop can still be interrupted.
        return "Unknown"

In [None]:
# Find the respective leagues of players' clubs
for index, player in enumerate(players):
    if df.at[index, "league"] == "Unknown":
        df.at[index, "league"] = find_league(player.get_club_name())

In [None]:
# Count how many players carry each league value
df["league"].value_counts()

In [None]:
def find_league_website(league, timeout=30):
    """Look up a league's website from its English Wikipedia page.

    The page ``https://en.wikipedia.org/wiki/<league>`` is fetched and the
    text of the first ``<td>`` following a ``<th>`` whose text contains
    "Website" is returned, with surrounding whitespace removed.

    Args:
        league: league name, used verbatim as the Wikipedia page title.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The website text, or "Unknown" if the page cannot be fetched or has no
        matching table row (the same fallback ``find_league`` uses).
    """
    try:
        res = requests.get("https://en.wikipedia.org/wiki/" + league, timeout=timeout)
        soup = BeautifulSoup(res.content, "html.parser")
        website = soup.find('th', string = re.compile('Website'))
        return website.find_next_siblings("td")[0].get_text().strip()
    except Exception:
        # A missing row makes ``website`` None (AttributeError) or leaves no
        # sibling cell (IndexError); treat those and network errors as unknown.
        return "Unknown"

In [None]:
# Find the respective websites of the different club leagues.
# The lookup is keyed on the league (not the club) and the result is stored in
# the "league_website" column, leaving the "league" column untouched. Each
# league is fetched once and reused for every player in that league.
website_by_league = {}
for index in df.index:
    league = df.at[index, "league"]
    # Skip rows that already have a website or whose league is still unknown.
    if df.at[index, "league_website"] != "Unknown" or league == "Unknown":
        continue
    if league not in website_by_league:
        website_by_league[league] = find_league_website(league)
    df.at[index, "league_website"] = website_by_league[league]

df["league_website"].value_counts()

## Challenges

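- Internet connectivity: the notebook downloads the squad-list page and then makes a separate Wikipedia request for each league and website lookup, so a slow or dropped connection stalls or interrupts the run.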