# Table of contents

- [Requests](#Requests)
- [Multi Requests](#Multi-Requests)
- [Multi Threading](#MultiThread)
- [Asynchronus](#Asynchronus)

### Global Variables

In [78]:
# Variables for functions
# Prefix for offset pages, where each offset contains 60 players
BASE_URL = "https://sofifa.com/?offset="
# Prefix for each player's individual link
P_URL = "https://sofifa.com/"
# Number for offsets to reach (large to 20000 is good(
N = 180
# List to store all offset urls
OFFSET_URLS = []
# List to store all player urls
PLAYERS_URLS = []
# List to store bloom filter for check duplicate
from bloom_filter import BloomFilter
BLOOM = BloomFilter(max_elements=25000, error_rate=0.1)

# Variables to store scraped data

# Directories to store output files
OUTDIR = "data/raw"
FILENAME = "fifa22_players"

In [79]:
# Generate offset links
for i in range(0, N, 60):
    OFFSET_URLS.append(BASE_URL + str(i))


def listing():
    listing = []
    ref = '//*[@id="body"]/div[1]/div/div[2]/div/table/tbody/tr[i]/td[2]/a[1]/@href'
    old = ref.split(sep="/")[9]
    for i in range(60):
        new = f"tr[{i+1}]"
        p = ref.replace(old, new)
        listing.append(p)
    return listing



# Generate player links from offsets
def parse_offset(url):
    res = requests.get(url)
    doc = lx.fromstring(res.content)
    out = listing()
    for path in out:
        href = doc.xpath(path)[0]
        # This checks the version (2 digits) of the player 
        ver = href.split(sep="/")[4][0:2]
        # If the player is from older version that isnt 22 then ignore
        if ver != "22":
            continue
        p_url = P_URL + href
        # Checks if this player if already contains in our bloom set to check duplicate
        if BLOOM.__contains__(p_url):
            # print(f"This url is duplicated {p_url}")
            continue
        PLAYERS_URLS.append(p_url)
        BLOOM.add(p_url)


## Requests

### Imports

In [11]:
import requests
from bs4 import BeautifulSoup
import lxml.html as lx
import time
import pandas as pd

### Functions

#### BS4

In [28]:
rbs4_players_scraped = []


def get_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.find("tbody", {"class": "list"})


def get_players(trs):
    out = []
    for tr in trs:
        try:
            base = "https://sofifa.com/"
            name = tr.select('td.col-name')
            attr = "?attr=classic"
            p_url = name[0].find("a").get("href")
            a, b, c, d, v = p_url.split("/", 4)
            version = v[0:2]
            if version != "22":
                continue
            link = base + p_url + attr
            out.append(extract_info(tr, link))
        except Exception as e:
            # print(f"error parsing link, check!")
            raise e
    return out


def extract_info(tr, link):
    name = tr.select('td.col-name')
    return {
        "name": name[0].find("a").get("aria-label"),
        "country": name[0].find("img").get("title"),
        "age": tr.select('td.col.col-ae')[0].text.strip(),
        "overall": tr.select('td.col.col-oa')[0].text.strip(),
        "potential": tr.select('td.col.col-pt')[0].text.strip(),
        "club": name[1].find("a").text,
        "best_position": name[0].find("span").text,
        "value": tr.select('td.col.col-vl')[0].text.strip(),
        "wage": tr.select('td.col.col-wg')[0].text.strip(),
        "total_stats": tr.select('td.col.col-tt')[0].text.strip(),
    }


def rbs4_scrap(urls):
    for url in urls:
        tbody = get_page(url)
        trs = tbody.findAll("tr")
        rbs4_players_scraped.append(get_players(trs))

#### LXML

In [68]:
rlxml_players_scraped = []


def rlxml_scrap(urls):
    for url in urls:
        rlxml_players_scraped.append(parse_one(url))


def parse_one(link):
    res = requests.get(link)
    tree = lx.fromstring(res.content)

    # Basic information of the player div class col=col-12
    basics = tree.xpath('//*[@id="body"]/div[2]/div/div[2]')[0].getchildren()

    # Bp3 card player, 1st child is info, 2nd child is card spacing
    generic = basics[0].getchildren()[1:]

    # List that contains full name, position, age, dob, height(cm), weight(kg)
    information = generic[0].text_content().split(sep="\n")
    # Dict to store generic infos
    info_dict = parse_info(information)

    # Profile section

    # Use list comprehension to exlude empty strings created by splitting
    # Exclude first element (which is title of the div component)
    profile = [x for x in basics[1].text_content().split(sep="\n") if x][1:]

    profile_dict = parse_profile(profile)

    # Ratings section
    lineup = tree.xpath('//*[@id="body"]/div[2]/div/div[1]/div/div[1]/div')[0].getchildren()

    ratings_dict = parse_ratings(lineup)

    # Attributes section
    all_blocks = tree.xpath('//div[@class="block-quarter"]')[8:][:-1]
    attributes_dict = parse_attributes(all_blocks)
    return info_dict | profile_dict | ratings_dict | attributes_dict

##### Parsers

In [8]:
# Function to parse info
def parse_info(par):
    # Assigning processed variable
    first_name = par[0].split(sep=" ")[0]
    last_name = par[0].split(sep=" ")[-1]
    # This refers to height and weight
    measures = par[1].split(sep=" ")[1:]

    # Storing data
    result_dict = {}
    result_dict["full_name"] = par[0]
    result_dict["first_name"] = first_name
    result_dict["last_name"] = last_name

    # Helper function called to return height and weight
    result_dict["height"], result_dict["weight"] = parse_hw(measures[-2:])
    result_dict["age"] = measures[-6][:2]
    # These are players that play 3 positions
    if len(measures) == 9:
        # print("Plays 3 positions")
        result_dict["position"] = ",".join(measures[:3])
    # These are players that play 2 positions
    elif len(measures) == 8:
        # print("Plays 2 positions")
        result_dict["position"] = ",".join(measures[:2])
    # These are players that play 1 position only
    else:
        result_dict["position"] = measures[0]
    return result_dict


# Function to parse profile
def parse_profile(par):
    p_dict = {}
    for c in par:
        if c[:9] in "Preferred Foot":
            p_dict[c[:14]] = c[14:]
        elif c[:9] in 'Work Rate':
            p_dict[c[:9]] = c[9:].replace(" ", "")
        elif c[:4] in 'Body Type':
            continue
        else:
            p_dict[c[3:]] = c[0]
    return p_dict

In [9]:
# all_blocks = tree.xpath('//div[contains(@class, "block-quarter")]')
# Returns all div elements that contains different stats for attributes
# Last one is Special Traits(excluded) could be added later
def parse_attributes(all_blocks):
    stat_dict = {}
    for block in all_blocks:
        b = [x for x in block.text_content().split(sep="\n") if x][1:]
        for stat in b:
            if any([s in stat for s in ["GK Diving", "GK Handling", "GK Kicking"]]):
                (attr, r) = stat[2:], stat[:2]
                stat_dict[attr] = r
            else:
                (attr, r) = stat[3:], stat[:2]
                stat_dict[attr] = r

    return stat_dict

In [2]:
def parse_hw(hw):
    height = hw[0][:3]
    weight = hw[1][:2]
    return height, weight

In [1]:
# This is div class lineup
def parse_ratings(lineup):
    all_ratings = {}
    # for each grid (represents 3 - 5 positions next to each other)
    # Note some grid contains empty ("\xa0"), will be skipeed
    for grid in lineup:
        # As mentioned each grid contains 3 - 5 ratings for positions
        for rating in grid.getchildren():
            # stored to variable to check duplicate
            texts = rating.text_content()
            if texts in "\xa0":
                continue
            # Except last 4 are the position names (3 - 6 characters long)
            # Last 4 are the ratings 
            (pos, oa) = texts[:-4], texts[-4:]
            # Append each position and overall rating as key value pair
            # and store it to the dictionary
            all_ratings[pos] = oa
    return all_ratings

#### Running

##### bs4

In [30]:
# BS4
print("#" * 20)
# Starts timer
t1 = time.time()
rbs4_scrap(OFFSET_URLS)
df_rbs4 = pd.DataFrame(rbs4_players_scraped[0])
print(f"Time taken for normal requests plus BS4: {time.time() - t1}")
df_rbs4.head()

####################
Time taken for normal requests plus BS4: 2.424346923828125


Unnamed: 0,name,country,age,overall,potential,club,best_position,value,wage,total_stats
0,Vítor Machado Ferreira,Portugal,21,78,88,FC Porto,CM,€31.5M,€12K,1999
1,Alen Halilović,Croatia,25,69,72,Reading,RM,€1.8M,€13K,1728
2,Isaak Touré,France,18,63,79,Le Havre AC,CB,€1.1M,€550,1438
3,Noa Lang,Netherlands,22,78,85,Club Brugge KV,LW,€28.5M,€27K,2012
4,Chidera Ejuke,Nigeria,23,76,81,PFC CSKA Moscow,LM,€11.5M,€35K,1905


##### lxml

In [80]:
# lxml
print("#" * 20)
# Starts timer
t2 = time.time()
for url in OFFSET_URLS:
    parse_offset(url)
rlxml_scrap(PLAYERS_URLS)
df_rlxml = pd.DataFrame(rlxml_players_scraped)
print(f"Time taken for normal requests plus lxml: {time.time() - t2}")
df_rlxml.head()

####################
Time taken for normal requests plus lxml: 107.69247579574585


Unnamed: 0,full_name,first_name,last_name,height,weight,age,position,Preferred Foot,Weak Foot,Skill Moves,...,ong Shots,ositioning,inishing,olleys,ribbling,nterceptions,LAM8,CAM8,RAM8,K Accuracy
0,Vítor Machado Ferreira,Vítor,Ferreira,173,64,21,"CM,CAM",Right,3,3,...,,,,,,,,,,
1,Alen HaliloviÄ,Alen,HaliloviÄ,169,69,25,"RM,CAM",Left,2,4,...,,,,,,,,,,
2,Souleymane Isaak TourÃ©,Souleymane,TourÃ©,204,10,18,CB,Left,3,2,...,,,,,,,,,,
3,Noa Noëll Lang,Noa,Lang,179,69,22,"LW,RW,ST",Right,3,4,...,,,,,,,,,,
4,Chidera Ejuke,Chidera,Ejuke,174,72,23,"LM,RM",Right,3,4,...,,,,,,,,,,


## Multi-Requests

## MultiThread

## Asynchronus

## Others