# Table of contents

- [Requests](#Requests)
- [Multi Requests](#Multi-Requests)
- [Multi Threading](#MultiThread)
- [Asynchronous](#Asynchronus)

### Global Variables

In [13]:
# Variables for functions
# Prefix for offset pages, where each offset contains 60 players
BASE_URL = "https://sofifa.com/?offset="
# Prefix for each player's individual link
P_URL = "https://sofifa.com/"
# Number of offsets to reach (a large value, up to 20000, is good)
N = 180
# List to store all offset urls
OFFSET_URLS = []
# List to store all player urls
PLAYERS_URLS = []
# Bloom filter used to check for duplicates
# NOTE(review): error_rate=0.1 presumably allows roughly 10% false positives,
# i.e. items wrongly reported as already seen - confirm this is acceptable.
from bloom_filter import BloomFilter
BLOOM = BloomFilter(max_elements=25000, error_rate=0.1)

# Variables to store scraped data (none are defined in this cell)

# Directory and file name used for output files
OUTDIR = "data/raw"
FILENAME = "fifa22_players"

## Requests

#### Imports

In [20]:
import requests
from bs4 import BeautifulSoup
import lxml.html as lx
import time
import pandas as pd

#### Functions

In [21]:
# Module-level accumulator: request_scrap() appends one list of player
# records (the return value of get_players) for every page it processes,
# so this is a list of lists rather than a flat list of records.
requests_players_scraped = []


def get_page(url, timeout=30):
    """Download a page and return the table body that lists the players.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        The first ``<tbody class="list">`` element found in the page, or
        ``None`` when the page contains no such element.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.find("tbody", {"class": "list"})


def get_players(trs, version="22"):
    """Collect the records of the listed players that belong to one version.

    Args:
        trs: Iterable of table-row elements. Each row must hold a
            ``td.col-name`` cell whose first anchor carries the player's
            ``href``.
        version: String the last part of the href must start with for the
            row to be kept. Defaults to ``"22"``.

    Returns:
        A list with the ``extract_info`` result of every kept row, in the
        order the rows were given.

    Raises:
        ValueError: If a player's href contains fewer than four ``/``
            separators.
        Exception: Any other error raised while reading a row propagates
            unchanged.
    """
    base = "https://sofifa.com/"
    attr = "?attr=classic"
    out = []
    for tr in trs:
        name = tr.select('td.col-name')
        p_url = name[0].find("a").get("href")
        # Only the trailing part is needed, but unpacking five targets keeps
        # the ValueError for hrefs that do not have the expected shape.
        _, _, _, _, tail = p_url.split("/", 4)
        if not tail.startswith(version):
            continue
        # `base` already ends with "/", so drop a leading "/" from the href
        # to avoid building "https://sofifa.com//player/...".
        link = base + p_url.lstrip("/") + attr
        out.append(extract_info(tr, link))
    return out


def extract_info(tr, link):
    """Build one player's record from a row of the listing table.

    Args:
        tr: Row element exposing ``select``. Its ``td.col-name`` cells
            provide the name, country, position and club; the remaining
            values come from the ``td.col.col-*`` cells.
        link: Accepted from the caller but not read by this function.

    Returns:
        A dict with the keys ``name``, ``country``, ``age``, ``overall``,
        ``potential``, ``club``, ``best_position``, ``value``, ``wage`` and
        ``total_stats``, in that order.
    """
    def cell_text(css):
        # Text of the first element matching the selector, whitespace removed.
        return tr.select(css)[0].text.strip()

    name_cells = tr.select('td.col-name')

    record = {}
    record["name"] = name_cells[0].find("a").get("aria-label")
    record["country"] = name_cells[0].find("img").get("title")
    record["age"] = cell_text('td.col.col-ae')
    record["overall"] = cell_text('td.col.col-oa')
    record["potential"] = cell_text('td.col.col-pt')
    # The club link sits in the second `td.col-name` cell of the row.
    record["club"] = name_cells[1].find("a").text
    record["best_position"] = name_cells[0].find("span").text
    record["value"] = cell_text('td.col.col-vl')
    record["wage"] = cell_text('td.col.col-wg')
    record["total_stats"] = cell_text('td.col.col-tt')
    return record


def request_scrap(urls):
    """Scrape every page in ``urls`` and store the parsed players.

    Each URL is fetched with ``get_page``, its rows are parsed with
    ``get_players``, and the resulting list is appended to the module-level
    ``requests_players_scraped``. The accumulator therefore gains one inner
    list per page.

    Args:
        urls: Iterable of page addresses to scrape, processed in order.

    Returns:
        None. Results are stored in ``requests_players_scraped``.

    Raises:
        AttributeError: If ``get_page`` returns ``None`` for a page, i.e. the
            page has no player table.
    """
    for url in urls:
        tbody = get_page(url)
        # `find_all` is the supported spelling; `findAll` is a deprecated alias.
        trs = tbody.find_all("tr")
        requests_players_scraped.append(get_players(trs))

#### Running

In [22]:
print("#" * 20)
# Empty the accumulator first so re-running this cell does not stack a
# second copy of the results on top of the previous run.
requests_players_scraped.clear()
# Starts timer
t1 = time.perf_counter()
request_scrap(OFFSET_URLS)
# request_scrap stores one list of records per page; flatten them so the
# frame holds the players of every page instead of only the first one.
# This also yields an empty frame, not an IndexError, when nothing was scraped.
df_request_bs4 = pd.DataFrame(
    [player for page in requests_players_scraped for player in page]
)
print(f"Time taken for normal requests plus BS4: {time.perf_counter() - t1}")
df_request_bs4

####################
Time taken for normal requests plus BS4: 2.92509126663208


Unnamed: 0,name,country,age,overall,potential,club,best_position,value,wage,total_stats
0,Alen Halilović,Croatia,25,69,72,Reading,RM,€1.8M,€13K,1728
1,Vítor Machado Ferreira,Portugal,21,78,88,FC Porto,CM,€31.5M,€12K,1999
2,Isaak Touré,France,18,63,79,Le Havre AC,CB,€1.1M,€550,1438
3,Noa Lang,Netherlands,22,78,85,Club Brugge KV,LW,€28.5M,€27K,2012
4,Chidera Ejuke,Nigeria,23,76,81,PFC CSKA Moscow,LM,€11.5M,€35K,1905
5,Richarlison de Andrade,Brazil,24,81,86,Everton,ST,€39.5M,€90K,2121
6,Kristjan Asllani,Albania,19,63,78,Empoli,CDM,€1.1M,€4K,1680
7,Owen Wijndal,Netherlands,21,79,84,AZ Alkmaar,LB,€25M,€10K,2035
8,Kayky da Silva Chagas,Brazil,18,66,87,Manchester City,RW,€2.7M,€12K,1567
9,Raphael Dias Belloli,Brazil,24,83,86,Leeds United,RM,€49.5M,€95K,2124


## Multi-Requests

## MultiThread

## Asynchronus

## Others