In [7]:
# Before web crawling, it's important to keep these in mind:
# 1. Do not overload the website: pause between requests, e.g. with time.sleep(1)
# 2. Check for possibilities of copyright issues
# 3. Check the website's robots.txt and make sure the rules under 'User-agent: *' allow the pages you want to crawl
# (You can usually check this by visiting 'www.website_url.com/robots.txt')

### Transfermarkt Soccer Players Value Crawling
Now let's crawl multiple pages and collect the data of 50 players.
By looping over the pages, we can automate the crawling.

In [8]:
# install beautiful soup
%pip install beautifulsoup4

# import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# send a browser-like user agent with every request
# (search 'my user agent' on google to find the one of your own browser)
headers = {
	'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
}

# one independent empty list per column of player data; the crawl appends to them
number, name, position, age, nation, team, value = ([] for _ in range(7))

# to crawl multiple pages with different urls, loop over the page numbers
# (range(1, 3) requests pages 1 and 2)
for page_num in range(1, 3):

	# set the url variable to the url of the current page
	# NOTE(review): the path contains segments that look accidental
	# ('fonts/Roboto-Regular.ttf/page/20//') - kept unchanged, confirm the site still accepts it
	url = f'https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop/plus/ajax/yw1/0/fonts/Roboto-Regular.ttf/page/20//page/{page_num}'

	# make a request; the timeout keeps the cell from hanging on a stalled connection
	r = requests.get(url, headers = headers, timeout = 10)

	# check the status code (200 means connected successfully):
	# raise an HTTPError on a 4xx/5xx answer instead of silently parsing an error page
	r.raise_for_status()

	# setup beautiful soup with the html text of the response
	soup = BeautifulSoup(r.text, 'html.parser')

	# find the table rows (<tr> with class 'odd' or 'even') that hold the players' info
	player_info = soup.find_all('tr', class_=['odd', 'even'])

	# make a for loop to find and append data to arrays
	for info in player_info:
		# find all <td> tags of the row and create an array with them
		player = info.find_all('td')

		# append the data of each cell to its corresponding array
		number.append(player[0].text)
		name.append(player[3].text)
		position.append(player[4].text)
		age.append(player[5].text)
		# nation and team are read from tag attributes, not from the cell text
		nation.append(player[6].img['alt'])
		team.append(player[7].a['title'])
		value.append(player[8].a.text)

	# rest for 1 second between pages so the website is not overloaded
	time.sleep(1)

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [9]:
# combine the scraped lists into one pandas dataframe, one column per list
df = pd.DataFrame(
	dict(
		number=number,
		name=name,
		position=position,
		age=age,
		nation=nation,
		team=team,
		value=value,
	)
)

# show the dataframe as the cell output
df

Unnamed: 0,number,name,position,age,nation,team,value
0,1,Jude Bellingham,Attacking Midfield,20,England,Real Madrid,€180.00m
1,2,Erling Haaland,Centre-Forward,23,Norway,Manchester City,€180.00m
2,3,Kylian Mbappé,Centre-Forward,25,France,Paris Saint-Germain,€180.00m
3,4,Vinicius Junior,Left Winger,23,Brazil,Real Madrid,€150.00m
4,5,Bukayo Saka,Right Winger,22,England,Arsenal FC,€120.00m
5,6,Jamal Musiala,Attacking Midfield,20,Germany,Bayern Munich,€110.00m
6,7,Phil Foden,Right Winger,23,England,Manchester City,€110.00m
7,8,Lautaro Martínez,Centre-Forward,26,Argentina,Inter Milan,€110.00m
8,9,Victor Osimhen,Centre-Forward,25,Nigeria,SSC Napoli,€110.00m
9,10,Declan Rice,Defensive Midfield,25,England,Arsenal FC,€110.00m


In [10]:
# write the dataframe to a csv file, leaving out the index column
df.to_csv(path_or_buf='soccer_player_value_50.csv', index=False)
