In [2]:
# Before web crawling, it's important to keep these in mind:
# 1. Do not overload the website: pause between requests, e.g. with time.sleep(1)
# 2. Check for possibilities of copyright issues
# 3. Check the website's robots.txt and make sure the rules under 'User-agent: *' allow (do not 'Disallow') the pages you want to crawl
# (You can usually check this by visiting 'www.website_url.com/robots.txt')

### Transfermarkt Soccer Players Value Crawling

In [3]:
# install beautiful soup
%pip install beautifulsoup4

# import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
# set headers using your browser's user agent (search 'my user agent' on google)
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'}

# set the url variable to the url you want to crawl
url = 'https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop'

# make a request; the timeout (in seconds) keeps this cell from waiting
# forever when the server never answers
r = requests.get(url, headers = headers, timeout = 10)

# stop here with an HTTPError on a 4xx/5xx answer, so that the following
# cells never try to parse an error page
r.raise_for_status()

# receive status code (if 200, connected successfully)
r.status_code

200

In [5]:
# hand the downloaded page to beautiful soup, parsed with python's built-in html parser
# (the raw bytes in r.content can be passed instead of the decoded r.text)
page_html = r.text
soup = BeautifulSoup(page_html, 'html.parser')

# to inspect the parsed document while debugging, run print(soup)

In [6]:
# find the html tag and class that includes players' info:
# every player is one <tr> whose class is either "odd" or "even"
player_info = soup.find_all('tr', class_=['odd', 'even'])

# check the data (first player row)
print(player_info[0])

# check length of the array
print(f'number of players: {len(player_info)}')

<tr class="odd">
<td class="zentriert">1</td><td class=""><table class="inline-table"><tr><td rowspan="2"><a href="#"><img alt="Jude Bellingham" class="bilderrahmen-fixed" src="https://img.a.transfermarkt.technology/portrait/small/581678-1693987944.jpg?lm=1" title="Jude Bellingham"/></a></td><td class="hauptlink"><a href="/jude-bellingham/profil/spieler/581678" title="Jude Bellingham">Jude Bellingham</a></td></tr><tr><td>Attacking Midfield</td></tr></table></td><td class="zentriert">20</td><td class="zentriert"><img alt="England" class="flaggenrahmen" src="https://tmssl.akamaized.net/images/flagge/verysmall/189.png?lm=1520611569" title="England"/><br/><img alt="Ireland" class="flaggenrahmen" src="https://tmssl.akamaized.net/images/flagge/verysmall/72.png?lm=1520611569" title="Ireland"/></td><td class="zentriert"><a href="/real-madrid/startseite/verein/418" title="Real Madrid"><img alt="Real Madrid" class="" src="https://tmssl.akamaized.net/images/wappen/verysmall/418.png?lm=1697726166"

In [7]:
# one empty array per scraped column; the generator builds a separate
# list for each name, so appending to one never affects another
number, name, position, age, nation, team, value = ([] for _ in range(7))

In [8]:
# make a for loop to find and append data to arrays

# empty the arrays first: they are created in an earlier cell, so without
# this, running this cell a second time would append every player again
for column in (number, name, position, age, nation, team, value):
	column.clear()

for info in player_info:
	# find all <td> tag and create an array with its content
	# find_all is recursive, so the cells of the nested "inline-table" are
	# included and shift the later indexes:
	# 0 = rank, 1 = cell wrapping the inline table, 2 = portrait, 3 = name,
	# 4 = position, 5 = age, 6 = nationality flags, 7 = club, 8 = market value
	player = info.find_all('td')

	# append first array's text data into corresponding arrays
	number.append(player[0].text)
	name.append(player[3].text)
	position.append(player[4].text)
	age.append(player[5].text)
	# .img is the first <img> in the cell, so for a player with several
	# flags only the first listed nationality is kept
	nation.append(player[6].img['alt'])
	team.append(player[7].a['title'])
	value.append(player[8].a.text)

# once the loop is over, all the data will be appended to data arrays
# check the data
print(number)
print(name)
print(position)
print(age)
print(nation)
print(team)
print(value)


['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25']
['Jude Bellingham', 'Erling Haaland', 'Kylian Mbappé', 'Vinicius Junior', 'Bukayo Saka', 'Jamal Musiala', 'Phil Foden', 'Lautaro Martínez', 'Victor Osimhen', 'Declan Rice', 'Rodri', 'Harry Kane', 'Florian Wirtz', 'Rodrygo', 'Federico Valverde', 'Moisés Caicedo', 'Pedri', 'Gavi', 'Eduardo Camavinga', 'Julián Álvarez', 'Aurélien Tchouaméni', 'Rafael Leão', 'Martin Ødegaard', 'Gabriel Martinelli', 'Bruno Guimarães']
['Attacking Midfield', 'Centre-Forward', 'Centre-Forward', 'Left Winger', 'Right Winger', 'Attacking Midfield', 'Right Winger', 'Centre-Forward', 'Centre-Forward', 'Defensive Midfield', 'Defensive Midfield', 'Centre-Forward', 'Attacking Midfield', 'Right Winger', 'Central Midfield', 'Defensive Midfield', 'Central Midfield', 'Central Midfield', 'Central Midfield', 'Second Striker', 'Defensive Midfield', 'Left Winger', 'Attacking Midfield'

In [9]:
# pair each column label with its scraped array and build the dataframe
# (the order of the labels is the order of the columns)
column_names = ["number", "name", "position", "age", "nation", "team", "value"]
column_data = [number, name, position, age, nation, team, value]
df = pd.DataFrame(dict(zip(column_names, column_data)))

# display the resulting table
df

Unnamed: 0,number,name,position,age,nation,team,value
0,1,Jude Bellingham,Attacking Midfield,20,England,Real Madrid,€180.00m
1,2,Erling Haaland,Centre-Forward,23,Norway,Manchester City,€180.00m
2,3,Kylian Mbappé,Centre-Forward,25,France,Paris Saint-Germain,€180.00m
3,4,Vinicius Junior,Left Winger,23,Brazil,Real Madrid,€150.00m
4,5,Bukayo Saka,Right Winger,22,England,Arsenal FC,€120.00m
5,6,Jamal Musiala,Attacking Midfield,20,Germany,Bayern Munich,€110.00m
6,7,Phil Foden,Right Winger,23,England,Manchester City,€110.00m
7,8,Lautaro Martínez,Centre-Forward,26,Argentina,Inter Milan,€110.00m
8,9,Victor Osimhen,Centre-Forward,25,Nigeria,SSC Napoli,€110.00m
9,10,Declan Rice,Defensive Midfield,25,England,Arsenal FC,€110.00m


In [10]:
# write the table to a csv file; index=False leaves out the dataframe's row index
csv_path = 'soccer_player_value_25.csv'
df.to_csv(csv_path, index=False)