In [1]:
#import required libraries

from bs4 import BeautifulSoup
import requests
import random
import pandas as pd

In [2]:
#check we can get the data from the page

#here we use 'query' for the end of the url, this allows us to quickly change it
query = 'AFC_Wimbledon'

url = 'https://en.wikipedia.org/wiki/' + query
response = requests.get(url)
bs_html = BeautifulSoup(response.text, features="html.parser")

#this grabs us the html of the entire page

In [3]:
# we can preview the html contents
print(bs_html.prettify())

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   AFC Wimbledon - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-e

In [4]:
#this will check if the request was sucsessful. we want it to be 200, or at least start with a 2... anything else is a problem.

print(response.status_code)

200


In [5]:
#now I want to find only the link on this page
#first we create an array for the links

links = []

#we are looking for all of the <a> anchor tags.
# we do this with a for loop, we use 'try' and 'except' as some of the anchors may not have an 'href'. we ignore these otherwise it could cause an error.

for a in bs_html.find_all("a"):
    try:
        links.append(a["href"])
    except:
        pass


    #then another for loop to cycle though the array and print each link
for link in links:
    print(link)

#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Main_Page
/wiki/Special:Search
https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en
/w/index.php?title=Special:CreateAccount&returnto=AFC+Wimbledon
/w/index.php?title=Special:UserLogin&returnto=AFC+Wimbledon
https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en
/w/index.php?title=Special:CreateAccount&returnto=AFC+Wimbledon
/w/index.php?title=Special:UserLogin&returnto=AFC+Wimbledon
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
#
#History
#Foundation
#Badge
#Non-League_football_(2002–2011)
#Combined_Counties_League_(2002–2004)
#Isthmian_Lea

In [6]:
#many of the links are from outside wikipedia. in this case we only want internal links

#we can then filter the array to only include links starting with /wiki/. so only internal links will show.

filtered = []

for link in links:
  if link.startswith('/wiki/'):
    filtered.append(link)

for f in filtered:
    print(f)

/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Main_Page
/wiki/Special:Search
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
/wiki/AFC_Wimbledon
/wiki/Talk:AFC_Wimbledon
/wiki/AFC_Wimbledon
/wiki/AFC_Wimbledon
/wiki/Special:WhatLinksHere/AFC_Wimbledon
/wiki/Special:RecentChangesLinked/AFC_Wimbledon
/wiki/Wikipedia:File_Upload_Wizard
/wiki/Special:SpecialPages
/wiki/Wikipedia:Protection_policy#semi
/wiki/AFC_Wimbledon_Women
/wiki/File:AFC_Wimbledon_(2020)_logo.svg
/wiki/Plough_Lane
/wiki/Johnnie_Jackson
/wiki/EFL_League_Two
/wiki/2023%E2%80%9324_EFL_League_Two
/wiki/EFL_League_Two
/wiki/Kit_(association_football)
/wiki/Away_colours
/wiki/Third_jersey
/wiki/2024%E2%80%9325_AFC_Wimbledon_season
/wiki/Association_football
/wiki/Wimbledon,_London
/wiki/Lon

In [7]:
#there are still a lot of links to stuff we dont want eg. pictures, help files ect. We can use ignore to filter them out.

ignores = ['png', 'jpg', 'jpeg', 'isbn', 'svg', 'identifier', \
           'File', 'Special', 'Template', 'Mailto', 'Portal', \
           'Help', 'Category', 'Talk', 'Wikipedia', 'Main_Page']

filtered = []

#this line states only links that are to wiki pages are valid
for link in links:
    if link.startswith('/wiki/'):
        valid = True

        # this line then makes all our ingnored links invalid
        for ignore in ignores:
            if ignore in link:
                valid = False
                break

        # if the link is valid we then add it to our 'filtered' array
        if valid:
            filtered.append(link)

for f in filtered:
    print(f)

/wiki/AFC_Wimbledon
/wiki/AFC_Wimbledon
/wiki/AFC_Wimbledon
/wiki/AFC_Wimbledon_Women
/wiki/Plough_Lane
/wiki/Johnnie_Jackson
/wiki/EFL_League_Two
/wiki/2023%E2%80%9324_EFL_League_Two
/wiki/EFL_League_Two
/wiki/Kit_(association_football)
/wiki/Away_colours
/wiki/Third_jersey
/wiki/2024%E2%80%9325_AFC_Wimbledon_season
/wiki/Association_football
/wiki/Wimbledon,_London
/wiki/London_Borough_of_Merton
/wiki/London
/wiki/EFL_League_Two
/wiki/English_football_league_system
/wiki/Wimbledon_F.C.
/wiki/The_Football_Association
/wiki/Relocation_of_Wimbledon_F.C._to_Milton_Keynes
/wiki/Milton_Keynes
/wiki/Milton_Keynes_Dons_F.C.
/wiki/London_Football_Association
/wiki/Surrey_County_Football_Association
/wiki/Combined_Counties_Football_League
/wiki/Promotion_and_relegation
/wiki/Kingsmeadow,_Kingston_upon_Thames
/wiki/Isthmian_League#Premier_Division
/wiki/Kingstonian_F.C.
/wiki/Chelsea_F.C._Women
/wiki/Plough_Lane
/wiki/Wimbledon_Stadium
/wiki/Plough_Lane_(1912%E2%80%931998)
/wiki/Relocation_of_W

In [9]:
# get the response in the form of html
wikiurl="https://en.wikipedia.org/wiki/AFC_Wimbledon"

# check the request was sucsessful (code 200)
response=requests.get(wikiurl)
print(response.status_code)

# parse data from the html into a beautifulsoup object
bs_html = BeautifulSoup(response.text, 'html.parser')

# here we find any element with the table tag, there are some of these we dont want on this page.
# So we specify only tables using the "wikitable" class

tabledata=bs_html.find('table',{'class':"wikitable"})

#read the table data
df=pd.read_html(str(tabledata))

# convert list to pandas dataframe
df=pd.DataFrame(df[0])
print(df.head())

#write the data to a .csv file
df.to_csv('team_info.csv', sep='\t', encoding='utf-8')

200
          Period[73] Kit manufacturer         Shirt sponsor
0  2002 (pre season)            Umbro  Championship Manager
1          2002–2012   Tempest Sports    Sports Interactive
2          2012–2014   Tempest Sports      Football Manager
3          2014–2018          Admiral      Football Manager
4          2018–2022             Puma      Football Manager


  df=pd.read_html(str(tabledata))
