In [1]:
#import required libraries

import random
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Fetch a single Wikipedia article and parse it, to check the page can be retrieved.

# 'query' is the article title appended to the base URL; change it here to
# quickly point the notebook at a different page.
query = 'AFC_Wimbledon'

url = 'https://en.wikipedia.org/wiki/' + query
# Pass a timeout so the cell fails fast instead of hanging indefinitely when
# the server does not respond (requests applies no timeout by default).
response = requests.get(url, timeout=10)
bs_html = BeautifulSoup(response.text, features="html.parser")

# bs_html now holds the parsed HTML of the entire page.

In [None]:
# Preview the parsed page: prettify() renders the HTML as an indented string.
page_markup = bs_html.prettify()
print(page_markup)

In [None]:
# Check whether the request was successful. We want the status code to be 200,
# or at least to start with a 2; anything else indicates a problem.

status = response.status_code
print(status)

In [None]:
# Collect the target of every link on the page.
# First create a list to hold the links.

links = []

# Links are the <a> (anchor) tags. Some anchors have no 'href' attribute, so
# those are skipped with an explicit check instead of a bare 'except', which
# would also hide any unrelated error raised inside the loop.

for a in bs_html.find_all("a"):
    href = a.get("href")  # None when the anchor has no 'href'
    if href is not None:
        links.append(href)


# Then loop over the list and print each link on its own line.
for link in links:
    print(link)

In [None]:
# Many of the collected links point outside Wikipedia; here we only want internal links.

# Internal links start with '/wiki/', so keep only the entries with that prefix.

filtered = [link for link in links if link.startswith('/wiki/')]

for f in filtered:
    print(f)

In [None]:
# Plenty of the internal links still point at things we don't want (pictures,
# help pages, etc.). Any link containing one of these substrings is dropped.

ignores = [
    'png', 'jpg', 'jpeg', 'isbn', 'svg', 'identifier',
    'File', 'Special', 'Template', 'Mailto', 'Portal',
    'Help', 'Category', 'Talk', 'Wikipedia', 'Main_Page',
]

filtered = []

for link in links:
    # Only links to wiki pages are candidates.
    if not link.startswith('/wiki/'):
        continue

    # Skip the link as soon as it contains any ignored substring.
    if any(ignore in link for ignore in ignores):
        continue

    # Whatever survives both checks is kept.
    filtered.append(link)

for f in filtered:
    print(f)

In [None]:
# Download the article, extract its first "wikitable" and export it to a file.
wikiurl="https://en.wikipedia.org/wiki/AFC_Wimbledon"

# Request the page and print the status code (200 means success).
# The timeout stops the cell from hanging forever on an unresponsive server.
response = requests.get(wikiurl, timeout=10)
print(response.status_code)

# Parse the HTML into a BeautifulSoup object.
bs_html = BeautifulSoup(response.text, 'html.parser')

# The page contains several <table> elements we don't want, so select only
# the first table carrying the "wikitable" class.
tabledata = bs_html.find('table', {'class': "wikitable"})

# find() returns None when nothing matches; fail with a clear message rather
# than handing the string "None" to the HTML table parser.
if tabledata is None:
    raise ValueError('no table with class "wikitable" found at ' + wikiurl)

# read_html returns a list of DataFrames, one per table found. The markup is
# wrapped in StringIO because passing literal HTML as a plain string is
# deprecated in recent pandas releases.
tables = pd.read_html(StringIO(str(tabledata)))

# Only one table was passed in, so take the first (and only) DataFrame.
df = tables[0]
print(df.head())

# Write the data to disk. NOTE: the file has a .csv extension but the
# separator is a tab, so the content is tab-delimited.
df.to_csv('team_info.csv', sep='\t', encoding='utf-8')