In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Use SHIFT+TAB keys to popup inplace code help
%config IPCompleter.greedy = True

# Output multiple statements from one input cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [8]:
table_from_top = 2
wikipedia_page = 'List_of_indie_game_developers'
trace = False

In [9]:
wikipedia_url = 'https://en.wikipedia.org/wiki/{}'.format(wikipedia_page)
page = requests.get(wikipedia_url)
soup = BeautifulSoup(page.content, 'lxml')
tables = soup.find_all('table', {'class': 'wikitable'})
table = tables[table_from_top - 1]

In [10]:
feature_names = []

header_row = table.find('tr')
for header in header_row.find_all('th'):
    feature_name = ' '.join(header.find_all(text=True))
    feature_name.replace('\n', '')
    feature_names.append(feature_name)

'Developer'

'City'

'Autonomous area '

'Country'

'Notable games'

'Notes'

In [11]:
feature_names

['Developer\n',
 'City\n',
 'Autonomous area \n',
 'Country\n',
 'Notable games\n',
 'Notes\n']

In [12]:
def has_coords(tag):
    if tag.has_attr('class'):
        if tag['class'][0] == 'latitude' or tag['class'][0] == 'longitude':
            return True
    return False

def get_coords(child):
    coords = []
    for coord in child.find_all(has_coords):
        coords.append(coord.string)
    if coords:
        if trace:
            return 'C = {}'.format(' '.join(coords))
        else:
            return ' '.join(coords)
    else:
        return ''

samples = []
sample_rows = table.find_all('tr')[1:]
for sample_row in sample_rows:
    features = []
    for feature_col in sample_row.find_all('td'):
        feature_value = ''
        text = feature_col.string
        if text:
            if trace:
                features.append('T = {}'.format(text))
            else:
                features.append(text)
            continue
        
        for child in feature_col.children:
            if child.name == 'span':
                if child.has_attr('class'):
                    if child['class'] == 'display:none':
                        continue
                if child.find_all(has_coords):
                    feature_value = get_coords(child)
                    if feature_value:
                        break
                    else:
                        continue
            if child.name == 'sup':
                continue
            if child.name == 'a':
                if child.string[0] == '[':
                    continue            
            if child.name == 'a':
                if trace:
                    feature_value = 'A = {}'.format(child.string)
                else:
                    feature_value = child.string
                break
            if child.name == 'font':
                if trace:
                    feature_value = 'F = {}'.format(child.string)
                else:
                    feature_value = child.string
                break
            try:
                # feature_value = '' for any tags not covered above
                content = child.contents
            except AttributeError:
                # Handle whitespace between child tags, treated as a child string
                if child.isspace():
                    continue
                if trace:
                    feature_value = 'E = {}'.format(child)
                else:
                    feature_value = child
                break
        features.append(feature_value)
    samples.append(dict(zip(feature_names, features)))

In [13]:
df = pd.DataFrame(samples)
df.head()
df.tail()

Unnamed: 0,Developer\n,City\n,Autonomous area \n,Country\n,Notable games\n,Notes\n
0,11 bit studios,Warsaw,\n,Poland,,\n
1,ACE Team,Santiago,Santiago,Chile,,\n
2,Akupara Games,Los Angeles,California,United States,Whispering Willows,\n
3,Alec Holowka,Winnipeg,Manitoba,Canada,,\n
4,Alientrap,Saskatoon,Saskatchewan,Canada,,\n


Unnamed: 0,Developer\n,City\n,Autonomous area \n,Country\n,Notable games\n,Notes\n
196,Zachary Barth,Seattle,Washington,United States,,\n
197,Zoetrope Interactive,Istanbul,\n,Turkey,,\n
198,Zoink,Gothenburg,\n,Sweden,,\n
199,Zoonami,Cambridge,England,United Kingdom,,\n
200,ZootFly,Ljubljana,\n,Slovenia,,\n


In [14]:
to_drop = ['City\n', 'Autonomous area \n', 'Country\n', 'Notable games\n', 'Notes\n']
copy = df.copy()
copy.drop(to_drop, axis = 1, inplace=True)

In [15]:
copy

Unnamed: 0,Developer\n
0,11 bit studios
1,ACE Team
2,Akupara Games
3,Alec Holowka
4,Alientrap
...,...
196,Zachary Barth
197,Zoetrope Interactive
198,Zoink
199,Zoonami


In [17]:
copy.columns = ['Developers']

In [18]:
copy

Unnamed: 0,Developers
0,11 bit studios
1,ACE Team
2,Akupara Games
3,Alec Holowka
4,Alientrap
...,...
196,Zachary Barth
197,Zoetrope Interactive
198,Zoink
199,Zoonami


In [20]:
dataset_file_name = 'D:\Actual education\ML Project\{}.csv'.format(wikipedia_page)
if not trace:
    copy.to_csv(dataset_file_name, index=False)