In [None]:
# Columns available per world: world, players, location, type, activity

# Extract 1: 'world', 'location', 'type', 'activity'
# Extract 2: 'world', 'datetime', 'players'

In [1]:
import numpy as np
import pandas as pd
import re
import requests

from bs4 import BeautifulSoup

In [18]:
def drop_even_columns(data):
    """Keep only the columns found at odd positions (1, 3, 5, ...).

    The labels sitting at the odd positions of ``data.columns`` are collected
    and used for a label-based selection, so the columns at even positions
    (0, 2, 4, ...) are left out of the result.
    """
    odd_position_labels = data.columns[1::2]
    return data[odd_position_labels]


def select(data, columns):
    """Return ``data[columns]``.

    Thin wrapper so a column selection can be used as a ``.pipe`` step.
    """
    return data[columns]


def rename_columns(data):
    """Relabel the columns, by position, with the world-list field names.

    The existing labels are paired in order with 'world', 'players',
    'location', 'type' and 'activity'. Pairing stops at the shorter of the two
    sequences, so any columns beyond the fifth keep their original labels.
    A renamed frame is returned; ``data`` itself is not modified.
    """
    target_names = ('world', 'players', 'location', 'type', 'activity')
    mapping = {old: new for old, new in zip(data.columns, target_names)}
    return data.rename(columns=mapping)


def get_content(tag, index=0):
    """Return ``tag.contents[index]``, or NaN when that lookup is not possible.

    Parameters
    ----------
    tag : object
        Anything exposing a ``contents`` sequence. Values without such an
        attribute (NaN, None, plain strings, ...) are tolerated.
    index : int, default 0
        Position of the wanted element inside ``tag.contents``.

    Returns
    -------
    object
        The element at ``index``, or ``np.nan`` if ``tag`` has no ``contents``
        attribute, ``contents`` cannot be indexed, or ``index`` is out of range.
    """
    try:
        return tag.contents[index]
    except (AttributeError, IndexError, KeyError, TypeError):
        # Only expected lookup failures become NaN. A bare ``except`` would
        # also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return np.nan


def get_contents(data):
    """Replace every cell with a child element of the tag it holds.

    The 'world' column is unwrapped one level first (element 1 of each tag's
    contents). After that, every column -- 'world' included -- is replaced by
    element 0 of its contents. All lookups go through ``get_content``.

    ``data`` is modified in place and also returned, so the function can be
    used as a ``.pipe`` step.
    """
    def second_child(tag):
        return get_content(tag, index=1)

    # 'world' is nested one level deeper than the other columns.
    data['world'] = data['world'].apply(second_child)

    for name in data.columns:
        data[name] = data[name].apply(get_content)

    return data


def get_number(pattern, string):
    """Return the first match of ``pattern`` in ``string``, or NaN.

    Parameters
    ----------
    pattern : str or compiled pattern
        Regular expression handed to ``re.search``.
    string : object
        Text to search. Non-text values (NaN, None, ...) are tolerated.

    Returns
    -------
    str or float
        The matched text (``match.group(0)``), or ``np.nan`` when ``string``
        is not searchable text or contains no match.

    Notes
    -----
    Only those two data-related cases map to NaN. An invalid ``pattern`` now
    raises ``re.error`` instead of silently turning every value into NaN.
    """
    try:
        match = re.search(pattern, string)
    except TypeError:
        # ``string`` is not text (e.g. NaN left behind by an earlier step).
        return np.nan

    if match is None:
        return np.nan
    return match.group(0)


def get_numbers(data):
    """Convert the 'world' and 'players' columns to numbers.

    For each of the two columns, the first run of digits is pulled out of
    every value with ``get_number``. The column is then cast to ``int16``; if
    that cast fails, it is cast to ``float`` instead.

    ``data`` is modified in place and also returned, so the function can be
    used as a ``.pipe`` step.
    """
    columns = ['world', 'players']
    pattern = re.compile(r'\d+')
    for column in columns:
        data[column] = data[column].apply(lambda s: get_number(pattern, s))
        try:
            data[column] = data[column].astype(np.int16)
        except (TypeError, ValueError, OverflowError):
            # Fall back to float when the values cannot be held as int16.
            # ``np.float`` was only an alias of the builtin ``float`` and was
            # removed in NumPy 1.24, so the builtin is used directly.
            data[column] = data[column].astype(float)

    return data


def transform_data(data, columns):
    """Run the cleaning pipeline on a copy of ``data`` and keep ``columns``.

    Steps, in order: ``drop_even_columns``, ``rename_columns``,
    ``get_contents``, ``get_numbers``, then ``select`` with ``columns``.
    The frame passed in is copied first and is not modified.
    """
    working = data.copy()
    steps = (drop_even_columns, rename_columns, get_contents, get_numbers)
    for step in steps:
        working = step(working)
    return select(working, columns=columns)

In [None]:
# Read a locally saved copy of the world list page as raw bytes ...
with open('osrs_worlds.html', 'rb') as page:
    page_content = page.read()

# ... and parse it with the lxml parser once the file has been read.
soup = BeautifulSoup(page_content, 'lxml')

In [34]:
def scrape_osrs_world_select(url='https://oldschool.runescape.com/slu', timeout=30):
    """Download the Old School RuneScape world list page.

    Parameters
    ----------
    url : str, optional
        Address of the world list page. Defaults to the URL this notebook
        has always used.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed to ``requests.get``. Without a
        timeout, a stalled connection would block the notebook indefinitely.

    Returns
    -------
    requests.Response
        The raw response. HTTP error statuses are not raised here; callers
        can check ``response.ok`` / ``response.status_code``.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when ``timeout`` is exceeded.
    """
    response = requests.get(url, timeout=timeout)
    return response

def extract_data(response):
    """Collect the world list rows of a fetched page into a DataFrame.

    ``response.text`` is parsed with the lxml parser and every ``<tr>``
    element carrying the ``server-list__row`` class is collected. The rows
    are wrapped in an object-dtype NumPy array and then in a DataFrame.

    NOTE(review): the downstream steps (dropping every other column, then
    reading ``.contents`` of each cell) suggest that NumPy expands each row
    tag into its child nodes, one per column -- confirm against a live page.
    """
    soup = BeautifulSoup(response.text, 'lxml')
    # ``find_all`` is the current name; ``findAll`` is a deprecated alias.
    rows = soup.find_all('tr', {'class': 'server-list__row'})
    data = pd.DataFrame(np.array(rows, dtype=object))
    return data

In [26]:
# Fetch the world list page; the Response is kept for the cells below.
response = scrape_osrs_world_select()

In [35]:
# Build the raw (still untransformed) table of server-list rows from the response.
data = extract_data(response)

In [44]:
# Show only the start of the page: the full HTML is very large and would
# bloat the notebook output.
response.text[:500]

'<!doctype html>\r\n    <!--[if lt IE 7]><html class=\'no-js lt-ie10 lt-ie9 lt-ie8 lt-ie7\' lang=\'en\'><![endif]-->\r\n    <!--[if (IE 7)&!(IEMobile)]><html class=\'no-js lt-ie10 lt-ie9 lt-ie8\' lang=\'en\'><![endif]-->\r\n    <!--[if (IE 8)&!(IEMobile)]><html class=\'no-js lt-ie10 lt-ie9\' lang=\'en\'><![endif]-->\r\n    <!--[if (IE 9)&!(IEMobile)]><html class=\'no-js lt-ie10\' lang=\'en\'><![endif]-->\r\n    <!--[if gt IE 9]><!--><!-- x --> <html class=\'no-js no-autoplay\' lang=\'en\'> <!--<![endif]-->\r\n<head>\r\n    <title>Play Old School RuneScape - World Server List</title>\r\n\t\t<meta name=\'description\' content="Challenging levelling system and risk-it-all PvP. Experience the world of Old School RuneScape."/>\r\n    <meta name=\'author\' content=\'Jagex\'/>\r\n    <meta http-equiv=\'X-UA-Compatible\' content=\'IE=edge\'>\r\n    <link rel=\'dns-prefetch\' href=\'//www.google-analytics.com\' />\r\n    <link rel=\'dns-prefetch\' href=\'//ajax.googleapis.com\' />\r\n    <link 

In [29]:
# Sanity check of the request: success flag, then the HTTP status code.
print(response.ok, response.status_code, sep='\n')

True
200


In [None]:
# check_logs
# update_logs

In [41]:
# Total players across all worlds. The player table is built right here so
# the cell does not depend on a variable assigned in a later cell, which
# would fail after Restart & Run All.
transform_data(data, columns=['world', 'players'])['players'].sum()

94264.0

In [38]:
# Extract 1: the descriptive fields of each world.
world_info = transform_data(data, columns=['world', 'location', 'type', 'activity'])
world_info

Unnamed: 0,world,location,type,activity
0,168,United States,Free,500 skill total
1,183,United States,Free,-
2,172,United States,Free,-
3,117,United States,Free,-
4,94,United States,Free,Clan Wars - Free
...,...,...,...,...
258,234,Australia,Members,-
259,88,Australia,Members,Theatre of Blood
260,87,Australia,Members,Blast Furnace
261,89,Australia,Members,Wintertodt


In [39]:
# Extract 2 (without a datetime so far): player count per world.
# 'players' is displayed as float in the output below.
transformed_data = transform_data(data, columns=['world', 'players'])
transformed_data

Unnamed: 0,world,players
0,168,107.0
1,183,197.0
2,172,225.0
3,117,226.0
4,94,237.0
...,...,...
258,234,497.0
259,88,520.0
260,87,532.0
261,89,533.0


In [40]:
# Inspect world 30: the output below shows no player count for it.
# NOTE(review): presumably this missing value is why 'players' ends up as float -- confirm.
transformed_data[transformed_data['world'] == 30]

Unnamed: 0,world,players
98,30,
