In [1]:
import requests
from bs4 import BeautifulSoup
# Battle formats to scrape. Copies of this notebook are run with
# different lists here so that each set of formats is collected separately.
formats = [
    'ru',
    'ubers',
]

In [2]:
# Collect the href of every anchor on the first 10 search-result pages of
# each format, keeping first-seen order and dropping duplicates.
links = []
seen_links = set()  # O(1) duplicate check; `links` preserves the order
base_url = 'https://replay.pokemonshowdown.com/search'
for format_name in formats:
    for page in range(1, 11):
        r = requests.get(
            base_url + '/?format=%s&rating&output=html&page=%d'
            % (format_name, page),
            timeout=30,  # do not hang forever on a stalled connection
        )
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(r.content, 'html.parser')
        for link in soup.find_all('a'):
            li = link.get('href')
            # Anchors without an href yield None; skip those and duplicates.
            if li is None or li in seen_links:
                continue
            seen_links.add(li)
            links.append(li)

In [3]:
import signal, re
# Callback installed for SIGALRM: when the alarm goes off, abort the code
# that is currently running by raising a timeout error.
def handler(signum, frame):
    """Signal handler that unconditionally raises ``TimeoutError``."""
    raise TimeoutError()
# Because players can nickname their pokemon, we need to parse the log
# and return dictionaries mapping each nickname to its species.
def get_teams(text):
    """Parse a battle log into nickname -> species dictionaries.

    Reads `switch` and `drag` lines (a pokemon entering the field) and
    `formechange` lines. Forme changes are applied last, so a nickname
    ends up mapped to the forme named in the last `formechange` line.

    Args:
        text: Full text of a replay log.

    Returns:
        Tuple ``(p1a, p2a)`` of dicts mapping nickname to species. Entries
        whose position is ``'p1a'`` go to the first dict; every other
        entry goes to the second.
    """
    p1a = {}
    p2a = {}
    # All groups are lazy so that names containing ': ' (e.g. "Type: Null")
    # and lines with extra trailing fields are split at the first
    # separator instead of the last one.
    entries = re.findall(r'(?:switch|drag)\|(.*?): (.*?)\|(.*?)[,|]', text)
    entries.extend(
        re.findall(r'formechange\|(.*?): (.*?)\|([^,|\n]*)', text))
    for position, nickname, species in entries:
        if position == 'p1a':
            p1a[nickname] = species
        else:
            p2a[nickname] = species
    return p1a, p2a

# Because regex searches over a whole log can take a lot of time to run,
# there are three functions (get_pair1, get_pair2, get_pair3) using
# different heuristics to allow for better runtime.
def get_pair1(p1a, p2a, fainted, text=None):
    """
    Look for a move used on each fainted pokemon to see what the
    opponent was.

    Heuristic: doesn't work if only one pokemon was knocked out, or if
    the pokemon knocked itself out with an attack move.

    Args:
        p1a: nickname -> species dict for player 1.
        p2a: nickname -> species dict for player 2.
        fainted: iterable of (side, nickname) pairs, side being 'p1a'
            or the other player's position.
        text: log text to search. Defaults to ``r.text`` of the
            notebook-level response ``r`` (the original behaviour).

    Returns:
        List of ``[pokemon1, pokemon2, winner]`` where pokemon1 belongs to
        player 1, pokemon2 to player 2, and winner is the one that did
        not faint.

    Raises:
        AttributeError: no matching move line was found.
        KeyError: a nickname is not in the corresponding team dict.
    """
    if text is None:
        text = r.text
    pairs = []
    for side, nickname in fainted:
        # Search for a move by another pokemon to this fainted pokemon.
        # The nickname is user-chosen text, so escape it before splicing
        # it into the pattern.
        regex = (r'(?:(?!faint).)*move\|p.a: (.*?)\|.*?\|'
                 + re.escape(side) + ': ' + re.escape(nickname))
        opponent = re.search(regex, text, re.DOTALL).group(1)
        if side == 'p1a':
            pokemon1 = p1a[nickname]
            pokemon2 = p2a[opponent]
            winner = pokemon2
        else:
            pokemon1 = p1a[opponent]
            pokemon2 = p2a[nickname]
            winner = pokemon1
        pairs.append([pokemon1, pokemon2, winner])
    return pairs

def get_pair2(p1a, p2a, fainted, text=None):
    """
    Look for all fainted pokemons and the pokemons in
    the turn they faint, using a single findall over the log.

    In certain cases where the same pokemon knocks out
    multiple pokemons before being taken down, it will
    take very long to run.

    Args:
        p1a: nickname -> species dict for player 1.
        p2a: nickname -> species dict for player 2.
        fainted: unused here; kept so all get_pair* functions share
            one signature.
        text: log text to search. Defaults to ``r.text`` of the
            notebook-level response ``r`` (the original behaviour).

    Returns:
        List of ``[pokemon1, pokemon2, winner]``. pokemon1 always belongs
        to player 1 and pokemon2 to player 2 (the same column order as
        get_pair1 and get_pair3); winner is the one on the side that did
        not faint.

    Raises:
        KeyError: a matched nickname is not in the corresponding team dict.
    """
    if text is None:
        text = r.text
    pairs = []
    # Groups: mover side, mover nickname, target side, target nickname,
    # side of the pokemon on the following faint line.
    regex = (r'(?:(?!faint).)*move\|(p.a): (.*?)\|.*?\|(p.a): (.+?)\n'
             r'.*?\|faint\|(p.a): .*?\n')
    matches = re.findall(regex, text, re.DOTALL)
    for mover_side, mover, target_side, target, fainted_side in matches:
        # The pokemon may use a status move that knocks itself out
        # in which case we don't count the knock out
        if mover_side == target_side:
            continue
        if mover_side == 'p1a':
            pokemon1 = p1a[mover]
            pokemon2 = p2a[target]
        else:
            # Mover is player 2's pokemon: keep player 1 in column one.
            pokemon1 = p1a[target]
            pokemon2 = p2a[mover]
        # The winner is the pokemon on the side that did not faint.
        if fainted_side == 'p1a':
            winner = pokemon2
        else:
            winner = pokemon1
        pairs.append([pokemon1, pokemon2, winner])
    return pairs

def get_pair3(p1a, p2a, fainted, text=None):
    """
    Look for the last attack the fainted pokemon took
    before fainting to see what's the opponent,
    using findall.

    Each fainted pokemon is handled on its own under a 180 second
    SIGALRM timeout; a pair that times out or fails for any other
    reason is skipped instead of aborting the whole log.

    Args:
        p1a: nickname -> species dict for player 1.
        p2a: nickname -> species dict for player 2.
        fainted: iterable of (side, nickname) pairs.
        text: log text to search. Defaults to ``r.text`` of the
            notebook-level response ``r`` (the original behaviour).

    Returns:
        List of ``[pokemon1, pokemon2, winner]`` with pokemon1 belonging
        to player 1 and pokemon2 to player 2.
    """
    if text is None:
        text = r.text
    pairs = []
    for f in fainted:
        try:
            side, nickname = f[0], f[1]
            # if it takes too long to run, just skip this pair
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(180)
            # Nicknames are user-chosen text: escape before use in a regex.
            target = re.escape(side) + ': ' + re.escape(nickname)
            regex = r'(?:(?!faint).)*move\|(p.a): (.*?)\|.*?\|' + target
            match = re.search(regex, text, re.DOTALL)
            if match.group(1) != side:
                opponent = match.group(2)
            else:
                # First hit was a move from the fainted pokemon's own
                # side; fall back to the last such move in the log.
                matches = re.findall(regex, text, re.DOTALL)
                if matches[-1][0] != side:
                    opponent = matches[-1][1]
                else:
                    # Still its own side: use the target of the last move
                    # the fainted pokemon itself made.
                    regex1 = (r'(?:(?!faint).)*move\|' + target
                              + r'\|.*?\|(p.a): (.+?)\n')
                    matches = re.findall(regex1, text, re.DOTALL)
                    # The pokemon may use a status move that knocks itself out
                    # (the pokemon intentionally knocks itself out)
                    # in which case we don't count the knock out
                    if matches[-1][0] == side:
                        continue
                    opponent = matches[-1][1]
            if side == 'p1a':
                pokemon1 = p1a[nickname]
                pokemon2 = p2a[opponent]
                winner = pokemon2
            else:
                pokemon1 = p1a[opponent]
                pokemon2 = p2a[nickname]
                winner = pokemon1
            pairs.append([pokemon1, pokemon2, winner])
        except Exception:
            # Timeout, no match, unknown nickname, ...: skip this pair.
            # (Exception, not a bare except, so Ctrl-C still interrupts.)
            continue
        finally:
            # Always disarm the alarm - also after a success or the
            # `continue` above - so it cannot fire later in unrelated code.
            signal.alarm(0)
    return pairs

In [4]:
# Download every collected replay log and extract
# [pokemon1, pokemon2, winner] rows from it, trying get_pair1 first and
# falling back to get_pair2, then get_pair3.
data = []
for link in links:
    try:
        r = requests.get('https://replay.pokemonshowdown.com/%s.log' % link,
                         timeout=30)
    except requests.RequestException:
        # A replay that cannot be downloaded is skipped rather than
        # aborting the whole scrape.
        continue
    if 'faint' in r.text:
        p1a, p2a = get_teams(r.text)
        # Anchor on line boundaries instead of consuming the surrounding
        # newlines, so two faint lines in a row (a double knock-out) are
        # both found.
        fainted = re.findall(r'^\|faint\|(p.a): (.*?)$', r.text, re.MULTILINE)
        try:
            pairs = get_pair1(p1a, p2a, fainted)
        except Exception:
            try:
                # Set the signal handler and alarm
                signal.signal(signal.SIGALRM, handler)
                # We expect it to spend no more than 30 seconds
                # on a pair on average, else we fall back to get_pair3
                signal.alarm(30 * len(fainted))
                pairs = get_pair2(p1a, p2a, fainted)
            except Exception:
                pairs = None
            finally:
                # Disarm the alarm on success as well as on failure, so
                # it cannot fire later during an unrelated request.
                signal.alarm(0)
            if pairs is None:
                pairs = get_pair3(p1a, p2a, fainted)
        data.extend(pairs)

In [5]:
import pandas as pd
# Column names for the three-element rows collected in `data`.
headers = ['Pokemon 1', 'Pokemon 2', 'Winner']
# Build the frame once from the complete list of rows.
df = pd.DataFrame(data=data, columns=headers)

In [6]:
# Save the collected rows; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='battle_data2.csv', index=False)

In [8]:
# Preview the first five rows.
# NOTE(review): the saved output below has an extra 'Unnamed: 0' column that
# a frame built from `headers` would not contain - the output looks stale
# (possibly from a CSV re-read); re-run the notebook to confirm.
df.head(n=5)

Unnamed: 0,Pokemon 1,Pokemon 2,Winner
0,Gurdurr,Hitmontop,Gurdurr
1,Registeel,Qwilfish,Registeel
2,Gurdurr,Alomomola,Alomomola
3,Emboar,Tangrowth,Emboar
4,Emboar,Registeel,Emboar
