# Get problem texts from Topcoder

This notebook contains functions to scrape all problem statements from the competitive programming platform Topcoder, since no API is provided.

Note that this data was ultimately not included in the Codeforces prediction model due to lack of time.

**Notes**

API information:
* http://docs.tcapi.apiary.io/#reference/data/top-ranked-srm-members/top-ranked-srm-members

Getting data on users:
* http://api.topcoder.com/v2/users/yj12/statistics/data/srm

Getting list of users:
* Note: the listing is paged, returning only 200 users at a time
* http://www.topcoder.com/tc?cc=&sc=&sd=&cc=&module=AlgoRank&nr=5000&sr=4000

In [1]:
import requests
import bs4
from bs4 import BeautifulSoup
import re
import psycopg2
import pandas as pd

## Functions

### url request

#### list of all contests

In [None]:
# Download the Topcoder problem-archive listing once and cache the raw page on
# disk, so that the parsing cells can be re-run without hitting the site again.
url = "https://www.topcoder.com/tc?module=ProblemArchive&sr=0&er=5000&sc=&sd=&class=&cat=&div1l=&div2l=&mind1s=&mind2s=&maxd1s=&maxd2s=&wr="
r = requests.get(url)
# Fail loudly instead of caching an HTTP error page as if it were the listing.
r.raise_for_status()
# r.content is the undecoded response body, so write it through a binary handle.
with open('TC_questions_list.txt', 'wb') as f:
    f.write(r.content)

In [119]:
# Load the cached problem-archive page back into memory as a single string.
with open('TC_questions_list.txt') as f:
    r = f.read()

# Size of the cached page, shown as the cell output.
len(r)

In [6]:
# Parse the problem-archive listing into `problem_list`: a list of 16-item rows
# (12 normalised cell texts followed by 4 hrefs), with the header row first.

# `r` is either the page text loaded from TC_questions_list.txt or a live
# requests.Response, depending on which cell ran last; accept both.
page_html = getattr(r, 'content', r)
soup = BeautifulSoup(page_html, 'html.parser')

soup_problems = soup.findAll('table', {'class': 'paddingTable2'})

# The second table with this class is the one whose rows are parsed.
tr = soup_problems[1].findAll('tr')
problem_list = []

for t in tr:
    td = t.findAll('td')
    row = []
    links = []
    for d in td:
        links.extend(d.find_all('a'))

        # Normalise the cell text: drop non-ASCII characters, collapse wrapped
        # lines, replace spaces (and ". ") with underscores, strip "%", lowercase.
        txt = d.text.encode('ascii', 'ignore').strip()
        txt = re.sub(r'\n[ ]+', ' ', txt)
        txt = re.sub(r'\. ', '_', txt)
        txt = re.sub(' ', '_', txt)
        txt = re.sub('%', '', txt)
        row.append(txt.lower())

    # Keep the hrefs only when the row carries exactly the four expected links,
    # in page order: problem, round, member profile, detail.
    href = []
    if len(links) == 4:
        href = [a.attrs['href'] for a in links]
    row.extend(href)

    # A 12-cell row that contributed no hrefs is padded with the names of the
    # four href columns; this is how the header row reaches 16 entries.
    # NOTE(review): a data row whose link count is not 4 is padded the same way
    # and would carry these literal names instead of URLs - confirm that never occurs.
    if len(row) == 12:
        row.extend([
            'href_problem',
            'href_round',
            'href_writer',
            'href_detail'
        ])

    # Anything that still is not 16 wide is layout noise, not a problem row.
    if len(row) != 16:
        continue

    problem_list.append(row)

In [121]:
# The first parsed row is the header. Header positions 0, 10 and 11 are renamed
# to their own integer position so those columns can be dropped by label below.
header = problem_list[0]
for position in (0, 10, 11):
    header[position] = position

# Every remaining row is one problem.
df_list = pd.DataFrame.from_dict(problem_list[1:])
df_list.columns = header
df_list = df_list.drop([0, 10, 11], axis=1)

#### get problem details

In [123]:
# Site root. The href_* columns of df_list hold site-relative paths (they start with "/").
base_url = "https://www.topcoder.com/"

# Preview the first rows of the parsed problem table.
df_list.head()

Unnamed: 0,problemname,challenge,date,writer,categories,div_1level,div_1success_rate,div_2level,div_2success_rate,href_problem,href_round,href_writer,href_detail
0,classictowers,srm_715,05.30.2017,lg5293,dynamic_programming,2.0,68.97,,,/stat?c=problem_statement&pm=14591,/stat?c=round_overview&rd=16884,/tc?module=MemberProfile&cr=22858920&tab=alg,/tc?module=ProblemDetail&rd=16884&pm=14591
1,imagecompression,srm_715,05.30.2017,lg5293,string_manipulation,,,1.0,77.6,/stat?c=problem_statement&pm=14612,/stat?c=round_overview&rd=16884,/tc?module=MemberProfile&cr=22858920&tab=alg,/tc?module=ProblemDetail&rd=16884&pm=14612
2,inprepost,srm_715,05.30.2017,lg5293,greedy,,,3.0,31.43,/stat?c=problem_statement&pm=14610,/stat?c=round_overview&rd=16884,/tc?module=MemberProfile&cr=22858920&tab=alg,/tc?module=ProblemDetail&rd=16884&pm=14610
3,maximumrange,srm_715,05.30.2017,lg5293,greedy,1.0,97.97,,,/stat?c=problem_statement&pm=14613,/stat?c=round_overview&rd=16884,/tc?module=MemberProfile&cr=22858920&tab=alg,/tc?module=ProblemDetail&rd=16884&pm=14613
4,maximumrangediv2,srm_715,05.30.2017,lg5293,"brute_force,_string_manipulation",,,2.0,84.43,/stat?c=problem_statement&pm=14611,/stat?c=round_overview&rd=16884,/tc?module=MemberProfile&cr=22858920&tab=alg,/tc?module=ProblemDetail&rd=16884&pm=14611


In [2]:
from time import sleep
def getproblemtext(contest, problem, maxtries=5, delay=5, timeout=30):
    """Download the raw HTML of one Codeforces problem page.

    Parameters
    ----------
    contest : contest id, used as the first path component of the URL.
    problem : problem id within the contest, used as the second path component.
    maxtries : number of attempts before giving up.
    delay : seconds to sleep after each failed attempt.
    timeout : seconds to wait for the server on each attempt.

    Returns
    -------
    The response body (``requests.Response.content``), or ``None`` when every
    attempt failed; in that case an error line naming the problem is printed.
    """
    url = 'http://codeforces.com/problemset/problem/' + str(contest) + '/' + str(problem)
    print(url)

    for _ in range(maxtries):
        try:
            return requests.get(url, timeout=timeout).content
        except requests.exceptions.RequestException:
            # Only network-level failures are retried; anything else (including
            # KeyboardInterrupt) propagates instead of being swallowed.
            sleep(delay)

    print("ERROR GETTING CONTEST INFO FOR %s %s" % (contest, problem))
    return None

### scientific notation -> 0's

In [245]:
# turn scientific notation into 0s
def sci2num(txt):
    """Expand powers of ten written as HTML superscripts into plain digits.

    Every occurrence of ``10<sup class="upper-index">E</sup>`` where ``E`` is a
    non-negative decimal integer (one or more digits) is replaced by ``1``
    followed by ``E`` zeros, e.g. exponent 5 becomes ``100000``. Superscripts
    whose content is not purely digits (such as ``n`` or ``-9``) are left untouched.

    Parameters
    ----------
    txt : the HTML text to rewrite.

    Returns
    -------
    The text with all matching powers of ten expanded.
    """
    def _expand(match):
        # 10**E in decimal is a one followed by E zeros.
        return '1' + '0' * int(match.group(1))

    return re.sub(r'10<sup class="upper-index">(\d+)</sup>', _expand, txt)



### Ignore first tag

In [None]:
def ignore1sttag(bs4obj):
    """Return the text of a BeautifulSoup element, skipping children that carry attributes.

    Direct children that are plain ``NavigableString`` objects are kept as they
    are, child tags without any attributes contribute their ``.text``, and child
    tags that do have attributes are left out. The kept pieces are concatenated
    in document order.
    (Presumably the skipped tag is the section-title element at the start of the
    block - verify against the page markup.)
    """
    pieces = []
    for node in bs4obj.contents:
        if type(node) == bs4.NavigableString:
            pieces.append(node)
            continue
        if len(node.attrs) == 0:
            pieces.append(node.text)
    return ''.join(pieces)

### parsing

In [381]:
def get_problem_dict(contestid, problemid):
    """Download one problem page and split it into its parts.

    The page is fetched with ``getproblemtext``, powers of ten are expanded with
    ``sci2num``, and the statement is then taken apart by position: child 1 of
    the ``problem-statement`` div is the description, children 2 and 3 the
    input and output sections, child 4 must be the ``sample-tests`` block, and
    child 5 (when present) is the note.

    Returns a dict with the keys ``contestid``, ``problemid``, ``problemname``,
    ``timelimit`` (float), ``memlimit`` (int), ``txt_descr``, ``txt_input``,
    ``txt_output`` and ``txt_note`` (empty string when there is no note).

    Raises ``AssertionError`` when the description element carries attributes or
    when child 4 is not the sample-tests block.
    """
    page = sci2num(getproblemtext(contestid, problemid))
    soup = BeautifulSoup(page, 'html.parser')

    statement = soup.find("div", { "class" : "problem-statement" })
    problem_name = soup.find("div", { "class" : "title" }).string

    def _first_token(css_class):
        # The limit value is the first space-separated token of the div's last child.
        return soup.find("div", { "class" : css_class }).contents[-1].split(' ')[0]

    timelimit = float(_first_token("time-limit"))
    memlimit = int(_first_token("memory-limit"))

    parts = list(statement.children)

    description = parts[1]
    assert(len(description.attrs) == 0)
    txt_descr = description.text

    txt_input = ignore1sttag(parts[2])
    txt_output = ignore1sttag(parts[3])
    txt_note = ignore1sttag(parts[5]) if len(parts) >= 6 else ''

    # Sanity check on the positional layout assumed above.
    assert(parts[4].attrs['class'][0] == u'sample-tests')

    return dict(
        contestid=contestid,
        problemid=problemid,
        problemname=problem_name,
        timelimit=timelimit,
        memlimit=memlimit,
        txt_descr=txt_descr,
        txt_input=txt_input,
        txt_output=txt_output,
        txt_note=txt_note,
    )

## Calling functions

In [327]:
# Connect to the PostgreSQL database `codeforces` as user `Joy`. No host or
# password is passed, so the driver's defaults apply for both.
db = 'codeforces'
usr = 'Joy'
con = psycopg2.connect(database = db, user = usr)
# Cursor shared by the query cells below.
cur = con.cursor()

In [358]:
# Fetch every (contestid, problemid) pair from the problem_info table.
query = """
SELECT contestid, problemid FROM problem_info;
"""
# The same query is run twice: through the cursor, giving a list of tuples
# (cid_pid, iterated by the scraping loop) ...
cur.execute(query)
cid_pid = cur.fetchall()
# ... and through pandas, giving a DataFrame (df_cid_pid, used for .shape).
df_cid_pid = pd.read_sql(query, con)

### number of problems in database
* total: 4,248
* have some tags: 3,021

In [334]:
df_cid_pid.shape

(4248, 2)

In [342]:
# Roll back the current transaction before issuing a new query
# (presumably to clear a failed statement from an earlier cell - confirm).
con.rollback()

# Count the distinct (contestid, problemid) pairs that appear in the tags table.
query = """
SELECT COUNT(*) FROM (SELECT DISTINCT contestid, problemid FROM tags) AS temp;
"""
cur.execute(query)
# First column of the first (only) result row: the count itself.
cur.fetchall()[0][0]

3021L

### get question texts from CF

In [362]:
import os.path

In [None]:
# Scrape the statement of every (contestid, problemid) pair and append the
# parsed rows to a TSV file in small batches, so that an interrupted run loses
# at most one batch and can be resumed by raising `last_idx`.
txtlist = []
idx = 0
last_idx = 48  # number of leading pairs to skip (already scraped in an earlier run)

out_file = 'problem_texts.tsv'
flush_every = 5  # write to disk whenever idx is a multiple of this


def _append_rows(rows, path):
    """Append the buffered problem dicts to the TSV, writing the header only when creating the file."""
    is_new_file = not os.path.exists(path)
    pd.DataFrame.from_dict(rows).to_csv(
        path,
        sep='\t',
        header=is_new_file,
        index=False,
        encoding='utf-8',
        mode='w' if is_new_file else 'a',
    )
    print("------------------------- written to file %s" % path)


for cid, pid in cid_pid[last_idx:]:
    print(last_idx + idx)
    txtlist.append(get_problem_dict(cid, pid))

    if idx % flush_every == 0:
        _append_rows(txtlist, out_file)
        txtlist = []

    idx += 1

# Rows collected after the last periodic flush would otherwise never reach the file.
if txtlist:
    _append_rows(txtlist, out_file)
    txtlist = []