# The codingbat.com web page scraper

## Step 1. Obtain problem statements

In [None]:
import requests

In [None]:
# Site root; prepended to the href of every scraped problem link.
base_url = 'https://codingbat.com'

In [None]:
# Category index page to scrape. Alternative category, kept for reference:
# url = 'https://codingbat.com/java/Recursion-1'
url = 'https://codingbat.com/java/Recursion-2'
page = requests.get(url)

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Parse the HTML of the downloaded category page.
soup = BeautifulSoup(page.content, 'html.parser')

In [None]:
# Collect a (link text, absolute URL) pair for every anchor in the table
# inside the 'tabin' div. `find_all` is the current name of the deprecated
# BS3 alias `findAll`.
nodes = soup.find('div', class_='tabin').find('table').find_all('a')
links = [(node.text, f"{base_url}{node['href']}") for node in nodes]
links

In [None]:
from bs4.element import NavigableString

class Problem:
    """A single scraped problem together with the data collected for it.

    Attributes:
        url: Address of the problem page.
        name: Problem name as supplied by the caller.
        statement: Problem statement text.
        code: Text taken from the code form of the problem page.
        examples: Example strings that accompany the statement.
        tests: Test cases; a new empty list unless one is supplied.
    """

    def __init__(self, url, name, statement, code, examples, tests=None):
        self.url = url
        self.name = name
        self.statement = statement
        self.code = code
        # Build the default per instance: a `[]` default argument would be
        # one list object shared by every Problem created without `tests`.
        self.tests = [] if tests is None else tests
        self.examples = examples
        

def parse_problem(name, url):
    """Download a problem page and build a `Problem` from its contents.

    Args:
        name: Problem name to store on the result.
        url: Address of the problem page.

    Returns:
        A `Problem` holding the statement, the text of the code form and the
        example strings found after the statement. Its tests are left at
        their default.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    page = requests.get(url)
    # Fail here with a clear error instead of an AttributeError later while
    # parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    statement_node = soup.find('p', class_='max2')
    statement = statement_node.get_text('\n', strip=True)

    # Walk the parse tree in document order, starting two elements past the
    # statement paragraph, and keep every text node up to the next <p>.
    # `next_element` becomes None at the end of the document, so stop there
    # too instead of failing on `None.name`.
    examples = []
    node = statement_node.next_element.next_element
    while node is not None and node.name != 'p':
        if isinstance(node, NavigableString):
            examples.append(str(node))
        node = node.next_element

    code = soup.find('form', {'name': 'codeform'}).get_text('\n', strip=True)
    return Problem(url, name, statement, code, examples)
    

In [None]:
# Download and parse the page behind every collected (name, url) link.
problems = [parse_problem(title, address) for title, address in links]

In [None]:
# Sanity check: how many problems were parsed.
len(problems)

## Step 2. Obtain test cases

### Step 2.1. Log in to an account that has all the problems already solved

In [None]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# Chrome flags: ignore certificate errors, use an incognito session and run
# without a visible window.
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')

# NOTE(review): ChromeDriverManager().install() presumably returns the path
# of a chromedriver binary — confirm against webdriver_manager's docs.
# NOTE(review): passing that path positionally looks like a Selenium 3 style
# call; newer Selenium releases may require a Service object instead —
# confirm against the installed version.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

# Load the page that the credential and login cells below operate on.
url = 'https://codingbat.com/java'
driver.get(url)

Provide credentials

In [None]:
# Placeholders: fill these in before running the cells that type them into
# the page. Avoid committing real credentials with the notebook.
username = ''
password = ''

In [None]:
from selenium.webdriver.common.by import By

# Type the credentials into the two input fields addressed by these XPaths.
# `find_element(By.XPATH, ...)` is used because the `find_element_by_xpath`
# helper no longer exists in newer Selenium releases.
driver.find_element(By.XPATH, '/html/body/div[1]/table/tbody/tr[1]/td[2]/input').send_keys(username)
driver.find_element(By.XPATH, '/html/body/div[1]/table/tbody/tr[2]/td[2]/input').send_keys(password)

Log in

In [None]:
from selenium.webdriver.common.by import By

# Click the input in the third row of the same table to submit the form.
# `find_element(By.XPATH, ...)` is used because the `find_element_by_xpath`
# helper no longer exists in newer Selenium releases.
driver.find_element(By.XPATH, '/html/body/div[1]/table/tbody/tr[3]/td[2]/input').click()

### Step 2.2. Start scraping test cases

In [None]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import re

def fix_inputs(inputs):
    """Convert Java boolean literals in `inputs` to their Python spelling.

    Only the standalone words ``true`` and ``false`` are rewritten. Text
    inside single- or double-quoted literals, and longer identifiers that
    merely contain those words, are left untouched.

    Args:
        inputs: Source text such as a call expression or an expected value.

    Returns:
        The text with ``true``/``false`` replaced by ``True``/``False``.
    """
    pattern = (
        r'"(?:\\.|[^"\\])*"'      # double-quoted literal
        r"|'(?:\\.|[^'\\])*'"     # single-quoted literal
        r'|\b(?:true|false)\b'    # bare boolean literal
    )

    def convert(match):
        token = match.group(0)
        # Quoted literals are matched only so they can be passed through
        # unchanged; their contents must not be rewritten.
        if token[0] in '"\'':
            return token
        return token.capitalize()

    return re.sub(pattern, convert, inputs)

def resolve_tests(problem):
    """Submit the solution on the problem page and record its test cases.

    Uses the module-level `driver`: opens `problem.url`, clicks the button
    addressed by the first XPath, waits for the results element, then parses
    the rows of the 'tests' div. Each waiting step uses a 10 second limit.

    Args:
        problem: Object with a `url` attribute; its `tests` attribute is
            replaced with a list of ``(call, expected)`` string pairs, both
            passed through `fix_inputs`.

    Raises:
        ValueError: If a result row does not look like ``call → expected``.
    """
    # open problem page
    driver.get(problem.url)

    wait = WebDriverWait(driver, 10)

    # submit previous solution
    wait.until(EC.presence_of_element_located((By.XPATH, "/html/body/div[4]/div/div/table/tbody/tr/td[1]/p/button"))).click()

    # wait until results are ready
    wait.until(EC.presence_of_element_located((By.XPATH, "/html/body/div[4]/div/div/table/tbody/tr/td[2]/div/div")))

    # extract test cases
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # The first row and the last two rows are skipped (presumably a header
    # and summary rows — verify against the live page).
    rows = soup.find('div', id='tests').find_all('tr')[1:-2]

    tests = []
    for row in rows:
        text = row.find('td').text
        match = re.match(r'(.*) → (.*)', text)
        if match is None:
            # Name the offending row instead of failing with an opaque
            # "'NoneType' object is not subscriptable".
            raise ValueError(f'Unexpected test row on {problem.url}: {text!r}')
        tests.append((fix_inputs(match[1]), fix_inputs(match[2])))

    problem.tests = tests
    

In [None]:
# Visit every problem page in turn and attach its test cases to the problem.
for problem in problems:
    print(f'Resolving tests {problem.name}...')
    resolve_tests(problem)

## Step 3. Export problems

In [None]:
import re
import keyword

# Python reserved words, as a set for fast membership tests.
KEYWORDS = set(keyword.kwlist)

# Identifiers that get a fixed replacement name.
NAME_LOOKUP = {
    'str': 's',
}

# Java type name -> Python annotation.
TYPE_LOOKUP = {
    'String': 'str',
    'boolean': 'bool',
    'int[]': 'List[int]',
}


def fix_name(name):
    """Return a Python-safe version of the identifier `name`.

    A name found in `KEYWORDS` gets a trailing underscore. Any other name is
    looked up in `NAME_LOOKUP` and returned unchanged when it has no entry.
    """
    return f'{name}_' if name in KEYWORDS else NAME_LOOKUP.get(name, name)


def fix_type(type_name):
    """Translate a type name through `TYPE_LOOKUP`.

    Names without an entry in the table are returned unchanged.
    """
    try:
        return TYPE_LOOKUP[type_name]
    except KeyError:
        return type_name


def parse_args(args):
    """Convert a Java parameter list into Python ``name: type`` strings.

    Args:
        args: Text between the parentheses of a Java method signature, for
            example ``'String str, int n'``. May be empty or blank.

    Returns:
        A list with one ``name: type`` string per parameter, in order. The
        list is empty when there are no parameters.

    Raises:
        ValueError: If a non-blank parameter is not exactly ``type name``.
    """
    res = []
    for token in args.split(','):
        # A parameterless method yields one blank token; unpacking it below
        # would fail, so skip it.
        if not token.strip():
            continue
        type_, name = token.split()
        res.append(f'{fix_name(name)}: {fix_type(type_)}')

    return res
    

def java_to_python(code):
    """Turn the leading Java method signature of `code` into a ``def`` line.

    Args:
        code: Text that starts with ``public <type> <name>(<params>)``. The
            return type may carry array brackets, e.g. ``int[]``.

    Returns:
        A string of the form ``def name(arg: type, ...) -> type:``, with
        names and types passed through `fix_name` and `fix_type`.

    Raises:
        ValueError: If `code` does not start with such a signature.
    """
    # Raw string: the pattern contains backslash escapes that are invalid in
    # a normal (or f-) string literal. `\w` already covers digits and `_`.
    match = re.match(r'public\s+([\w\[\]]+)\s+(\w+)\s*\((.*?)\)', code)
    if match is None:
        raise ValueError(f'No Java method signature found in: {code[:80]!r}')

    return_type, name, args = fix_type(match[1]), fix_name(match[2]), match[3]
    args = parse_args(args)

    return f'def {name}({", ".join(args)}) -> {return_type}:'

In [None]:
# One level of indentation (four spaces) for the generated source text.
TAB = '    '

def break_lines(lines, max_len):
    """Greedily word-wrap every line to at most `max_len` characters.

    Words are never split: a word longer than `max_len` is placed on a line
    of its own. Empty input lines are kept as empty output lines, and runs
    of whitespace between words collapse to a single space.

    Args:
        lines: Iterable of strings to wrap.
        max_len: Maximum length of an output line.

    Returns:
        A list of wrapped lines.
    """
    res = []
    for line in lines:
        if not line:
            res.append('')
            continue

        words = []
        line_len = 0
        for word in line.split():
            # Length of the current output line if `word` were appended,
            # counting the separating space when it is not the first word.
            candidate = line_len + len(word) + (1 if words else 0)

            # Flush only when something has been collected; otherwise an
            # over-long first word would emit a spurious empty line.
            if words and candidate > max_len:
                res.append(' '.join(words))
                words = [word]
                line_len = len(word)
            else:
                words.append(word)
                line_len = candidate

        if words:
            res.append(' '.join(words))

    return res

def format_statement(statement, max_len=116):
    """Wrap `statement` with `break_lines` and prefix every line with `TAB`.

    The text is split on newlines, wrapped to `max_len` characters, and the
    resulting lines are joined back with newlines.
    """
    wrapped = break_lines(statement.split('\n'), max_len)
    indented = [f'{TAB}{text}' for text in wrapped]
    return '\n'.join(indented)
    

In [None]:
def export_problem(problem, filename):
    """Write `problem` to `filename` as a Python exercise module.

    The file contains a function stub whose docstring holds the statement
    and the examples, followed by a `unittest.TestCase` subclass with one
    test method per entry of `problem.tests`.

    Args:
        problem: Object providing `code`, `statement`, `examples` and
            `tests` (an iterable of ``(call, expected)`` string pairs).
        filename: Path of the file to create or overwrite.
    """
    declaration = java_to_python(problem.code)
    statement = format_statement('"""\n' + problem.statement + '\n\nExamples:\n' + '\n'.join(problem.examples) + '\n"""')

    lines = []
    lines.append('from unittest import TestCase')
    # The typing import is emitted only when the signature uses List[...].
    if re.search(r'List\[.*?\]', declaration):
        lines.append('from typing import List')
    lines.append('')
    lines.append('')

    lines.append(declaration)
    lines.append(statement)
    lines.append(TAB + 'pass')

    lines.append('')
    lines.append('')
    lines.append('class Test(TestCase):')
    for i, (code, value) in enumerate(problem.tests):
        if i > 0:
            lines.append('')
        lines.append(f'{TAB}def test{i + 1}(self):')
        # Boolean expectations map onto assertTrue / assertFalse.
        if value in ['True', 'False']:
            lines.append(f'{TAB}{TAB}self.assert{value}({code})')
        else:
            lines.append(f'{TAB}{TAB}self.assertEqual({value}, {code})')

    # Trailing empty entry so the file ends with a newline.
    lines.append('')

    # Scraped text may contain non-ASCII characters, so fix the encoding
    # instead of relying on the platform default.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))


# Export every problem to its own file; the two-digit index in the file name
# follows the order of `problems` and starts at 00.
for idx, problem in enumerate(problems):
    filename = f'R{idx:02d}_{problem.name}.py'
    print(f'Exporting {filename}...')
    export_problem(problem, filename)


In [None]:
# Shut down the WebDriver session now that scraping is finished.
driver.quit()