In [169]:
import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession
import json

In [None]:
def get_conjugations_translations(soup):
    """Collect conjugated forms and their translations, grouped by tense.

    Walks every ``div`` with class ``conjugation-table collapsable`` in
    ``soup``. Inside a table, a ``tense-title`` cell starts a new tense and
    each following ``conjugation-cell conjugation-cell-four`` cell contributes
    the stripped text of its ``meta-form`` and ``meta-translation`` children
    to that tense. Cells carrying the ``pronounColumn`` class are ignored.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find_all`` API.

    Returns:
        A ``(conjugations, translations)`` tuple of dicts mapping each tense
        title to a list of strings in document order. A tense title that
        occurs again resets both of its lists.

    Raises:
        KeyError: If a cell with a ``meta-form`` or ``meta-translation``
            appears in a table before any tense title.
    """
    conj_tables = soup.find_all('div', class_='conjugation-table collapsable')

    conjugations = {}
    translations = {}

    for conj_table in conj_tables:
        # Placeholder until the first tense-title cell of this table is seen.
        current_tense = 'None'

        # Every div nested in the table is visited, not only direct children.
        for div in conj_table.find_all('div'):
            # ``get('class')`` yields None for a div without a class
            # attribute; treat that as "no classes" so the membership test
            # below cannot raise TypeError.
            class_names = div.get('class') or []

            if 'pronounColumn' in class_names:
                continue

            # Matching is done on the space-joined class list, so the
            # classes must appear adjacently and in this order.
            joined_classes = ' '.join(class_names)

            if "conjugation-cell conjugation-cell-four tense-title" in joined_classes:
                current_tense = div.text.strip()
                conjugations[current_tense] = []
                translations[current_tense] = []
            elif "conjugation-cell conjugation-cell-four" in joined_classes:
                meta_form = div.find('div', class_='meta-form')
                if meta_form:
                    conjugations[current_tense].append(meta_form.text.strip())
                meta_translation = div.find('div', class_='meta-translation')
                if meta_translation:
                    translations[current_tense].append(meta_translation.text.strip())

    return conjugations, translations

def get_infinitive(soup):
    """Extract the verb's infinitive and its translation from the page header.

    Reads the ``span`` with ``id='mainform'``. The infinitive is the span's
    ``data-default`` attribute; the translation is the part of the span's
    stripped text that follows the first ``(`` and stops at the next ``(``
    or ``)``.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        ``(infinitive, infinitive_translation)``. The translation is ``''``
        when the span text has no parenthesised part.

    Raises:
        ValueError: If the page has no ``span`` with ``id='mainform'``.
    """
    # Look the element up once and fail with a clear message if it is absent.
    main_form = soup.find('span', id='mainform')
    if main_form is None:
        raise ValueError("page has no <span id='mainform'> element")

    infinitive = main_form.get('data-default')

    # Without a '(' there is only one piece, i.e. no translation to extract.
    pieces = main_form.text.strip().split('(')
    if len(pieces) > 1:
        infinitive_translation = pieces[1].split(')')[0]
    else:
        infinitive_translation = ''
    return infinitive, infinitive_translation

def getAllBaseForms(conjugations, infinitive):
    """Compute the base form of every tense.

    Args:
        conjugations: Mapping of tense name to its list of conjugated forms.
        infinitive: The verb's infinitive, passed through to ``getBaseForm``.

    Returns:
        A dict with the same keys as ``conjugations`` whose values are the
        results of ``getBaseForm`` for that tense's forms.
    """
    return {
        tense_name: getBaseForm(conjugations[tense_name], infinitive)
        for tense_name in conjugations
    }

## baseForm
def _common_prefix(words):
    """Return the longest string that every entry of ``words`` starts with."""
    prefix_chars = []
    # zip stops at the shortest word, so the prefix never outgrows any entry.
    for column in zip(*words):
        if any(char != column[0] for char in column):
            break
        prefix_chars.append(column[0])
    return ''.join(prefix_chars)


def getBaseForm(conjugations, infinitive):
    """Derive the shared stem of one tense's conjugated forms.

    One word is taken from each form: the first word, or the second word when
    the first form starts with the auxiliary ``'będę'``. The base form is the
    longest prefix common to all of those words, adjusted as follows:

    * if the prefix equals ``infinitive`` it is returned unchanged;
    * if the prefix is itself one of the words, its last character is dropped;
    * otherwise the prefix is returned as is.

    Args:
        conjugations: List of conjugated forms (space-separated words).
        infinitive: The verb's infinitive.

    Returns:
        The base form, or ``''`` when ``conjugations`` is empty or the words
        share no common prefix.
    """
    if not conjugations:
        return ''

    # Only the first form decides whether the auxiliary is skipped.
    word_index = 1 if conjugations[0].split(' ')[0] == 'będę' else 0

    wordsToTest = []
    for conjugation in conjugations:
        parts = conjugation.split(' ')
        # A form with fewer words than expected falls back to its last word
        # instead of raising IndexError.
        wordsToTest.append(parts[min(word_index, len(parts) - 1)])

    # A real common prefix: empty when the words diverge at the first
    # character, rather than the first character of the first word.
    subWord = _common_prefix(wordsToTest)

    if subWord == infinitive:
        return subWord
    if subWord in wordsToTest:
        return subWord[:-1]
    return subWord
    
def construct_json(infinitive, infinitive_translation, conjugations, translations, baseForms):
    """Assemble the JSON-serialisable record for one verb.

    Args:
        infinitive: Stored under the ``'verb'`` key.
        infinitive_translation: Stored under the ``'translation'`` key.
        conjugations: Mapping of tense name to conjugated forms; its keys
            define which tenses appear in the record.
        translations: Mapping of tense name to translations.
        baseForms: Mapping of tense name to base form.

    Returns:
        A dict with keys ``'verb'``, ``'translation'`` and ``'tenses'``, where
        ``'tenses'`` maps each tense name to a dict with ``'conjugations'``,
        ``'translations'`` and ``'baseForm'``.

    Raises:
        KeyError: If a tense in ``conjugations`` is missing from
            ``translations`` or ``baseForms``.
    """
    tenses = {
        tense_name: {
            'conjugations': conjugations[tense_name],
            'translations': translations[tense_name],
            'baseForm': baseForms[tense_name],
        }
        for tense_name in conjugations
    }
    return {
        'verb': infinitive,
        'translation': infinitive_translation,
        'tenses': tenses,
    }

In [177]:
# Scrape every verb page and build one JSON-ready record per verb.
base_url = 'https://cooljugator.com/pl/'
headers = {"Accept-Language": "pl"}
# Upper bound (seconds) per HTTP request so a stalled connection cannot hang the cell.
request_timeout = 30

verbs = ['wiedzieć', 'kopać', 'zjeść', 'iść', 'pisać', 'dawać', 'nieść', 'płakać',
        'płacić', 'ganić', 'wrócić', 'suszyć', 'czytać', 'mieć', 'padać', 'wołać', 'jeść', 'umieć']
data_output = []

# One session for the whole run: connections are reused, and the context
# manager closes the session even if a verb fails part-way through.
with HTMLSession() as s:
    for verb in verbs:
        response = s.get(base_url + verb, headers=headers, timeout=request_timeout)
        # Stop on 4xx/5xx here instead of parsing an error page further down.
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, features='html.parser')

        infinitive, infinitive_translation = get_infinitive(soup)
        conjugations, translations = get_conjugations_translations(soup)
        baseForms = getAllBaseForms(conjugations, infinitive)
        data_output.append(construct_json(infinitive, infinitive_translation, conjugations, translations, baseForms))

In [176]:
## TODO: 'być' breaks the script; need to find out why.
## `verb` is the variable assigned by the scraping loop, so this displays the last verb that loop reached.
verb

'być'

In [181]:
# Serialise before opening the file: if json.dumps raises, an existing
# verbs.json is left untouched instead of being truncated to a partial dump.
# ensure_ascii=False keeps the Polish characters readable in the output.
serialized_verbs = json.dumps(data_output, ensure_ascii=False)
with open('verbs.json', 'w', encoding='utf-8') as outfile:
    outfile.write(serialized_verbs)