In [None]:
import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession
import json

## Verb data from cooljugator

In [None]:
def get_conjugations_translations(soup):
    """Collect conjugated forms and their translations, grouped by tense.

    Walks every ``div`` inside each ``conjugation-table collapsable`` block.
    A tense-title cell starts a new tense; each following four-column cell
    contributes the text of its ``meta-form`` and ``meta-translation`` child
    divs to that tense. Cells carrying the ``pronounColumn`` class are skipped.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find_all`` API.

    Returns:
        Tuple ``(conjugations, translations)``: two dicts keyed by tense name,
        each mapping to a list of stripped strings. Cells that appear before
        any tense title are collected under the key ``'None'``.
    """
    title_marker = "conjugation-cell conjugation-cell-four tense-title"
    cell_marker = "conjugation-cell conjugation-cell-four"

    conjugations = {}
    translations = {}

    for conj_table in soup.find_all('div', class_='conjugation-table collapsable'):
        # Fallback bucket for cells that precede the first tense title.
        current_tense = 'None'

        for div in conj_table.find_all('div'):
            # A div without a class attribute yields None; treat it as
            # "no classes" instead of failing on the membership test below.
            class_name = div.get('class') or []

            if 'pronounColumn' in class_name:
                continue

            joined = ' '.join(class_name)
            if title_marker in joined:
                current_tense = div.text.strip()
                conjugations[current_tense] = []
                translations[current_tense] = []
            elif cell_marker in joined:
                # setdefault keeps both dicts keyed identically even when no
                # title cell has been seen yet for this table.
                forms = conjugations.setdefault(current_tense, [])
                meanings = translations.setdefault(current_tense, [])

                meta_form = div.find('div', class_='meta-form')
                if meta_form is not None:
                    forms.append(meta_form.text.strip())
                meta_translation = div.find('div', class_='meta-translation')
                if meta_translation is not None:
                    meanings.append(meta_translation.text.strip())

    return conjugations, translations

def get_infinitive(soup):
    """Read the verb's infinitive and its translation from the page header.

    The infinitive comes from the ``data-default`` attribute of the
    ``<span id="mainform">`` element; the translation is the text inside the
    first pair of parentheses in that span's text.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        Tuple ``(infinitive, infinitive_translation)``. The translation is an
        empty string when the span text contains no ``(``.

    Raises:
        ValueError: if the page has no ``span`` with id ``mainform``.
    """
    # Look the element up once and fail with a clear message when it is absent.
    mainform = soup.find('span', id='mainform')
    if mainform is None:
        raise ValueError("page has no <span id='mainform'> element")

    infinitive = mainform.get('data-default')

    parts = mainform.text.strip().split('(')
    # No opening parenthesis means the page shows no translation.
    infinitive_translation = parts[1].split(')')[0] if len(parts) > 1 else ''
    return infinitive, infinitive_translation

def getAllBaseForms(conjugations, infinitive):
    """Return a dict mapping each tense to the base form of its conjugations.

    Args:
        conjugations: dict of tense name -> list of conjugated forms.
        infinitive: the verb's infinitive, forwarded to ``getBaseForm``.
    """
    return {
        tense: getBaseForm(forms, infinitive)
        for tense, forms in conjugations.items()
    }

## baseForm
def getBaseForm(conjugations, infinitive):
    """Derive the shared stem of one tense's conjugated forms.

    One word is taken from every form: the first word normally, or the second
    word when the first form starts with 'będę'. The result is the longest
    prefix common to all of those words, with two adjustments:

    * if the prefix equals ``infinitive`` it is returned unchanged;
    * otherwise, if the prefix is itself one of the compared words, its last
      character is dropped.

    Args:
        conjugations: list of conjugated forms (strings) for one tense.
        infinitive: the verb's infinitive.

    Returns:
        The stem as a string; ``''`` when ``conjugations`` is empty or the
        words share no common prefix.
    """
    if not conjugations:
        return ''

    takeFirstWord = conjugations[0].split(' ')[0] != 'będę'

    wordsToTest = []
    for conjugation in conjugations:
        parts = conjugation.split(' ')
        # Fall back to the only available word when a form has no second word.
        if takeFirstWord or len(parts) < 2:
            wordsToTest.append(parts[0])
        else:
            wordsToTest.append(parts[1])

    # Shrink the candidate until every word starts with it. The empty string
    # is a prefix of everything, so the inner loop always terminates.
    subWord = wordsToTest[0]
    for compareWord in wordsToTest[1:]:
        while not compareWord.startswith(subWord):
            subWord = subWord[:-1]

    if subWord == infinitive:
        return subWord
    if subWord in wordsToTest:
        return subWord[:-1]
    return subWord
    
def construct_json(infinitive, infinitive_translation, conjugations, translations, baseForms):
    """Assemble the JSON-ready record for a single verb.

    Args:
        infinitive: stored under ``'verb'``.
        infinitive_translation: stored under ``'translation'``.
        conjugations: dict of tense -> list of forms; its keys drive the output.
        translations: dict of tense -> list of translations.
        baseForms: dict of tense -> base form.

    Returns:
        dict with ``'verb'``, ``'translation'`` and ``'tenses'``; every tense
        maps to its ``'conjugations'``, ``'translations'`` and ``'baseForm'``.
    """
    tenses = {
        tense: {
            'conjugations': forms,
            'translations': translations[tense],
            'baseForm': baseForms[tense],
        }
        for tense, forms in conjugations.items()
    }
    return {
        'verb': infinitive,
        'translation': infinitive_translation,
        'tenses': tenses,
    }

In [None]:
# Scrape the conjugation page of every verb and collect one record per verb.
base_url = 'https://cooljugator.com/pl/'
headers = {"Accept-Language": "pl"}

verbs = ['wiedzieć', 'kopać', 'zjeść', 'iść', 'pisać', 'dawać', 'nieść', 'płakać',
        'płacić', 'ganić', 'wrócić', 'suszyć', 'czytać', 'mieć', 'padać', 'wołać', 'jeść', 'umieć']

data_output = []

# One session for the whole run: it is closed when the block exits instead of
# leaving one open session behind per verb.
with HTMLSession() as s:
    for verb in verbs:
        response = s.get(base_url+verb, headers=headers)
        # Stop on an HTTP error rather than trying to parse an error page.
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, features='html.parser')

        infinitive, infinitive_translation = get_infinitive(soup)
        conjugations, translations = get_conjugations_translations(soup)
        baseForms = getAllBaseForms(conjugations, infinitive)
        data_output.append(construct_json(infinitive, infinitive_translation, conjugations, translations, baseForms))

In [None]:
## TODO: 'być' breaks the script, need to find out why.
## Debug aid: the bare expression displays the current value of `verb`,
## i.e. the verb the scraping loop above reached last.
verb

In [None]:
# Save the scraped verb records as UTF-8 JSON; ensure_ascii=False keeps the
# Polish characters readable instead of escaping them.
with open('verbs.json', 'w', encoding='utf-8') as verbs_file:
    json.dump(data_output, fp=verbs_file, ensure_ascii=False)

## Nouns / Adjectives from online-polish-dictionary

In [None]:
def add_all_base_forms(grammar):
    """Attach a ``'baseForm'`` entry next to every ``'declensions'`` dict.

    ``grammar`` is modified in place. A top-level entry either holds
    ``'declensions'`` directly, or holds one nested dict per gender, each with
    its own ``'declensions'``.
    """
    for entry in grammar.values():
        # Flat entry -> handle the entry itself; otherwise handle each of its
        # per-gender children.
        groups = [entry] if 'declensions' in entry else entry.values()
        for group in groups:
            group['baseForm'] = get_base_form(group['declensions'])
                
def get_base_form(declensions):
    """Return the longest prefix shared by all declined forms.

    Args:
        declensions: dict mapping a case name to the declined word.

    Returns:
        The common prefix as a string; ``''`` when ``declensions`` is empty
        or the words have no leading characters in common.
    """
    words_to_test = list(declensions.values())
    if not words_to_test:
        return ''

    # Shrink the candidate until every word starts with it. The empty string
    # is a prefix of everything, so the inner loop always terminates.
    base = words_to_test[0]
    for word in words_to_test[1:]:
        while not word.startswith(base):
            base = base[:-1]
    return base

def get_noun_adj_grammar(soup, lookup_word, word_type):
    """Build the declension record for one dictionary page.

    Reads the translation heading, the grammar paragraph and the grammar
    table from ``soup`` and arranges the table cells into a nested dict keyed
    by count and, when the table has sub-header cells, by gender as well.
    ``add_all_base_forms`` is then run on that dict.

    Args:
        soup: Parsed page; used through the BeautifulSoup ``find`` API.
        lookup_word: The word that was looked up; stored in the result under
            the key named by ``word_type``.
        word_type: Key under which ``lookup_word`` is stored. The value
            ``'noun'`` additionally adds a ``'gender'`` entry to the result.

    Returns:
        dict with the keys ``word_type``, ``'translation'``, ``'grammar'``
        and, when ``word_type == 'noun'``, ``'gender'``.

    NOTE(review): none of the element lookups is checked for a missing
    element, so a page with a different layout will presumably fail with an
    attribute error - confirm against a real page.
    """
    data = {}
    
    # Find the relevant elements
    translation_div = soup.find('div', {'data-name': 'translation'})
    translation_text = translation_div.find('h3', class_='translation').text.strip()
    # Only the last space-separated token of the grammar paragraph is kept.
    gender_text = soup.find('p', class_='grammar').text.strip().split(' ')[-1]

    # Keep only the last space-separated token of the translation heading.
    # NOTE(review): a multi-word translation is cut down to its final word - confirm this is intended.
    english_translation = translation_text.split(' ')[-1]
    
    grammar_table = soup.find('table', class_='grammar')
    # Both dicts are keyed by table column index (starting at 1):
    # count[i] is the count label of column i, gender[i] its gender label (None for nouns).
    count = {}
    gender = {}

    # Table can have both Count & Gender (Singular + Masculine), or just Count (Singular)
    subheaders = grammar_table.findAll('th', {'class':'text-center subheader'})
    if len(subheaders) > 0:
        # adjective
        # Local `headers` holds the table header cells (not the notebook-level request headers).
        headers = grammar_table.findAll('th', {'class': None})
        headers[0] = 'CASE' # override empty header - represents the case; index 0 is never read below
        group_count = 1
        # Spread each sub-header over the columns it spans (its colspan), so
        # every column index maps to that sub-header's lowercased text.
        for subheader in subheaders:
            for i in range(int(subheader.get('colspan'))):
                count[group_count + i] = subheader.text.lower()
            group_count = group_count + int(subheader.get('colspan'))

        # Gender label = first space-separated word of the column header, lowercased.
        for i in range(1, len(headers)):
            gender[i] = headers[i].text.lower().split(' ')[0]
    else:
        # noun
        headers = grammar_table.findAll('th')
        headers[0] = 'CASE' # override empty header - represents the case; index 0 is never read below
        for i in range(1, len(headers)):
            count[i] = headers[i].text.lower()
            gender[i] = None

    # Create the empty skeleton: grammar[count][gender]['declensions'] when a
    # gender label exists, grammar[count]['declensions'] otherwise.
    grammar = {}
    for key in count:
        if gender[key]:
            if count[key] not in grammar:
                grammar[count[key]] = {}
            grammar[count[key]][gender[key]] = {'declensions' : {}}
        else:
            grammar[count[key]] = {'declensions' : {}}
            
    # Fill the skeleton from the body rows that carry the class 'clear'.
    for row in grammar_table.find('tbody').find_all('tr', class_='clear'):
        column_values = row.find_all('td')

        # Extract data from columns
        # The case name is the <strong> text of the first cell, lowercased.
        case = column_values[0].strong.text.strip().lower()

        # Remaining cells line up with the column indexes used in count/gender.
        for i in range(1, len(column_values)):
            if gender[i]:
                # adjective
                grammar[count[i]][gender[i]]['declensions'][case] = column_values[i].span.text.strip()
            else:
                # noun
                grammar[count[i]]['declensions'][case] = column_values[i].span.text.strip()
    
    # Adds a 'baseForm' entry next to every 'declensions' dict (mutates grammar in place).
    add_all_base_forms(grammar)
    # Assemble the json
    data[word_type] = lookup_word
    data['translation'] = english_translation
    if word_type == 'noun':
        data['gender'] = gender_text
    data['grammar'] = grammar
    return data

In [None]:
# Scrape the dictionary page of every noun and collect one record per noun.
base_url = 'https://online-polish-dictionary.com/word/'
headers = {"Accept-Language": "pl"}

nouns = ['książka', 'pies', 'kot', 'zamek', 'ulica', 'dziewczynka', 'kobieta', 'dziecko', 'chłopiec', 'mężczyzna',
        'adres', 'aktor', 'aktorką', 'badanie', 'bilet', 'centrum', 'chleb', 'chwila', 'chłopak', 'ciasteczko',
        'czas', 'człowiek', 'farmie', 'godzina', 'granica', 'głos', 'herbata', 'impreza', 'inżynier',
        'jabłko', 'kaczka', 'kalendarz', 'kawa', 'konie', 'kolacja', 'krab', 'kuchni', 'lew',
        'ludzie', 'mleko', 'naród', 'obiad', 'owoc', 'państwa', 'personel', 'pająk', 'pytanie', 'rolnik']

noun_data = []

# One session for the whole run: it is closed when the block exits instead of
# leaving one open session behind per word.
with HTMLSession() as s:
    for noun in nouns:
        response = s.get(base_url+noun, headers=headers)
        # Stop on an HTTP error rather than trying to parse an error page.
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, features='html.parser')
        noun_data.append(get_noun_adj_grammar(soup, noun, 'noun'))

In [None]:
# Save the scraped noun records as UTF-8 JSON; ensure_ascii=False keeps the
# Polish characters readable instead of escaping them.
with open('../public/nouns.json', 'w', encoding='utf-8') as nouns_file:
    json.dump(noun_data, fp=nouns_file, ensure_ascii=False)

In [None]:
# Scrape the dictionary page of every adjective and collect one record per word.
base_url = 'https://online-polish-dictionary.com/word/'
headers = {"Accept-Language": "pl"}

adjectives = ['dobry', 'ładny', 'miły', 'biedna', 'brudno', 'brzydkie', 'ciemny', 'droga', 'duże', 'mały',
              'nowe', 'nudna', 'stary', 'źle'
             ]

adjective_data = []

# One session for the whole run: it is closed when the block exits instead of
# leaving one open session behind per word.
with HTMLSession() as s:
    for adjective in adjectives:
        response = s.get(base_url+adjective, headers=headers)
        # Stop on an HTTP error rather than trying to parse an error page.
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, features='html.parser')
        adjective_data.append(get_noun_adj_grammar(soup, adjective, 'adjective'))

In [None]:
# Save the scraped adjective records as UTF-8 JSON; ensure_ascii=False keeps
# the Polish characters readable instead of escaping them.
with open('../public/adjectives.json', 'w', encoding='utf-8') as adjectives_file:
    json.dump(adjective_data, fp=adjectives_file, ensure_ascii=False)