In [None]:
import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession
import json

## Verb data from cooljugator

In [None]:
def get_conjugations_translations(soup):
    """Collect the conjugated forms and their translations for every tense.

    Walks every ``div`` nested in each ``conjugation-table collapsable``
    container. A tense-title cell starts a new tense; every following
    conjugation cell contributes the stripped text of its ``meta-form`` child
    to ``conjugations`` and of its ``meta-translation`` child to
    ``translations`` (each only when that child exists).

    Args:
        soup: Parsed page (a BeautifulSoup document, or any object exposing
            the same ``find_all`` / ``find`` / ``get`` / ``text`` interface).

    Returns:
        A ``(conjugations, translations)`` tuple of dicts, each mapping a
        tense title to a list of strings in page order. Cells that appear
        before any tense title are collected under the key ``'None'``.
    """
    cell_marker = 'conjugation-cell conjugation-cell-four'
    title_marker = cell_marker + ' tense-title'

    conjugations = {}
    translations = {}

    for conj_table in soup.find_all('div', class_='conjugation-table collapsable'):
        # Placeholder tense for cells that precede the first tense title.
        current_tense = 'None'

        for div in conj_table.find_all('div'):
            # A <div> without a class attribute yields None; treat that as
            # "no classes" so the membership test below cannot raise.
            class_name = div.get('class') or []

            # Pronoun cells carry no verb form.
            if 'pronounColumn' in class_name:
                continue

            joined_classes = ' '.join(class_name)
            if title_marker in joined_classes:
                current_tense = div.text.strip()
                conjugations[current_tense] = []
                translations[current_tense] = []
            elif cell_marker in joined_classes:
                # setdefault keeps a cell that shows up before any title from
                # raising KeyError on the placeholder tense.
                meta_form = div.find('div', class_='meta-form')
                if meta_form is not None:
                    conjugations.setdefault(current_tense, []).append(meta_form.text.strip())
                meta_translation = div.find('div', class_='meta-translation')
                if meta_translation is not None:
                    translations.setdefault(current_tense, []).append(meta_translation.text.strip())

    return conjugations, translations

def get_infinitive(soup):
    """Read the verb's infinitive and its translation from the page heading.

    The infinitive is the ``data-default`` attribute of ``<span id="mainform">``;
    the translation is the text between the first ``(`` and the following
    ``)`` in that span's visible text.

    Args:
        soup: Parsed page (BeautifulSoup document or compatible object).

    Returns:
        ``(infinitive, infinitive_translation)``. The translation is ``''``
        when the heading text contains no parenthesised part.

    Raises:
        ValueError: If the page has no ``<span id="mainform">`` element.
    """
    # Look the element up once instead of once per extracted value.
    main_form = soup.find('span', id='mainform')
    if main_form is None:
        raise ValueError("page has no <span id='mainform'>; cannot determine the infinitive")

    infinitive = main_form.get('data-default')

    # Without an opening parenthesis split() yields a single piece; fall back
    # to an empty translation instead of raising IndexError.
    pieces = main_form.text.strip().split('(')
    infinitive_translation = pieces[1].split(')')[0].strip() if len(pieces) > 1 else ''
    return infinitive, infinitive_translation

def getAllBaseForms(conjugations, infinitive):
    """Compute the base form of every tense.

    Args:
        conjugations: Dict mapping a tense name to its list of conjugated forms.
        infinitive: The verb's infinitive, forwarded to ``getBaseForm``.

    Returns:
        Dict with the same keys (same order) as ``conjugations``, where each
        value is ``getBaseForm(forms, infinitive)`` for that tense.
    """
    return {
        tense: getBaseForm(forms, infinitive)
        for tense, forms in conjugations.items()
    }

## baseForm
def getBaseForm(conjugations, infinitive):
    """Derive the stem shared by all conjugated forms of one tense.

    One word is taken from every form: normally the first word, but when the
    first form starts with the auxiliary ``'będę'`` the second word is used
    (falling back to the first word for single-word forms). The base form is
    the longest prefix common to all of those words, with one adjustment: if
    that prefix is itself one of the words (and is not the infinitive), its
    last character is dropped.

    Args:
        conjugations: List of conjugated forms (strings) for a single tense.
        infinitive: The verb's infinitive.

    Returns:
        The base form as a string; ``''`` when ``conjugations`` is empty or
        the words share no common prefix.
    """
    if not conjugations:
        return ''

    # Compound forms such as 'będę pisał' open with an auxiliary, so the stem
    # lives in the second word.
    skipAuxiliary = conjugations[0].split(' ')[0] == 'będę'

    wordsToTest = []
    for conjugation in conjugations:
        words = conjugation.split(' ')
        # Only skip the auxiliary when a second word actually exists; a
        # single-word form like 'będę' used to raise IndexError here.
        if skipAuxiliary and len(words) > 1:
            wordsToTest.append(words[1])
        else:
            wordsToTest.append(words[0])

    # Shrink the candidate until every word starts with it. The empty string
    # is a prefix of everything, so the loop always terminates and words with
    # nothing in common yield '' rather than a stray first letter.
    subWord = wordsToTest[0]
    for compareWord in wordsToTest[1:]:
        while not compareWord.startswith(subWord):
            subWord = subWord[:-1]

    if subWord == infinitive:
        return subWord
    if subWord in wordsToTest:
        # The prefix is a complete form; drop its final character.
        return subWord[:-1]
    return subWord
    
def construct_json(infinitive, infinitive_translation, conjugations, translations, baseForms):
    """Assemble the JSON-serialisable record for one verb.

    Args:
        infinitive: The verb's infinitive, stored under ``'verb'``.
        infinitive_translation: Its translation, stored under ``'translation'``.
        conjugations: Dict of tense name -> list of conjugated forms; its keys
            decide which tenses appear in the record.
        translations: Dict of tense name -> list of translated forms.
        baseForms: Dict of tense name -> base form.

    Returns:
        ``{'verb', 'translation', 'tenses'}`` where ``'tenses'`` maps every
        tense to ``{'conjugations', 'translations', 'baseForm'}``.
    """
    tenses = {
        tense: {
            'conjugations': forms,
            'translations': translations[tense],
            'baseForm': baseForms[tense],
        }
        for tense, forms in conjugations.items()
    }
    return {
        'verb': infinitive,
        'translation': infinitive_translation,
        'tenses': tenses,
    }

In [None]:
# Scrape the conjugation page of every verb in `verbs` and collect one JSON
# record per verb in `data_output`.
base_url = 'https://cooljugator.com/pl/'
headers = {"Accept-Language": "pl"}

verbs = ['wiedzieć', 'kopać', 'zjeść', 'iść', 'pisać', 'dawać', 'nieść', 'płakać',
        'płacić', 'ganić', 'wrócić', 'suszyć', 'czytać', 'mieć', 'padać', 'wołać', 'jeść', 'umieć']

data_output = []

# One session for the whole run: connections are reused and the session is
# closed when the block exits, instead of leaking a fresh session per verb.
with HTMLSession() as s:
    for verb in verbs:
        response = s.get(base_url+verb, headers=headers)
        # Stop on HTTP errors here rather than failing later while parsing an
        # error page; `verb` still names the offending entry afterwards.
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, features='html.parser')

        infinitive, infinitive_translation = get_infinitive(soup)
        conjugations, translations = get_conjugations_translations(soup)
        baseForms = getAllBaseForms(conjugations, infinitive)
        data_output.append(construct_json(infinitive, infinitive_translation, conjugations, translations, baseForms))

In [None]:
# Debug aid: show the loop variable left behind by the scraping cell. After an
# exception it names the verb that was being processed when the loop stopped.
# Known issue: 'być' breaks the script; the root cause still needs to be found.
verb

In [None]:
# Save every scraped verb record to disk as UTF-8 JSON; ensure_ascii=False
# keeps the Polish diacritics readable instead of \u-escaping them.
with open('verbs.json', mode='w', encoding='utf-8') as outfile:
    outfile.write(json.dumps(data_output, ensure_ascii=False))

## Nouns from online-polish-dictionary

In [22]:
def get_noun_grammar(soup):
    """Extract translation, gender and the declension table from a word page.

    Args:
        soup: Parsed dictionary page (BeautifulSoup document or compatible
            object).

    Returns:
        Dict with the keys ``'translation'``, ``'gender'`` and ``'grammar'``.
        ``'grammar'`` maps each case name (first table column) to
        ``{'singular': ..., 'plural': ...}`` taken from the second and third
        columns of every ``tr.clear`` row.

    Notes:
        Only the last space-separated word of the translation heading and the
        last ``', '``-separated part of the grammar paragraph are kept.
        Missing page elements are not handled and surface as attribute or
        index errors.
    """
    # Page-level facts: translation heading first, then the grammar paragraph.
    heading = soup.find('div', {'data-name': 'translation'}).find('h3', class_='translation')
    translation_text = heading.text.strip()
    gender_text = soup.find('p', class_='grammar').text.strip()

    # Declension table: one entry per case row.
    table_body = soup.find('table', class_='grammar').find('tbody')
    declension = {}
    for row in table_body.find_all('tr', class_='clear'):
        cells = row.find_all('td')
        case_name = cells[0].strong.text.strip()
        singular_form = cells[1].span.text.strip()
        plural_form = cells[2].span.text.strip()
        declension[case_name] = {'singular': singular_form, 'plural': plural_form}

    return {
        'translation': translation_text.rsplit(' ', 1)[-1],
        'gender': gender_text.rsplit(', ', 1)[-1],
        'grammar': declension,
    }

In [23]:
# Scrape the dictionary page of every noun in `nouns` and collect one record
# per noun in `noun_data`.
# NOTE: this rebinds `base_url` and `headers` from the verb-scraping cell.
base_url = 'https://online-polish-dictionary.com/word/'
headers = {"Accept-Language": "pl"}

nouns = ['książka', 'pies', 'kot']

noun_data = []

# One session for the whole run: connections are reused and the session is
# closed when the block exits, instead of leaking a fresh session per noun.
with HTMLSession() as s:
    for noun in nouns:
        response = s.get(base_url+noun, headers=headers)
        # Stop on HTTP errors here rather than failing later while parsing an
        # error page.
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, features='html.parser')
        noun_data.append(get_noun_grammar(soup))

In [24]:
# Display the records scraped by the previous cell for a visual check.
noun_data

[{'translation': 'book',
  'gender': 'Feminine',
  'grammar': {'Nominative': {'singular': 'książka', 'plural': 'książki'},
   'Genitive': {'singular': 'książki', 'plural': 'książek'},
   'Dative': {'singular': 'książce', 'plural': 'książkom'},
   'Accusative': {'singular': 'książkę', 'plural': 'książki'},
   'Instrumental': {'singular': 'książką', 'plural': 'książkami'},
   'Locative': {'singular': 'książce', 'plural': 'książkach'},
   'Vocative': {'singular': 'książko', 'plural': 'książki'}}},
 {'translation': 'dog',
  'gender': 'Masculine',
  'grammar': {'Nominative': {'singular': 'pies', 'plural': 'psy'},
   'Genitive': {'singular': 'psa', 'plural': 'psów'},
   'Dative': {'singular': 'psu', 'plural': 'psom'},
   'Accusative': {'singular': 'psa', 'plural': 'psy'},
   'Instrumental': {'singular': 'psem', 'plural': 'psami'},
   'Locative': {'singular': 'psie', 'plural': 'psach'},
   'Vocative': {'singular': 'psie!', 'plural': 'psy!'}}},
 {'translation': 'cat',
  'gender': 'Masculine',


In [25]:
# Experiment: run the noun scraper on an adjective ('dobry') to see what the
# same page layout yields. This overwrites `nouns` and `noun_data` from the
# earlier noun run.
# NOTE(review): in the recorded output for 'dobry' the 'plural' entries
# (dobra, dobrej, dobrą, ...) look like feminine singular forms, so the third
# table column is probably not the plural for adjectives - confirm before
# relying on this data.
base_url = 'https://online-polish-dictionary.com/word/'
headers = {"Accept-Language": "pl"}

nouns = ['dobry']

noun_data = []

# One session for the whole run: connections are reused and the session is
# closed when the block exits, instead of leaking a fresh session per word.
with HTMLSession() as s:
    for noun in nouns:
        response = s.get(base_url+noun, headers=headers)
        # Stop on HTTP errors here rather than failing later while parsing an
        # error page.
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, features='html.parser')
        noun_data.append(get_noun_grammar(soup))

In [26]:
# Display the record scraped for the adjective experiment in the previous cell.
noun_data

[{'translation': 'good',
  'gender': 'adjective',
  'grammar': {'Nominative': {'singular': 'dobry', 'plural': 'dobra'},
   'Genitive': {'singular': 'dobrego', 'plural': 'dobrej'},
   'Dative': {'singular': 'dobremu', 'plural': 'dobrej'},
   'Accusative': {'singular': 'dobrego', 'plural': 'dobrą'},
   'Instrumental': {'singular': 'dobrym', 'plural': 'dobrą'},
   'Locative': {'singular': 'dobrym', 'plural': 'dobrej'},
   'Vocative': {'singular': 'dobry', 'plural': 'dobra'}}}]