# Dataset creation

This notebook includes some basic examples of creating the word-to-syllables dataset.

In [3]:
import requests
from bs4 import BeautifulSoup
import re
import os
import time

## Getting syllables from "Ústav jazyka českého" website

[https://prirucka.ujc.cas.cz/](https://prirucka.ujc.cas.cz/?slovo=cht%C3%ADt)

Using address: prirucka.ujc.cas.cz/?slovo={word}

NOT WORKING:
Unfortunately, the server seems to detect web scraping and therefore does not allow the script to run repeatedly.

In [13]:
# Small hand-picked sample, handy for quick experiments. It is replaced by the
# full word list as soon as the file below is read.
words = ['mravenečník', 'který', 'mít', 'jako', 'moci', 'svůj', 'tento',
         'všechen', 'jeden', 'člověk', 'vypadat', 'pohled', 'dokonce', 'možnost']

input_file = os.path.join('..', 'dataset', 'syn2015_lemma_cba.txt')

# The word list contains Czech diacritics, so decode it explicitly instead of
# relying on the platform default encoding (assumes the file is UTF-8 encoded).
with open(input_file, encoding='utf-8') as f:
    words = [line.rstrip() for line in f]

print(f'{len(words)} words loaded from {input_file}')
# Slicing already clamps to the list length, so no explicit min() is needed.
print(f'for example: {words[:10]}')

38815 words loaded from ../dataset/syn2015_lemma_cba.txt
for example: ['být', 'a', 'se', 'v', 'na', 'ten', 's', 'že', 'on', 'z']


In [None]:
# Scrape the syllable division ("dělení") of every word from prirucka.ujc.cas.cz.
#
# Results are collected in three lists:
#   separated_words - (word, syllables) pairs that were parsed successfully
#   error_words     - words whose request failed or whose page could not be parsed
#   timeout_words   - words whose request timed out
# word = 'mravenečník'
separated_words = []
error_words = []
timeout_words = []

# Lower-case Czech letters and hyphens; everything after the first other
# character on the "dělení" line is ignored.
syllables_pattern = re.compile(r'[a-záéěíóúůýčďňřšťž\-]+')

for position, word in enumerate(words):
    # Wait between all requests, not only after successful ones, so that
    # failing words do not hit the server back-to-back.
    if position:
        time.sleep(10)

    print(word, end=' =>', flush=True)
    try:
        # Passing the word via `params` lets requests URL-encode it correctly.
        response = requests.get(
            'https://prirucka.ujc.cas.cz/',
            params={'slovo': word},
            timeout=10,
        )
    except requests.exceptions.Timeout:
        timeout_words.append(word)
        print()
        continue
    except requests.exceptions.RequestException:
        # Connection errors etc. must not abort the whole (very long) run.
        error_words.append(word)
        print()
        continue

    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    classes = soup.find_all("p", {"class": "polozky"})
    if not classes:
        error_words.append(word)
        print()
        continue
    syllables_line = classes[0].text
    match = syllables_pattern.match(syllables_line.replace('dělení: ', ''))
    if match is None:
        # The first "polozky" paragraph does not start with a syllable division.
        error_words.append(word)
        print()
        continue
    syllables = match.group(0)

    # TODO: write more complex checks (syllables.replace('-', '') == word etc.)

    print('\t', syllables)
    separated_words.append((word, syllables))

print(f'{len(separated_words)} words separated')
print(f'for example: {separated_words[:10]}')

print(f'{len(error_words)} words with errors')
print(f'for example: {error_words[:10]}')

print(f'{len(timeout_words)} words timed out')
print(f'for example: {timeout_words[:10]}')


## Getting syllables from the pronunciation dataset


In [161]:
import pandas as pd

csv_file = os.path.join('..', 'dataset', 'ssc_29-06-16', 'ssc_29-06-16.csv')

# Build the frame in a single chain: every step returns a new object, which
# avoids assigning into filtered slices (SettingWithCopyWarning) and keeps the
# cell idempotent when it is re-run.
df = (
    pd.read_csv(csv_file, sep=';')[['Ortho', 'Syllab']]
    .rename(columns={'Ortho': 'Word', 'Syllab': 'Pronounciation'})
    # all strings in df['Pronounciation'] to lowercase
    .assign(Pronounciation=lambda frame: frame['Pronounciation'].str.lower())
    # drop rows whose pronunciation contains a hyphen
    .loc[lambda frame: ~frame['Pronounciation'].str.contains('-', regex=False)]
    # drop rows whose Word contains a space
    .loc[lambda frame: ~frame['Word'].str.contains(' ', regex=False)]
    .assign(
        Word=lambda frame: frame['Word'].str.lower(),
        # filled in later by get_syllables; '' marks "not annotated yet"
        Syllables='',
    )
)
# df = df.head(20)  # uncomment to experiment on a small sample
df

Unnamed: 0,Word,Pronounciation,Syllables
0,a,a,
1,abatyše,a.ba.ti.še,
2,abbé,a.bē,
3,abdikace,ap.di.ka.tse,
4,abdikovat,ap.di.ko.vat,
...,...,...,...
49358,žvatlavost,žva.tla.vost,
49359,žvatlavý,žva.tla.vī,
49360,žvýkací,žvī.ka.tsī,
49361,žvýkačka,žvī.katš.ka,


In [182]:
# Character count of every entry in the Word column.
word_lengths = df['Word'].str.len()
df['Word_len'] = word_lengths

Unnamed: 0,Word,Pronounciation,Syllables,Word_len
2,abbé,a.bē,,4
172,achilles,a.xi.les,,8
170,achát,a.xāt,,5
171,achátový,a.xā.to.vī,,8
119,aerotaxi,a.e.ro.tak.si,,8
...,...,...,...,...
49191,žínka,žīn.ka,žín-ka,5
49192,žír,žīr,žír,3
49197,žírný,žīr.nī,žír-ný,5
49198,žít,žīt,žít,3


In [162]:
def pronounciation_to_syllables(pronounciation, word):
    """Project the syllable boundaries of a pronunciation onto the written word.

    Every '.' in ``pronounciation`` becomes a '-' in the result; every other
    character is replaced by the next not-yet-used character of ``word``.
    The two strings are therefore expected to line up one-to-one once the
    dots are removed (an IndexError is raised if ``word`` runs out early).
    """
    pieces = []
    position = 0  # index of the next unused character of `word`
    for symbol in pronounciation:
        if symbol != '.':
            pieces.append(word[position])
            position += 1
        else:
            pieces.append('-')
    return ''.join(pieces)

def pron_in_list(pron_list, pronounciation):
    """Find the first entry of ``pron_list`` that ``pronounciation`` starts with.

    Returns ``(True, len(entry))`` for the first matching entry, or
    ``(False, 0)`` when no entry is a prefix of ``pronounciation``.
    """
    first_match = next(
        (candidate for candidate in pron_list if pronounciation.startswith(candidate)),
        None,
    )
    if first_match is None:
        return False, 0
    return True, len(first_match)


def get_syllables(row):
    """Fill ``row['Syllables']`` with the hyphenated form of ``row['Word']``.

    The syllable boundaries are taken from ``row['Pronounciation']``, where
    syllables are separated by '.'.

    * If the pronunciation (without dots) has the same length as the word,
      the boundaries are copied over position by position.
    * Otherwise the word and the pronunciation are walked in parallel, using
      ``pronounciation_pairs`` to skip over pronunciations that are longer or
      different from the written letter. Progress and failures are printed.

    ``row['Syllables']`` is only set in the second case when the result,
    with hyphens removed, equals the original word. The (possibly modified)
    row is returned, as required by ``DataFrame.apply(..., axis=1)``.
    """
    pronounciation_len = len(row['Pronounciation'].replace('.', ''))
    if pronounciation_len == len(row['Word']):
        # Same number of characters: boundaries can be mapped one-to-one.
        row['Syllables'] = pronounciation_to_syllables(row['Pronounciation'], row['Word'])
    else:
        pronounciation = row['Pronounciation']
        word = row['Word']
        print(word, pronounciation)
        syllables = ''
        # Consume `word` letter by letter together with the matching part of
        # `pronounciation`; both strings shrink from the front.
        while len(word) > 0:
            if not pronounciation:
                # Pronunciation exhausted before the word; the final check
                # below will report the mismatch.
                break
            # Take one character from each string when the current letters are
            # equal, when the current pronunciation character is directly
            # followed by a '.', or when the next letters of both strings are equal.
            if (word[0] == pronounciation[0] or 
                (len(pronounciation) > 1 and pronounciation[1] == '.') or
                (len(word) > 1 and len(pronounciation) > 1 and word[1] == pronounciation[1])):
                syllables += word[0]
                word = word[1:]
                pronounciation = pronounciation[1:]
            elif word[0] in pronounciation_pairs.keys():
                # Values of pronounciation_pairs are either a single string or
                # a list of alternatives; normalise to a list.
                pron_list = [pronounciation_pairs[word[0]]] if isinstance(pronounciation_pairs[word[0]], str) else pronounciation_pairs[word[0]]

                is_pron_in_list, pron_len = pron_in_list(pron_list, pronounciation)
                if is_pron_in_list:
                    # One written letter stands for `pron_len` pronunciation characters.
                    syllables += word[0]
                    word = word[1:]
                    pronounciation = pronounciation[pron_len:]
                else:
                    print(f'\tBREAKING! word: {word}, syllables: {syllables}, pronounciation: {pronounciation}')
                    break
            else:
                print(f'\tNothing found. BREAKING! word: {word}, syllables: {syllables}, pronounciation: {pronounciation}')
                break

            # A leading '.' marks a syllable boundary: emit a hyphen and drop the dot.
            if pronounciation and pronounciation[0] == '.':
                syllables += '-'
                pronounciation = pronounciation[1:]
        # Accept the result only if it spells the original word exactly.
        if syllables.replace('-', '') == row['Word']:
            row['Syllables'] = syllables
            print(f'\t{syllables}')
        else:
            print(f'\tLengths do not match. Word: {row["Word"]}, syllables: {syllables}, pronounciation: {row["Pronounciation"]} ({row["Word"]})')

    return row

# Maps a letter of the written word to the pronunciation string(s) that may
# stand in its place in the transcription. A value is either a single string
# or a list of alternatives; get_syllables checks whether the remaining
# pronunciation starts with one of them (the first match in list order wins).
pronounciation_pairs = {
    'á': ['ā', 'jā'],
    'b': 'p',
    'c': 'ts',
    'č': 'tš',
    'e': ['je', 'ē'],
    'é': ['ē', 'jē'],
    'ě': 'je',
    'i': 'ī',
    'í': 'ī',
    'n': 'ň',
    'p': 'b',
    'o': 'jo',
    'ó': 'ō',
    'u': 'ju',
    'ů': 'ū',
    'ú': 'ū',
    'y': 'i',
    'ý': 'ī',
    'x': ['ks', 'k.s'],
    'z': 's',
    # Candidate pairs that are currently disabled:
    # 'ř': 'rž',
    # 'š': 'šč',
    # 'ž': 'zž',
    # 'ť': 'tť',
    # 'ď': 'dď',
}

# Only rows that have no syllables yet are processed; rows annotated earlier
# are carried over unchanged.
not_annotated = df['Syllables'] == ''
df_new = df[not_annotated].apply(get_syllables, axis=1)
df_old = df[~not_annotated]

# Recombine both parts and order them by syllables, then by word.
df = pd.concat([df_new, df_old]).sort_values(by=['Syllables', 'Word'])

df

abbé a.bē
	BREAKING! word: bé, syllables: a-b, pronounciation: ē
	Lengths do not match. Word: abbé, syllables: a-b, pronounciation: a.bē (abbé)
abdikace ap.di.ka.tse
	ab-di-ka-ce
abeceda a.be.tse.da
	a-be-ce-da
abecední a.be.tse.dňī
	a-be-ce-dní
abiturient a.bi.tu.ri.jent
	a-bi-tu-ri-ent
abiturientka a.bi.tu.ri.jen.tka
	a-bi-tu-ri-en-tka
abiturientský a.bi.tu.ri.jents.kī
	a-bi-tu-ri-ents-ký
absces aps.tses
	abs-ces
absence ap.sen.tse
	ab-sen-ce
absolutisticky ap.so.lu.tis.tits.ki
	ab-so-lu-tis-tic-ky
absolutistický ap.so.lu.tis.tits.kī
	ab-so-lu-tis-tic-ký
absolutorium ap.so.lu.to.ri.jum
	ab-so-lu-to-ri-um
absorpce ap.sorp.tse
	ab-sorp-ce
absorpční ap.sorp.tšňī
	ab-sorp-ční
abstence ap.sten.tse
	ab-sten-ce
abstinence ap.sti.nen.tse
	ab-sti-nen-ce
abstrakce aps.trak.tse
	abs-trak-ce
abstrakční aps.trak.tšňī
	abs-trak-ční
aceton a.tse.ton
	a-ce-ton
acetonový a.tse.to.no.vī
	a-ce-to-no-vý
acetylen a.tse.ti.lēn
	a-ce-ty-len
acetylenový a.tse.ti.le.no.vī
	a-ce-ty-le-no-vý
acylpyrin a.tsil.p

Unnamed: 0,Word,Pronounciation,Syllables
2,abbé,a.bē,
172,achilles,a.xi.les,
170,achát,a.xāt,
171,achátový,a.xā.to.vī,
119,aerotaxi,a.e.ro.tak.si,
...,...,...,...
49191,žínka,žīn.ka,žín-ka
49192,žír,žīr,žír
49197,žírný,žīr.nī,žír-ný
49198,žít,žīt,žít


In [163]:
# Rows whose Syllables column is still empty were not annotated.
annotated_len = df[df['Syllables'] != ''].shape[0]
len_all = df.shape[0]
# Guard against an empty frame so the percentage does not raise ZeroDivisionError.
annotated_pct = annotated_len / len_all * 100 if len_all else 0.0
print(f'Annotated {annotated_len} out of {len_all} words ({annotated_pct:.2f} %)')

Annotated 39339 ouf of 45097 words (87.23 %)


Save the annotated words to a `txt` file.

In [183]:
# Keep only the rows that received a syllable annotation and write their
# Syllables column, one value per line. With pandas' default header setting
# the column name "Syllables" is written as the first line of the file.
output_path = os.path.join('..', 'dataset', 'annotated.txt')
annotated = df.loc[df['Syllables'] != '']
annotated['Syllables'].to_csv(output_path, index=False)

Get statistics about the length of the words WITHOUT the syllable separators.

In [184]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns of df.
df.describe()

Unnamed: 0,Word_len
count,45097.0
mean,8.080959
std,2.380436
min,1.0
25%,6.0
50%,8.0
75%,10.0
max,23.0


### Example of how to normalize special Czech characters

In [103]:
import unicodedata

my_unicode = 'ē'
# NFD splits a character into its base letter and combining marks; encoding to
# ASCII with errors ignored then drops the marks and keeps the base letter.
decomposed = unicodedata.normalize('NFD', my_unicode)
ascii_bytes = decomposed.encode('ascii', 'ignore')
output = ascii_bytes.decode()
output

'e'

## Simple example of web scraping with Python's BeautifulSoup library
Original code available at [https://github.com/DIFS-Teaching/webscraping](https://github.com/DIFS-Teaching/webscraping/blob/main/python-beautiful-soup/bsoup.py)

In [10]:
from bs4 import BeautifulSoup
import requests

# Download the course list; a timeout keeps the cell from hanging forever and
# raise_for_status() turns HTTP error responses into an explicit exception.
response = requests.get("https://www.fit.vut.cz/study/courses/", timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")

# The courses are expected in the element with id="list".
course_table = soup.select_one("#list")
if course_table is None:
    raise ValueError('No element with id "list" found on the page')
rows = course_table.find_all("tr")
if not rows:
    raise ValueError('The element with id "list" contains no table rows')

# Get headers
print("".join(cell.text + ";" for cell in rows[0].find_all('th')))

# Get data
for row in rows[1:]:
    cells = row.find_all('td')
    # Every cell is followed by ';', including the last one.
    out = "".join(cell.text + ";" for cell in cells)
    print(out)

Title;Abbrv;Sem;Cred;Compl;Dept;
Mathematical Software;0MS;S;3;Cr;ÚM;
Calculus with MAPLE;0MV;S;2;Cr;ÚM;
Computer Physics I;T1F;S;3;ClCr;ÚFI;
Computer Physics II;T2F;W;3;ClCr;ÚFI;
General Algebra;SOA;S;5;Cr+Ex;ÚM;
History of Design 2 - winter;DEDE2-Z;W;3;Ex;KTDU;
Architecture of the 20th century;ACHE20;W;3;Ex;KTDU;
History of Design 1 - winter;DEDE1-Z;W;3;Ex;KTDU;
Copyright;AUP-L;S;1;Cr;KTDU;
History of Design 1 - summer;DEDE1-L;S;3;Ex;KTDU;
History of Design 2 - summer;DEDE2-L;S;3;Ex;KTDU;
Linear Algebra I;SLA;W;6;Cr+Ex;ÚM;
German 1;N1;W;2;Cr;ÚJ;
German 2;N2;S;3;Cr+Ex;ÚJ;
German 3;N3;W;2;Cr;ÚJ;
German 4;N4;S;3;Cr+Ex;ÚJ;
Russian 1;R1;W;2;Cr;ÚJ;
Russian 2;R2;S;3;Cr+Ex;ÚJ;
French 1;F1;W;2;Cr;ÚJ;
French 2;F2;S;3;Cr+Ex;ÚJ;
History and Philosophy of Technology;FIT;S;2;ClCr;ICV;
History and Philosophy of Technology;FIT;W;2;ClCr;ICV;
Fundamentals of Law;PRM;S;2;ClCr;ICV;
Fundamentals of Law;PRM;W;2;ClCr;ICV;
Rhetoric;RET;S;2;ClCr;ICV;
Rhetoric;RET;W;2;ClCr;ICV;
Porn studies;KPO-Z;W;3;Ex;ATD;
