In [2]:
import requests
from bs4 import BeautifulSoup


result = requests.get("https://en.wiktionary.org/wiki/Category:Norwegian_rhymes")
assert result.status_code==200  

In [3]:
src = result.content
document = BeautifulSoup(src, 'lxml')

In [4]:
links = [e for e in document.find_all("a") if e.get("title") and "Rhymes:Norwegian/" in e.get("title")]
len(links)

33

In [5]:
stem = "https://en.wiktionary.org"
docs = []

for e in links:
    result = requests.get(f"{stem}{e.get('href')}")
    assert result.status_code==200
    src = result.content
    document = BeautifulSoup(src, 'lxml')
    docs.append(document)

In [6]:
rhymes = {}

for d in docs:
    content = d.find(class_="mw-parser-output")
    pronunciation = content.find(class_="IPA")
    rhymes[pronunciation.text] = []
    words = content.find_all(class_="Latn")
    if words:
        words = [w.text for w in words if len(w.text) > 1]
        rhymes[pronunciation.text] += words
        continue
    
    lists = content.find_all("ul")    
    for e in lists:
        l = e.find_all("li")
        if l[0].get("class"):
            continue
        words = [x.text for x in l]
        rhymes[pronunciation.text] += words

In [7]:
rhymes

{'-eː': ['be',
  'ble',
  'bre',
  'det',
  'fe',
  'fred',
  'gre',
  'kje',
  'kle',
  'kne',
  'kre',
  'kve',
  'le',
  'med',
  'ne',
  'ned',
  'pre',
  're',
  'se',
  'skje',
  'spe',
  'ste',
  'sve',
  'te',
  'tre',
  've',
  'ved',
  'be',
  'ble',
  'bre',
  'de',
  'det',
  'fe',
  'fred',
  'gje',
  'gle',
  'kje',
  'kle',
  'kne',
  'kre',
  'kve',
  'le',
  'me',
  'med',
  'ne',
  'ned',
  'pre',
  're',
  'skje',
  'spe',
  'ste',
  'sve',
  'tre',
  've',
  'ved',
  'deg',
  'eg',
  'meg',
  'seg',
  'CD',
  'cd',
  'diskre',
  'diskré',
  'idet',
  'i det',
  'ide',
  'idé',
  'passe',
  'passé',
  'CD',
  'cd',
  'diskre',
  'diskré',
  'idet',
  'i det',
  'idé',
  'ide',
  'passé',
  'passe',
  'kabaret',
  'kabaret'],
 '-eːʈ': ['blasert',
  'bornert',
  'fingert',
  'forsert',
  'halvert',
  'kupert',
  'markert',
  'musert',
  'møblert',
  'plassert',
  'polert',
  'sjenert',
  'skolert',
  'studert',
  'abortert',
  'affektert',
  'alliert',
  'antikvert',
 

In [8]:
for e in rhymes.values():
    print(len(e))

81
140
310
156
1
12
3
15
42
2
9
42
16
28
40
6
28
84
2
87
1
37
22
27
2
119
150
15
2
4
2
6
11


In [9]:
sum(len(l) for l in rhymes.values())

1502

# Create rhyme pairs

In [10]:
import itertools

#rhyme_pairs = {"word_a" : [], "word_b" : []}

def get_mirrored_dupes(pairs):
    l = list(pairs)
    rem = set()

    for i,(a,b) in enumerate(l):
        if (b,a) in l[i:]:
            rem.add((b, a))
    return rem

pairs = set()

for w in rhymes.values():
    c = list(itertools.combinations(w, 2))
    if c:
        rem = get_mirrored_dupes(c)
        c = set(c) - rem
        pairs.update(c)

In [11]:
import pandas as pd

w_a, w_b = zip(*pairs)
rhyme_pairs = {"word_a": w_a, "word_b": w_b}

df = pd.DataFrame(rhyme_pairs)
df

Unnamed: 0,word_a,word_b
0,rutebil,ørepil
1,mobil,servicebil
2,jugendstil,varebil
3,onani,lureri
4,bifili,selleri
...,...,...
80358,kvartmil,konebil
80359,møllehjul,balansehjul
80360,sti,infami
80361,kjemi,levkemi


In [12]:
df.to_csv("wiktionary_rhyme_pairs.tsv", sep="\t", index=False)

In [26]:
for k, v in rhymes.items():
    print(k)
    print(v[0], len(v))

-eː
be 81
-eːʈ
blasert 140
-iː
bi 310
-iːl
bil 156
-iːn
parafin 1
-iːʈ
firt 12
-oːʈ
lårt 3
-oːɳ
gården 15
-uʈ
bort 42
-uɳ
ekorn 2
-uːʈ
dort 9
-uːɳ
forn 42
-yːʈ
dyrt 16
-æʈ
ert 28
-æːʈ
bært 40
-øːn
føn 6
-øːɳ
bjørn 28
-œn
brønn 84
-ɔtsk
skotsk 2
-ɔʈ
fort 87
-ɔɳ
morn 1
-ɛkst
rekst 37
-ʉʈ
furt 22
-ʉːʈ
burt 27
-ʉːɳ
turn 2
-ʉːɽ
bul 119
-ɪl
dill 150
-ʏsk
brysk 15
-ʏʈ
styrt 2
-ɑskt
dvaskt 4
-ɑtsk
hatsk 2
-ɑʃ
hasj 6
-ɑːʃ
sars 11


In [35]:
# we dont want to create negative pairs from almost-rhymes
almost_rhyme = [("-ɑʃ", "-ɑːʃ"), ("-æːʈ", "-æʈ"), ("-uʈ", "-uːʈ"), ("-ʉːʈ", "-ʉʈ")]
rhymes_list = []

for a, b in almost_rhyme:
    l = rhymes[a] + rhymes[b]
    rhymes_list.append(l)

almost_rhyme = [e for x in almost_rhyme for e in x]

for k, v in rhymes.items():
    if k in almost_rhyme:
        continue
    rhymes_list.append(v)
len(rhymes_list)

29

## Create negative pairs

In [32]:
neg_pairs = set()

for i, l1 in enumerate(rhymes_list):
    for w in l1:
        for l2 in rhymes_list[i+1:]:
            for w2 in l2:
                neg_pairs.add((w, w2))

In [34]:
len(neg_pairs)

877429

In [36]:
w_a, w_b = zip(*neg_pairs)
neg_rhyme_pairs = {"word_a": w_a, "word_b": w_b}

df = pd.DataFrame(neg_rhyme_pairs)
df

Unnamed: 0,word_a,word_b
0,idet,sennepsgul
1,kupert,paradisfugl
2,kombinert,lettekorn
3,ste,vestrøn
4,uttært,mort
...,...,...
877424,cøliaki,avløpsventil
877425,tært,hermefugl
877426,bordbønn,ulykkesfugl
877427,stjert,føn


In [37]:
df.to_csv("wiktionary_negative_rhyme_pairs.tsv", sep="\t", index=False)