In [30]:
import warnings
warnings.filterwarnings('ignore')
import dotenv 
from openai import OpenAI
from tqdm import tqdm

# Load environment variables from .env and fail fast when nothing was loaded.
# The call is deliberately NOT wrapped in `assert`: assert statements are removed
# under `python -O`, which would silently skip loading the file altogether.
if not dotenv.load_dotenv('.env'):
    raise RuntimeError("dotenv.load_dotenv('.env') reported that nothing was loaded")

import requests
import re
from bs4 import BeautifulSoup
from joblib import Parallel, delayed, Memory
import backoff
from markdownify import markdownify as md
from IPython.display import display, Markdown, Latex
from tqdm import tqdm
import pandas as pd


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=60)
def fetch_sitemap(url):
    """Download ``url`` and return the response body as text.

    Any ``requests`` exception (connection error, timeout, or the ``HTTPError``
    raised for a 4xx/5xx status) triggers an exponential-backoff retry; once
    about 60 seconds of retrying have elapsed the last exception propagates.

    Args:
        url: Address of the sitemap to download.

    Returns:
        The decoded response body.

    Raises:
        requests.exceptions.RequestException: if the retries are exhausted.
    """
    # Without a timeout a stalled connection blocks forever, so the backoff
    # decorator would never see an exception to retry on.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text


def fetch_urls_from_sitemap_with_pattern(sitemap_url, pattern):
    """Return the ``<loc>`` URLs of a sitemap that match ``pattern``.

    Args:
        sitemap_url: Address of the XML sitemap, passed to ``fetch_sitemap``.
        pattern: Regular expression applied with ``re.match`` (anchored at the
            start of each URL).

    Returns:
        The matching URLs in document order, or an empty list when the sitemap
        body came back empty.
    """
    xml_text = fetch_sitemap(sitemap_url)
    if not xml_text:
        print(f"Failed to fetch sitemap from {sitemap_url}")
        return []

    matching = []
    for loc_tag in BeautifulSoup(xml_text, 'xml').find_all('loc'):
        candidate = loc_tag.text
        if re.match(pattern, candidate):
            matching.append(candidate)
    return matching
    
def extract_urls_from_category_page(category_url):
    """Fetch a category archive page and list the posts it links to.

    Args:
        category_url: Address of the category page.

    Returns:
        A ``(title, urls)`` tuple: the stripped text of the page's
        ``<h1 class="archive_title">`` (``''`` if that heading is absent) and
        the ``href`` of every ``<a class="entry_title">``. On a network error
        or a non-200 response the result is ``('', [])``.
    """
    try:
        # A timeout keeps one stalled connection from blocking its worker forever.
        response = requests.get(category_url, timeout=30)
    except requests.exceptions.RequestException:
        # Treat transport errors like a bad status code so that one unreachable
        # page does not abort the whole scrape.
        print(f"Failed to fetch category page from {category_url}")
        return '', []

    if response.status_code != 200:
        print(f"Failed to fetch category page from {category_url}")
        return '', []

    soup = BeautifulSoup(response.text, 'html.parser')

    # find() returns None when the heading is missing; fall back to an empty title.
    heading = soup.find('h1', class_='archive_title')
    title = heading.text.strip() if heading is not None else ''

    # Skip anchors without an href so callers never receive None as a URL.
    hrefs = (link.get('href') for link in soup.find_all('a', class_='entry_title'))
    urls = [href for href in hrefs if href]
    return title, urls

def extract_post_content(post_url):
    """Download a post and convert its body to Markdown plus DataFrames.

    Only direct ``<p>`` and ``<table>`` children of ``<article id="post_body">``
    are used; every other child (``div``, ``ul``, bare strings, ...) is skipped.

    Args:
        post_url: Address of the post.

    Returns:
        A dict with the keys
        ``title``   - stripped text of ``<h1 class="post_title">`` (``''`` if absent),
        ``text``    - Markdown of the title heading and the paragraphs only,
        ``tables``  - one ``pandas.DataFrame`` per parsed ``<table>``,
        ``content`` - Markdown of the title heading, paragraphs and tables.
        When the page cannot be fetched (network error or non-200 status) or
        has no post body, ``text``/``content`` are ``''`` and ``tables`` is empty.
    """
    from io import StringIO  # local import: only needed for pd.read_html below

    try:
        # A timeout keeps one stalled connection from blocking its worker forever.
        response = requests.get(post_url, timeout=30)
    except requests.exceptions.RequestException:
        print(f"Failed to fetch post page from {post_url}")
        return {'title': '', 'content': '', 'text': '', 'tables': []}

    if response.status_code != 200:
        print(f"Failed to fetch post page from {post_url}")
        return {'title': '', 'content': '', 'text': '', 'tables': []}

    soup = BeautifulSoup(response.text, 'html.parser')

    # find() returns None when an element is missing; guard before using it.
    heading = soup.find('h1', class_='post_title')
    title = heading.text.strip() if heading is not None else ''

    body = soup.find('article', id='post_body')
    if body is None:
        print(f"No post body found at {post_url}")
        return {'title': title, 'content': '', 'text': '', 'tables': []}

    header = f'# {title} \n\n'
    markdown_content = header
    text = header
    tables = []
    for element in body.children:
        if element.name == 'p':
            # Convert once and reuse for both outputs.
            paragraph = md(str(element)) + '\n\n'
            markdown_content += paragraph
            text += paragraph
        elif element.name == 'table':
            table_html = str(element)
            markdown_content += md(table_html) + '\n\n'
            try:
                # Wrap the markup in StringIO: passing literal HTML strings to
                # read_html is deprecated in recent pandas releases.
                frames = pd.read_html(StringIO(table_html), header=0)
            except ValueError:
                # read_html raises ValueError when it finds no parsable table;
                # the Markdown rendering above is still kept.
                frames = []
            if frames:
                tables.append(frames[0])  # one <table> element -> one DataFrame

    return {'title': title,
            'text': text,
            'tables': tables,
            'content': markdown_content.strip()}
    
# Instead of lru_cache, we have to use joblib.Memory, as per https://github.com/joblib/joblib/issues/226
# The two network-bound functions are rebound to cached wrappers that persist
# their results under ./.joblib_cache.
# NOTE(review): extract_post_content returns a placeholder dict (instead of
# raising) when a page cannot be fetched, so that placeholder is presumably
# cached as well and the URL will not be retried until the cache is cleared
# (e.g. mem.clear()) — confirm.
mem = Memory('.joblib_cache')
fetch_sitemap = mem.cache(fetch_sitemap, verbose=False)
extract_post_content = mem.cache(extract_post_content, verbose=False)

def main():
    """Scrape every category whose URL matches the 'finnish-*' pattern.

    The category sitemap is read first; each matching category page is then
    processed as one joblib job that downloads all posts linked from it.

    Returns:
        A list with one dict per category URL, shaped
        ``{'category': <category title>, 'posts': [<post dict>, ...]}``.
    """
    sitemap_url = "https://uusikielemme.fi/category-sitemap.xml"
    pattern = r'https://uusikielemme.fi/category/finnish-.*'  # Example regex pattern

    def scrape_category(category_url):
        """Collect the title of one category and the content of all its posts."""
        category_title, post_urls = extract_urls_from_category_page(category_url)
        scraped_posts = [extract_post_content(post_url) for post_url in post_urls]
        return dict(category=category_title, posts=scraped_posts)

    category_urls = fetch_urls_from_sitemap_with_pattern(sitemap_url, pattern)
    jobs = [delayed(scrape_category)(category_url) for category_url in category_urls]

    # Bottleneck isn't CPU but waiting for requests.get, so 64 workers should be reasonable.
    # tqdm wraps the job list itself, so the bar advances as jobs are handed to
    # joblib, not as they finish.
    return Parallel(64)(tqdm(jobs))

In [31]:
# Run the full scrape; `data` is the list of per-category dicts returned by main().
data = main()

100%|██████████| 37/37 [00:00<00:00, 70094.51it/s]


Failed to fetch category page from https://uusikielemme.fi/category/finnish-grammar/verbs
Failed to fetch post page from https://uusikielemme.fi/finnish-grammar/syntax/sentence-types/kysymyslause-making-questions-in-finnish-lausetyypit




Failed to fetch post page from https://uusikielemme.fi/finnish-grammar/finnish-cases/interesting-inflection/inflection-of-foreign-names-in-finnish
Failed to fetch post page from https://uusikielemme.fi/finnish-grammar/syntax/rections/noun-rections-noun-noun
Failed to fetch post page from https://uusikielemme.fi/finnish-vocabulary/word-types/adjectives/compound-adjectives-hyvannakoinen-mutta-pahanhajuinen




Failed to fetch post page from https://uusikielemme.fi/finnish-grammar/syntax/rections/so-called-partitive-verbs-not-always-partitive
Failed to fetch post page from https://uusikielemme.fi/finnish-culture/finnish-song-lyrics/lahtisitko-finnish-song-lyrics-analyzed
Failed to fetch post page from https://uusikielemme.fi/finnish-grammar/consonant-gradation/consonant-gradation-astevaihtelu-kpt-vaihtelu




Failed to fetch post page from https://uusikielemme.fi/finnish-vocabulary/conversation-questions/daily-routine-questions-finnish-conversation-prompts




Failed to fetch post page from https://uusikielemme.fi/finnish-vocabulary/conversation-questions/colors-conversation-questions
Failed to fetch post page from https://uusikielemme.fi/finnish-grammar/morphology/overview-articles-regarding-derivational-suffixes
Failed to fetch post page from https://uusikielemme.fi/finnish-grammar/morphology/homonyms-finnish-words-with-multiple-meanings
Failed to fetch post page from https://uusikielemme.fi/finnish-vocabulary/vocabulary-lists/elections-vaalit-finnish-vocabulary
Failed to fetch post page from https://uusikielemme.fi/finnish-grammar/verbs/verb-tenses-and-moods/passive-perfect-tense-on-tehty-has-been-doneFailed to fetch post page from https://uusikielemme.fi/finnish-vocabulary/conversation-questions/hobbies-harrastukset-finnish-conversation-questions

Failed to fetch post page from https://uusikielemme.fi/finnish-grammar/morphology/what-is-derivation-finnish-derivational-suffixesFailed to fetch post page from https://uusikielemme.fi/finnish-



In [4]:
# Spot-check the scrape: show the third category's title, then render the first
# 1090 characters of its first post as Markdown.
display(data[2]['category'])
display(Markdown(data[2]['posts'][0]['content'][:1090]))

'Authentic Language Use'

# Stretching – Yoga in Finnish – Jooga suomeksi 

In this article, you can find some vocabulary and phrases for yoga in Finnish. I’m basing this article on a video by [Ninan jooga](https://www.youtube.com/channel/UCQcjlVqbs8wmyOe1u-NZBRQ). I’ve picked a small part of [this video](https://youtu.be/-tc86UJlWR0?t=32) to analyze. I specifically picked the part of the video that deals with stretching left and right over your head. It’s a fairly simple thing to do, so perfect to start with. If you are interested in more of this type of content, let me know in the comments!



You can also check out the general article with [yoga vocabulary](https://uusikielemme.fi/finnish-vocabulary/vocabulary-lists/jooga-finnish-yoga-vocabulary-poses-and-verbs/) to learn more words!



The part of the video we will be looking at starts at 00:32 and ends at 3:06. It’s the part where Nina is stretching her back to the left and the right.





Below, you can find the transcription of the part of the video we will be looking at today. Underlined are the spoken language elements Nina uses in her vide

In [32]:
import pickle

# Snapshot the scraped data to disk. The context manager guarantees the file is
# flushed and closed; the bare open(...) used before left that to the garbage
# collector.
with open('uusikelemme-fi-4-3-2024.pickle', 'wb') as snapshot_file:
    pickle.dump(data, snapshot_file)

In [2]:
import pickle

# Restore the scraped data from the on-disk snapshot, closing the file afterwards.
# NOTE: pickle.load can execute arbitrary code embedded in the file — only load
# snapshots that this notebook wrote itself.
with open('uusikelemme-fi-4-3-2024.pickle', 'rb') as snapshot_file:
    data = pickle.load(snapshot_file)

In [33]:
import genanki
import random

# Anki identifies note types and decks by these IDs, so they must stay the same
# across runs; regenerating them on every execution would make each export show
# up as a brand-new deck/note type instead of updating the existing one.
# Generated once with random.randrange(1 << 30, 1 << 31) and hard-coded.
unique_model_id = 1483920571
unique_deck_id = 1760432988

In [34]:
# Two-field note type: the question is shown on the front; the back repeats the
# front and adds the answer below a horizontal rule.
model = genanki.Model(
    unique_model_id,
    'Simple model',
    fields=[dict(name='Question'), dict(name='Answer')],
    templates=[
        dict(
            name='Card 1',
            qfmt='{{Question}}',
            afmt='{{FrontSide}}<hr id="answer">{{Answer}}',
        ),
    ],
)

In [35]:
deck = genanki.Deck(unique_deck_id, 'Uusikielemme')

# Gather the Markdown content of every post across all categories.
# `data` is a list of category dicts, each holding a list of post dicts.
posts = []
for category in data:
    for post in category['posts']:
        if not post['content']:
            # Posts that could not be fetched are stored with empty content.
            continue
        posts.append(post['content'])

In [29]:
# Number of posts with non-empty content that were collected into `posts`.
len(posts)

244

In [36]:
# print('\n'.join(posts))
# print(posts[0])

In [29]:

# NOTE(review): `note` is not defined in any cell visible in this notebook, so on
# a fresh kernel (Restart & Run All) this line raises NameError. Presumably it was
# meant to be a genanki.Note built from `model` — confirm and define it first.
deck.add_note(note)
# Write the deck to an Anki package file in the working directory.
genanki.Package(deck).write_to_file('output.apkg')

In [30]:
# i = 0
# for cat in data:
#     for page in cat:
#         display(Markdown(page['content']))
#         i += 1
#         if i > 10: break

In [None]:
# Scratch cell: constructs a Note for `model` with an empty field list; the result
# is not assigned, so it is only displayed as the cell output.
# NOTE(review): `model` declares two fields (Question, Answer), so presumably two
# values are needed here before the note can be added to a deck — confirm.
genanki.Note(
    model = model,
    fields=[]
)