<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Scraping-linkedin-Posts" data-toc-modified-id="Scraping-linkedin-Posts-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Scraping linkedin Posts</a></span><ul class="toc-item"><li><span><a href="#Utils" data-toc-modified-id="Utils-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Utils</a></span></li><li><span><a href="#Login" data-toc-modified-id="Login-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Login</a></span></li><li><span><a href="#Load-posts-page-&amp;-scroll-to-bottom" data-toc-modified-id="Load-posts-page-&amp;-scroll-to-bottom-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Load posts page &amp; scroll to bottom</a></span></li><li><span><a href="#Retrieve-data-from-loaded-page" data-toc-modified-id="Retrieve-data-from-loaded-page-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Retrieve data from loaded page</a></span></li><li><span><a href="#Saving-blog-posts-to-files" data-toc-modified-id="Saving-blog-posts-to-files-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Saving blog posts to files</a></span></li></ul></li></ul></div>

# Scraping linkedin Posts

In [1]:
try:
    from selenium import webdriver
except:
    %pip install selenium
    from selenium import webdriver

try:
    import unidecode
except:
    %pip install unidecode
    import unidecode

try:
    import pandas as pd
except:
    %pip install pandas
    import pandas as pd

In [28]:
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup as bs
import time
from datetime import datetime, timedelta
import re as re
from importlib.metadata import version
from typing import Any, Dict, Optional
import os
import types
import logging
import tkinter as tk
import json
from utils import getNowAsString, writeDictToFile, readDictFromFile

In [3]:
# Base URL of the LinkedIn company page; '/posts/' is appended when scraping.
PAGE = 'https://www.linkedin.com/company/mgm-technology-partners-gmbh'
# Seconds to sleep after each scroll before the page height is measured again.
SCROLL_PAUSE_TIME = 1.5
# Each directory can be overridden through the environment variable of the same
# name; an unset or empty variable falls back to the default shown here.
# The directories are created immediately when this cell runs.
DATA_DIRECTORY = os.getenv('DATA_DIRECTORY') or 'data'
os.makedirs(DATA_DIRECTORY, exist_ok=True)

# Target directory for the generated markdown blog files.
BLOGS_DIRECTORY = os.getenv('BLOGS_DIRECTORY') or f"{DATA_DIRECTORY}/blogs"
os.makedirs(BLOGS_DIRECTORY, exist_ok=True)

# Scratch directory for intermediate scraping artifacts.
TMP_DIRECTORY = os.getenv('TMP_DIRECTORY') or f"{DATA_DIRECTORY}/tmp_linkedin"
os.makedirs(TMP_DIRECTORY, exist_ok=True)

# File (relative to the working directory) used to cache the prettified page soup.
FILENAME_SOUP = "linkedin_soup.html"
# strftime/strptime pattern used for all internal date strings.
INTERNAL_DATE_FORMAT = "%Y-%m-%d"
# Sentinel returned when no relative date can be found in a post.
NO_DATE = "__no_date__"

# JSON cache of the scraped raw post containers.
FILENAME_RAW_POSTS = f"{TMP_DIRECTORY}/raw_posts.json"

In [4]:
# Load the LinkedIn credentials from a local file, or prompt for them once and
# cache them. The on-disk format stays "username=<user>, password=<password>".
# SECURITY NOTE: the password is stored in plain text; keep this file out of
# version control.
from getpass import getpass

CREDENTIALS_FILE = "credentials.txt"
_CREDENTIALS_USER_PREFIX = "username="
_CREDENTIALS_PASSWORD_SEPARATOR = ", password="

try:
    with open(CREDENTIALS_FILE, "r") as f:
        contents = f.read().rstrip("\r\n")
    # Split on the first separator only, so passwords containing '=' or ','
    # survive the round trip.
    user_part, separator, password = contents.partition(_CREDENTIALS_PASSWORD_SEPARATOR)
    if not separator or not user_part.startswith(_CREDENTIALS_USER_PREFIX):
        raise ValueError(f"Unexpected format in {CREDENTIALS_FILE}")
    username = user_part[len(_CREDENTIALS_USER_PREFIX):]
except (OSError, ValueError):
    # File missing, unreadable or malformed: ask the user and (re)write the cache.
    username = input('Enter your linkedin username: ')
    password = getpass('Enter your linkedin password: ')  # do not echo the password
    with open(CREDENTIALS_FILE, "w") as f:
        f.write("username={}, password={}".format(username, password))

## Utils

In [5]:
def get_logger(name, log_level=logging.WARN):
    """Return the logger called ``name``, configured for notebook use.

    The level is (re)applied on every call and propagation to the root logger
    is switched off. A stream handler with a timestamped format is attached
    only the first time, so repeated calls never duplicate output.
    """
    named_logger = logging.getLogger(name)
    named_logger.propagate = False  # avoid double output through the root logger in Jupyter
    named_logger.setLevel(log_level)

    # An already configured logger is returned untouched.
    if named_logger.handlers:
        return named_logger

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    )
    named_logger.addHandler(stream_handler)
    return named_logger
    
def transformDate2String(dateToTransform: datetime) -> str:
    """Format a datetime using INTERNAL_DATE_FORMAT.

    Returns:
        The formatted date, or an empty string if the value cannot be
        formatted (e.g. it is None); the failure is logged as an error.
    """
    logger = get_logger(transformDate2String.__name__)
    try:
        dateStr = dateToTransform.strftime(INTERNAL_DATE_FORMAT)
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate.
        logger.error(f"Error transforming date: {dateToTransform}. Continuing with empty date string.")
        dateStr = ""
    return dateStr

def transformString2Date(stringToTransform: str) -> Optional[datetime]:
    """Parse a string in INTERNAL_DATE_FORMAT into a datetime.

    Returns:
        The parsed datetime, or None if the string cannot be parsed; the
        failure is logged as an error.
    """
    logger = get_logger(transformString2Date.__name__)
    try:
        dateObj = datetime.strptime(stringToTransform, INTERNAL_DATE_FORMAT)
    except (TypeError, ValueError):
        # strptime raises ValueError for a malformed string and TypeError for
        # a non-string argument. The previous code called an undefined `log`
        # helper here and therefore raised NameError instead of returning None.
        logger.error(f"Error transforming string to date: {stringToTransform}")
        dateObj = None
    return dateObj

def getNowAsString() -> str:
    """Return the current local date formatted by transformDate2String."""
    current_moment = datetime.now()
    return transformDate2String(current_moment)

def getMinDateAsString() -> str:
    """Return 1970-01-01 formatted by transformDate2String."""
    earliest_date = datetime(1970, 1, 1)
    return transformDate2String(earliest_date)

def stripBlanks(str):
    """Remove leading and trailing spaces and tabs; newlines are left alone."""
    # NOTE: the parameter shadows the built-in `str`; the name is kept so that
    # keyword callers keep working.
    trimmed = str.strip(" \t")
    return trimmed
import logging

## Login

In [6]:
def get_loggedin_browser():
    """Start an incognito Chrome session and submit the LinkedIn login form.

    Uses the module-level ``username`` and ``password``.

    Returns:
        The Selenium Chrome driver, right after the login form was submitted.
    """
    # Configure and launch Chrome
    options = webdriver.ChromeOptions()
    options.add_argument("--incognito")
    #options.add_argument("--headless=new")
    browser = webdriver.Chrome(options=options)

    # Open the login page
    browser.get('https://www.linkedin.com/login?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin')

    # Fill in the credentials
    username_field = browser.find_element(by=By.ID, value='username')
    username_field.send_keys(username)

    password_field = browser.find_element(by=By.ID, value='password')
    password_field.send_keys(password)

    # Submitting the password field submits the enclosing login form
    password_field.submit()
    return browser


## Load posts page & scroll to bottom

In [7]:
def browser_go_to_page(browser, max_pages=0):
    """Open the company's posts page and scroll down to load more posts.

    Scrolling stops when the document height no longer changes after a scroll,
    or, if ``max_pages`` is greater than 0, after that many scrolls.

    Args:
        browser: Selenium driver that is already logged in.
        max_pages: Maximum number of scrolls; 0 means no limit.
    """
    logger = get_logger(browser_go_to_page.__name__, logging.INFO)
    company_posts_page = PAGE + '/posts/'
    logger.info(f"{company_posts_page=}")
    browser.get(company_posts_page)

    height_script = "return document.body.scrollHeight"
    previous_height = browser.execute_script(height_script)
    pages_scrolled = 0
    finished = False

    while not finished:
        # Jump to the bottom so that the page loads the next batch of posts
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        pages_scrolled += 1
        logger.info(f"Scrolling page {pages_scrolled}")

        # Give the page time to load the new content
        time.sleep(SCROLL_PAUSE_TIME)

        current_height = browser.execute_script(height_script)
        if current_height == previous_height:
            # Nothing new was loaded: the bottom has been reached
            finished = True
        else:
            previous_height = current_height
            finished = max_pages > 0 and pages_scrolled == max_pages

    return

def get_page_source(browser, max_pages=0):
    """Scroll through the posts page and return the resulting page source."""
    # The logger is not used below; the call is kept so the named logger is
    # still created and configured as before.
    get_logger(get_page_source.__name__, logging.INFO)
    browser_go_to_page(browser, max_pages)
    return browser.page_source

In [8]:
def get_linkedin_browser(max_pages=0):
    """Log in, load and scroll the posts page, and return the browser."""
    logged_in_browser = get_loggedin_browser()
    browser_go_to_page(logged_in_browser, max_pages=max_pages)
    return logged_in_browser

## Retrieve data from loaded page

In [9]:
def retrieve_container_elements(max_pages):
    """Scrape the posts page and return the Selenium elements of real posts.

    Elements with class "occludable-update" are collected and then restricted
    to those containing an "update-components-actor" element.

    Args:
        max_pages: Maximum number of scrolls (0 means no limit).

    Returns:
        Tuple ``(container_elements, browser)``. The browser stays open because
        the elements are live references into its page.
    """
    # Fixed: the logger name used to reference an undefined function
    # (`get_container_elements`), which raised NameError on every call.
    logger = get_logger(retrieve_container_elements.__name__, logging.INFO)
    browser = get_linkedin_browser(max_pages=max_pages)
    container_elements = browser.find_elements(By.CLASS_NAME, "occludable-update")
    logger.info(f"No of container elements before filter: {len(container_elements)}")
    container_elements = [
        element
        for element in container_elements
        if len(element.find_elements(By.CLASS_NAME, "update-components-actor")) > 0
    ]
    logger.info(f"No of container elements after filter: {len(container_elements)}")
    return container_elements, browser

In [29]:
def is_element_in_viewport(driver, element):
    """Ask the browser whether ``element`` lies completely inside the viewport.

    Returns whatever the driver reports for the JavaScript check below, which
    compares the element's bounding rectangle with the window size.
    """
    viewport_check_js = """
        var elem = arguments[0];
        var rect = elem.getBoundingClientRect();
        return (
            rect.top >= 0 &&
            rect.left >= 0 &&
            rect.bottom <= (window.innerHeight || document.documentElement.clientHeight) &&
            rect.right <= (window.innerWidth || document.documentElement.clientWidth)
        );
    """
    return driver.execute_script(viewport_check_js, element)

def get_post_url(browser):
    """Click the open "Copy link to post" menu entry and read the URL from the clipboard.

    Expects exactly one element with the text 'Copy link to post' on the page.

    Returns:
        The clipboard content after the click, or None if the entry is missing
        or ambiguous, or if the click or the clipboard read fails.
    """
    logger = get_logger(get_post_url.__name__, logging.WARN)
    elements = browser.find_elements(By.XPATH, "//*[text()='Copy link to post']")
    if len(elements) != 1:
        logger.warning(f"Number of list of elements that should give me the URL of the blogpost: {len(elements)}")
        return None

    root = None
    try:
        elements[0].click()
        # A Tk root is only needed for clipboard access; hide its window.
        root = tk.Tk()
        root.withdraw()
        blog_post_url = root.clipboard_get()
        logger.info(f"URL of blog post: {blog_post_url}")
        return blog_post_url
    except Exception as e:
        logger.warning(f"Could not extract blog post url, retrurning None. Error: {e}")
        return None
    finally:
        # Previously a new Tk root was created per post and never destroyed.
        if root is not None:
            root.destroy()

def extract_blog_post_url_from_container_element(browser, container_element):
    """Open a post's control menu and obtain the post URL via get_post_url.

    Args:
        browser: Selenium driver showing the posts page.
        container_element: Selenium element of a single post.

    Returns:
        The post URL, or None if the container does not have exactly one menu
        trigger button, the button is not displayed, or the URL could not be
        read.
    """
    logger = get_logger(extract_blog_post_url_from_container_element.__name__, logging.INFO)
    buttons = container_element.find_elements(By.CLASS_NAME, 'feed-shared-control-menu__trigger')
    if len(buttons) != 1:
        logger.info(f"No of buttons found in container: {len(buttons)}. Cannot process this container.")
        return None

    button = buttons[0]
    actions = ActionChains(browser)
    # ESC before and after each step, presumably to dismiss any menu that is still open
    actions.send_keys(Keys.ESCAPE).perform()
    browser.execute_script('arguments[0].scrollIntoView({ behavior: "smooth", block: "center", inline: "nearest" });', button)

    if not button.is_displayed():
        # Logger.warn is a deprecated alias; use warning()
        logger.warning("Button not displayed, cannot process container")
        return None

    actions.send_keys(Keys.ESCAPE).perform()
    time.sleep(1)  # pause before clicking (the scroll above is animated)
    button.click()
    time.sleep(5)  # pause so the menu entries exist before they are searched
    url = get_post_url(browser)
    actions.send_keys(Keys.ESCAPE).perform()
    return url
    
def write_blog_containers_to_file(blogs):
    """Serialize the scraped blog containers to FILENAME_RAW_POSTS.

    Each blog's "soup" entry is replaced by its prettified HTML string in the
    written data. The input dicts themselves are left untouched.

    Args:
        blogs: List of blog dicts whose "soup" value offers ``prettify()``.

    Write errors are logged as warnings and not raised.
    """
    logger = get_logger(write_blog_containers_to_file.__name__, logging.INFO)
    # Shallow-copy every blog. The previous code aliased the dict and so
    # replaced the caller's soup object with a string as a side effect.
    blogs_to_save = [{**blog, "soup": blog["soup"].prettify()} for blog in blogs]
    try:
        writeDictToFile(dictionary=blogs_to_save, fullFilename=FILENAME_RAW_POSTS)
        logger.info(f"Wrote {len(blogs)} blog containers to file {FILENAME_RAW_POSTS}.")
    except Exception as e:
        logger.warning(f"could not write {len(blogs)} blog containers to file {FILENAME_RAW_POSTS}: {e}")
    return


def read_blog_containers_from_file():
    """Load the cached blog containers from FILENAME_RAW_POSTS.

    The stored "soup" strings are parsed back into BeautifulSoup objects.

    Returns:
        The list of blog dicts, or None if the file could not be read (a
        warning is logged in that case).
    """
    logger = get_logger(read_blog_containers_from_file.__name__, logging.INFO)
    try:
        blogs = readDictFromFile(fullFilename=FILENAME_RAW_POSTS)
    except Exception:
        # Logger.warn is a deprecated alias; use warning()
        logger.warning(f"Could not read blog containers from file {FILENAME_RAW_POSTS}. Returning None.")
        return None
    for blog in blogs:
        blog["soup"] = bs(blog["soup"], "html.parser")  # convert string to BeautifulSoup object
    logger.info(f"Read {len(blogs)} blog containersc from file {FILENAME_RAW_POSTS}.")
    return blogs

def extract_blogs_from_container_elements(browser, container_elements):
    """Turn live post elements into blog dicts and cache them on disk.

    Each dict holds the post "url", the raw "source" HTML, the parsed "soup"
    and the "scrape_date".

    Returns:
        The list of blog dicts, after passing it to write_blog_containers_to_file.
    """
    logger = get_logger(extract_blogs_from_container_elements.__name__, logging.INFO)

    def _element_to_blog(element):
        # The URL is fetched first because it needs UI interaction with the live element
        post_url = extract_blog_post_url_from_container_element(browser, element)
        outer_html = element.get_attribute('outerHTML')
        parsed = bs(outer_html.encode("utf-8"), "html")
        return {
            "url": post_url,
            "source": outer_html,
            "soup": parsed,
            "scrape_date": getNowAsString()
        }

    blogs = [_element_to_blog(element) for element in container_elements]

    logger.info(f"No of extracted blogs: {len(blogs)}")
    write_blog_containers_to_file(blogs)
    return blogs


In [30]:
def get_blog_containers(force_retrieval=False, max_pages=0):
    """Return the blog containers, from the file cache or by scraping LinkedIn.

    Args:
        force_retrieval: If True, always scrape and ignore the cache.
        max_pages: Maximum number of scrolls while scraping (0 means no limit).

    Returns:
        List of blog dicts as produced by extract_blogs_from_container_elements
        or read_blog_containers_from_file.
    """
    logger = get_logger(get_blog_containers.__name__, logging.INFO)

    if not force_retrieval:
        try:
            blog_containers = read_blog_containers_from_file()
        except Exception as e:
            logger.warning(f"Reading cached blog containers failed: {e}")
            blog_containers = None
        # The reader signals failure by returning None (it does not raise),
        # so None has to trigger the scrape fallback as well.
        if blog_containers is not None:
            return blog_containers
        logger.info("No cached blog containers available, retrieving them from LinkedIn.")
    else:
        logger.info(f"Retrieving blog containers: {force_retrieval=} {max_pages=}")

    container_elements, browser = retrieve_container_elements(max_pages)
    try:
        return extract_blogs_from_container_elements(browser, container_elements)
    finally:
        # The extracted blogs hold only strings and parsed soup, so the
        # browser can be shut down once extraction is finished.
        browser.quit()

# With force_retrieval=False the containers are read from the cache file;
# max_pages is only used if a scrape is actually performed.
blog_container = get_blog_containers(force_retrieval=False, max_pages=3)

  logger.warn(f"Could not read blog containers from file {FILENAME_RAW_POSTS}. Returning None.")


In [12]:
# TODO Delete these functions

def get_linkedin_soup_from_website(max_pages=0):
    """Scrape the posts page, cache its prettified HTML and return the soup.

    Returns:
        Tuple ``(linkedin_soup, browser)``; the browser is left open.
    """
    logger = get_logger(get_linkedin_soup_from_website.__name__, logging.INFO)
    browser = get_loggedin_browser()
    company_page = get_page_source(browser, max_pages)
    linkedin_soup = bs(company_page.encode("utf-8"), "html")
    # The context manager closes the file even if writing fails
    with open(FILENAME_SOUP, "w+") as f:
        f.write(linkedin_soup.prettify())
    logger.info("Scraped soup from website")
    return linkedin_soup, browser

def get_linkedin_soup_from_file():
    """Read the cached page HTML from FILENAME_SOUP and parse it.

    Raises:
        OSError: If the cache file is missing or unreadable.
    """
    logger = get_logger(get_linkedin_soup_from_file.__name__, logging.INFO)
    # Read-only mode is enough, and the context manager closes the file on errors too
    with open(FILENAME_SOUP, "r") as f:
        linkedin_html = f.read()
    linkedin_soup = bs(linkedin_html, "html.parser")  # convert string to BeautifulSoup object
    logger.info("Read soup from file")
    return linkedin_soup

def get_linkedin_soup(force_retrieval, max_pages):
    """Return the page soup, from the file cache or by scraping.

    Args:
        force_retrieval: If True, always scrape the website.
        max_pages: Maximum number of scrolls while scraping (0 means no limit).

    Returns:
        Tuple ``(linkedin_soup, browser)``; ``browser`` is None when the soup
        came from the cache file.
    """
    logger = get_logger(get_linkedin_soup.__name__, logging.INFO)
    browser = None
    if force_retrieval:
        logger.info(f"Retrieving linkedin soup: {force_retrieval=} {max_pages=}")
        linkedin_soup, browser = get_linkedin_soup_from_website(max_pages=max_pages)
        return linkedin_soup, browser

    try:
        linkedin_soup = get_linkedin_soup_from_file()
    except Exception:
        # Fixed: the fallback called a misspelled, undefined function and did
        # not unpack the (soup, browser) tuple it returns.
        linkedin_soup, browser = get_linkedin_soup_from_website(max_pages=max_pages)
    return linkedin_soup, browser
    
#linkedin_soup = get_linkedin_soup(force_retrieval=True)

In [13]:
# TODO Delete these functions

def get_containers(force_retrieval=False, max_pages=0):
    """Return the post containers found in the page soup, plus the browser.

    Containers are the elements with class "occludable-update" that also
    contain an "update-components-actor" element. A few of them are written to
    TMP_DIRECTORY for inspection.

    Returns:
        Tuple ``(containers, browser)``; ``browser`` is None when the soup was
        read from the cache file.
    """
    logger = get_logger(get_containers.__name__, logging.INFO)

    linkedin_soup, browser = get_linkedin_soup(force_retrieval=force_retrieval, max_pages=max_pages)
    containers = linkedin_soup.find_all(class_="occludable-update")

    # A container must also contain "update-components-actor"; this filters out ads
    containers = [container for container in containers if container.find(class_="update-components-actor")]
    logger.info(f"Number of container: {len(containers)}")

    # URL extraction is deliberately not attempted here. It used to call an
    # undefined function (raising NameError whenever a browser was present)
    # and discarded the result. The URL extractor in this notebook
    # (extract_blog_post_url_from_container_element) needs live Selenium
    # elements, not the parsed soup containers handled here.

    # Write a few containers to disk for analysis
    containers_to_write = [0, 1, 2, 3, 5]
    for container_no in containers_to_write:
        if len(containers) > container_no:
            filename = f"{TMP_DIRECTORY}/container_{container_no}.html"
            logger.info(f"Writing container {container_no} to {filename}")
            with open(filename, "w+") as f:
                f.write(containers[container_no].prettify())

    return containers, browser
    
#containers, browser = get_containers(force_retrieval=True, max_pages=3)

In [14]:
def extract_date_string_from_soup(soup: bs):
    """Find the relative post date (e.g. "1yr •") in a post's soup.

    Searches the prettified soup for one or two digits followed by a unit
    (h, d, w, mo, yr), a whitespace character and a bullet.

    Returns:
        The matched text, or NO_DATE if nothing matches (logged as an error).
    """
    logger = get_logger(extract_date_string_from_soup.__name__, log_level=logging.WARN)

    # One search is enough; the pattern used to be compiled twice, with the
    # first result unused.
    m = re.search(r'\d{1,2}(h|d|w|mo|yr)\s•', soup.prettify())
    if m:
        dateHumanReadable = m.group()
        logger.info(f"Match found: {dateHumanReadable}")
        return dateHumanReadable

    logger.error(f"Could not extract human readable date from soup! soup: {soup}")
    return NO_DATE

def test_extract_date_string_from_soup():
    """Smoke test: print the relative date extracted from the first container."""
    first_container = get_blog_containers()[0]
    print(extract_date_string_from_soup(first_container["soup"]))

# Run the smoke test; it prints the relative date of the first blog container.
test_extract_date_string_from_soup()

2023-09-04 07:36:34,125 - read_blog_containers_from_file - INFO - Read 22 blog containersc from file data/tmp_linkedin/raw_posts.json.


23h •


In [15]:
def linkedin_rel_date2datetime(relative_date):
    """Convert a relative LinkedIn date such as "6d •" into a datetime.

    The amount (one or two digits) and the unit (h, d, w, mo, yr) are
    converted to hours and subtracted from the current local time. A month
    counts as 30 days and a year as 365 days, so the result is approximate.

    Args:
        relative_date: Text containing the amount and the unit.

    Returns:
        ``datetime.now()`` minus the given interval.

    Raises:
        ValueError: If no amount or no unit can be found in ``relative_date``.
    """
    logger = get_logger(linkedin_rel_date2datetime.__name__, log_level=logging.WARN)

    HOURS_PER_UNIT = {'h': 1, 'd': 24, 'w': 7 * 24, 'mo': 30 * 24, 'yr': 365 * 24}

    amount_match = re.search(r'\d{1,2}', relative_date)
    if amount_match is None:
        # The bare name `exit` used here before did nothing, so execution
        # continued and crashed with AttributeError on the failed match.
        logger.error(f"Amount not found in {relative_date}")
        raise ValueError(f"Amount not found in {relative_date}")

    unit_match = re.search(r'(h|d|w|mo|yr)', relative_date)
    logger.info(f"m: {unit_match}, type(m): {type(unit_match)}")
    if unit_match is None:
        logger.error(f"Unit not found in {relative_date}")
        raise ValueError(f"Unit not found in {relative_date}")

    amount = float(amount_match.group()) * HOURS_PER_UNIT[unit_match.group()]
    logger.info(f" {relative_date} --> Amount in hours: {amount}")

    # Subtract the interval from the current time
    return datetime.now() - timedelta(hours=amount)

# Some tests
# Manual check (no assertions): print the datetime computed for a few sample
# relative dates. The results depend on the current time.
rel_dates = ['2h •', '3d •', '1w •']
for rel_date in rel_dates:
    print(f"{rel_date} --> {linkedin_rel_date2datetime(rel_date)}")

2h • --> 2023-09-04 05:36:34.139794
3d • --> 2023-09-01 07:36:34.139962
1w • --> 2023-08-28 07:36:34.140006


In [16]:
def extract_text_from_soup(soup: bs):
    """Extract the post text from a post's soup.

    Looks for the first <div> with class
    'feed-shared-update-v2__description-wrapper' and, inside it, the first
    <span dir="ltr">.

    Returns:
        The stripped text of that span, or an empty string (with a warning)
        if either element is missing.
    """
    logger = get_logger(extract_text_from_soup.__name__, log_level=logging.INFO)

    text_box = soup.find("div", {"class": "feed-shared-update-v2__description-wrapper"})
    if not text_box:
        logger.warning(f"Could not extract text from soup!")
        return ""

    text_span = text_box.find("span", {"dir": "ltr"})
    if text_span is None:
        # Handled like a missing wrapper; this used to raise AttributeError
        logger.warning(f"Could not extract text from soup!")
        return ""

    return text_span.text.strip()

In [17]:
def simplify_content(content):
    """Normalize line breaks and escape curly braces in post text.

    Spaces directly after a newline are removed, every run of newlines becomes
    exactly one blank line, and '{' / '}' are replaced by HTML entities.
    """
    newline_rules = (('\n +', '\n'), ('\n+', '\n\n'))
    brace_entities = (("{", "&#123;"), ("}", "&#125;"))

    result = content
    for pattern, replacement in newline_rules:
        result = re.sub(pattern, replacement, result)
    for brace, entity in brace_entities:
        result = result.replace(brace, entity)
    return result
    
def extract_all_from_container(container):
    """Build a blog-post dict from one raw container.

    Args:
        container: Dict with a parsed "soup" and the post "url".

    Returns:
        Dict with "date_human_readable", "posted_date", "text" and
        "original_url".
    """
    logger = get_logger(extract_all_from_container.__name__, logging.INFO)
    relative_date = extract_date_string_from_soup(container["soup"])
    absolute_date = linkedin_rel_date2datetime(relative_date)
    cleaned_text = simplify_content(extract_text_from_soup(container["soup"]))
    blog_post = {
        "date_human_readable": relative_date,
        "posted_date": absolute_date,
        "text": cleaned_text,
        "original_url": container["url"],
    }
    logger.info(f"{blog_post['posted_date']} - {blog_post['text'][:30]}")
    return blog_post

def extract_all_from_containers():
    """Convert all available blog containers into blog-post dicts.

    Containers that cannot be converted are skipped with a warning.

    Returns:
        List of blog-post dicts; empty if no containers are available.
    """
    logger = get_logger(extract_all_from_containers.__name__, logging.INFO)
    # get_blog_containers can yield None when nothing could be read; treat it
    # as "no containers" instead of failing in enumerate().
    containers = get_blog_containers() or []
    blog_posts = []

    for container_no, container in enumerate(containers):
        try:
            logger.info(f"Processing container # {container_no}")
            blog_posts.append(extract_all_from_container(container))
        except Exception as e:
            # Best effort: one broken container must not abort the whole run
            logger.warning(f"Container # {container_no} not added: {str(e)}")
    return blog_posts

# Convert every available container into a blog-post dict; containers that
# fail to convert are skipped (see the warnings in the log output).
blog_posts = extract_all_from_containers();

2023-09-04 07:36:34,442 - read_blog_containers_from_file - INFO - Read 22 blog containersc from file data/tmp_linkedin/raw_posts.json.
2023-09-04 07:36:34,442 - extract_all_from_containers - INFO - Processing container # 0
2023-09-04 07:36:34,449 - extract_all_from_container - INFO - 2023-09-03 08:36:34.448883 - Der Einsatz von Modellen in de
2023-09-04 07:36:34,450 - extract_all_from_containers - INFO - Processing container # 1
2023-09-04 07:36:34,457 - extract_all_from_container - INFO - 2023-08-31 07:36:34.457391 - Skalierung in der Softwareentw
2023-09-04 07:36:34,458 - extract_all_from_containers - INFO - Processing container # 2
2023-09-04 07:36:34,466 - extract_all_from_container - INFO - 2023-08-14 07:36:34.465461 - „Gut getestet aber auch sicher
2023-09-04 07:36:34,466 - extract_all_from_containers - INFO - Processing container # 3
2023-09-04 07:36:34,473 - extract_all_from_container - INFO - 2023-08-21 07:36:34.473167 - Wesen und Stärke von

#OpenSou
2023-09-04 07:36:34,474 -

In [18]:
# Sanity check: every container should have produced a blog post.
# Fixed: the message lacked the f-string prefix and referenced an undefined
# `containers` variable.
all_containers = get_blog_containers() or []
if len(blog_posts) != len(all_containers):
    print(f"Not all containers could be transformed to blog_posts! No of containers: {len(all_containers)}, no of blog posts: {len(blog_posts)}")

2023-09-04 07:36:34,835 - read_blog_containers_from_file - INFO - Read 22 blog containersc from file data/tmp_linkedin/raw_posts.json.


In [19]:
# Inspect a single converted blog post; change the index to look at another one.
blog_post_index = 1
print(blog_posts[blog_post_index])
#blog_posts

{'date_human_readable': '4d •', 'posted_date': datetime.datetime(2023, 8, 31, 7, 36, 34, 457391), 'text': 'Skalierung in der Softwareentwicklung bezieht sich auf die Projektgröße und die Rolle von Einzelpersonen im Laufe der Zeit.\n\nDie Zusammenarbeit in Teams birgt neue Herausforderungen, ermöglicht aber auch die Entwicklung nachhaltigerer und wertvollerer Systeme.\n\nLesen Sie mehr dazu in unserem Blogbeitrag:\n\n➡️\n\nhttps://lnkd.in/gpKBJp2z\n\n#softwareengineering\n\n#programmierung\n\n#enterprisesoftwareengineering\n\n#mgmtechnologypartners', 'original_url': 'https://www.linkedin.com/posts/mgm-technology-partners-gmbh_die-prinzipien-des-enterprise-software-engineerings-activity-7097879746960596992-pMtH?utm_source=share&utm_medium=member_desktop'}


## Saving blog posts to files

In [20]:
def simplify_text(some_text: str) -> str:
    """Reduce text to characters that are safe in a file name.

    Double quotes become single quotes, the text is transliterated to ASCII
    with unidecode, and every run of characters other than letters, '-' and
    '_' (digits included) collapses into a single underscore.
    """
    simplified_text = some_text.replace('"', "'")
    simplified_text = unidecode.unidecode(simplified_text)
    # Raw string: "\-" in a normal string literal is an invalid escape sequence
    simplified_text = re.sub(r"[^A-Za-z\-_]+", "_", simplified_text)
    simplified_text = re.sub('_+', '_', simplified_text)
    return simplified_text

In [21]:
def build_title(blog_post):
    """Derive a short title from the beginning of the post text.

    Takes the first 35 characters and, if that cut falls inside a word,
    extends the title to the end of that word. Newlines in the result are
    replaced by spaces.

    Args:
        blog_post: Dict with the post "text".

    Returns:
        The title string.
    """
    LEN_OF_TITLE = 35
    text = blog_post["text"]

    end = min(LEN_OF_TITLE, len(text))
    # Extend to the end of the current word. A word ends at any whitespace,
    # not only at ' '; otherwise the title would run across line breaks into
    # the following lines.
    while end < len(text) and not text[end].isspace():
        end += 1

    return text[:end].replace('\n', ' ')

def build_simplified_title(blog_post: Dict) -> str:
    """Return the post title reduced to file-name-safe characters."""
    title = build_title(blog_post)
    return simplify_text(title)

In [22]:
def build_filename(blog_post: Dict) -> str:
    """Build the markdown file path for a blog post.

    The name has the form ``<BLOGS_DIRECTORY>/<date>-<simplified title>.md``.
    The date uses INTERNAL_DATE_FORMAT, or "_no_date_" if the posted date
    cannot be formatted. The title part is cut to LEN_OF_FILENAME - 13
    characters.
    """
    logger = get_logger(build_filename.__name__, logging.INFO)
    LEN_OF_FILENAME = 45
    posted_date = blog_post["posted_date"]
    try:
        posted_date_for_filename = posted_date.strftime(INTERNAL_DATE_FORMAT)
    except Exception:
        # Fixed: the fallback used to be assigned to a different variable, so
        # the f-string below raised NameError for posts without a usable date.
        posted_date_for_filename = "_no_date_"
    simplified_title = build_simplified_title(blog_post)[:LEN_OF_FILENAME-13]
    filename = f"{BLOGS_DIRECTORY}/{posted_date_for_filename}-{simplified_title}.md"
    logger.info(filename)
    return filename

In [23]:
# Preview the target file name of every blog post (build_filename logs it too).
for blog_post in blog_posts:
    print(build_filename(blog_post))

2023-09-04 07:36:34,866 - build_filename - INFO - data/blogs/2023-09-03-Der_Einsatz_von_Modellen_in_der_.md
2023-09-04 07:36:34,867 - build_filename - INFO - data/blogs/2023-08-31-Skalierung_in_der_Softwareentwic.md
2023-09-04 07:36:34,872 - build_filename - INFO - data/blogs/2023-08-14-_Gut_getestet_aber_auch_sicher_i.md
2023-09-04 07:36:34,874 - build_filename - INFO - data/blogs/2023-08-21-Wesen_und_Starke_von_OpenSourceS.md
2023-09-04 07:36:34,875 - build_filename - INFO - data/blogs/2023-08-29-Enterprise_Low_Code-Plattformen_.md
2023-09-04 07:36:34,876 - build_filename - INFO - data/blogs/2023-08-28-Fur_unsere_A_Low_Code-Plattform_.md
2023-09-04 07:36:34,876 - build_filename - INFO - data/blogs/2023-08-28-In_einer_Software-getriebenen_Or.md
2023-09-04 07:36:34,877 - build_filename - INFO - data/blogs/2023-08-28-Die_modellbasierte_Softwareentwi.md
2023-09-04 07:36:34,877 - build_filename - INFO - data/blogs/2023-08-28-Die_Offenheit_der_mgm_A_Low-Code.md
2023-09-04 07:36:34,878 - bu

data/blogs/2023-09-03-Der_Einsatz_von_Modellen_in_der_.md
data/blogs/2023-08-31-Skalierung_in_der_Softwareentwic.md
data/blogs/2023-08-14-_Gut_getestet_aber_auch_sicher_i.md
data/blogs/2023-08-21-Wesen_und_Starke_von_OpenSourceS.md
data/blogs/2023-08-29-Enterprise_Low_Code-Plattformen_.md
data/blogs/2023-08-28-Fur_unsere_A_Low_Code-Plattform_.md
data/blogs/2023-08-28-In_einer_Software-getriebenen_Or.md
data/blogs/2023-08-28-Die_modellbasierte_Softwareentwi.md
data/blogs/2023-08-28-Die_Offenheit_der_mgm_A_Low-Code.md
data/blogs/2023-08-28-Bei_der_Sicherheitsplanung_in_ei.md
data/blogs/2023-08-21-Programmierung_ist_ein_wichtiger.md
data/blogs/2023-08-21-Die_Globalisierung_hat_in_den_le.md
data/blogs/2023-08-28-Um_die_nationalen_und_internatio.md
data/blogs/2023-08-05-Die_Bedeutung_einer_nahtlosen_Cu.md
data/blogs/2023-08-05-In_der_neuen_Ausgabe_von_DIE_MAC.md
data/blogs/2023-08-21-Heute_Grenoble_morgen_Hamburg_Da.md
data/blogs/2023-08-21-Der_Bedarf_an_schnellen_digitale.md
data/blogs/202

In [24]:
def build_frontmatter(blog_post):
    """Build the YAML front matter block for a blog post.

    Args:
        blog_post: Dict with "posted_date", "text" and "original_url".

    Returns:
        The front matter (layout, date, title, originalUrl) enclosed in '---'
        lines and followed by a blank line.
    """
    def _yaml_double_quoted(value):
        # Escape backslashes and double quotes so the value stays valid
        # inside a double-quoted YAML scalar.
        return '"' + str(value).replace('\\', '\\\\').replace('"', '\\"') + '"'

    posted_date = blog_post["posted_date"]
    title = build_title(blog_post)
    # The URL is None when it could not be scraped; write an empty string
    # instead of failing on str + None.
    original_url = blog_post["original_url"] or ""
    frontMatter = (
        "---\n"
        "layout: post\n"
        "date: " + transformDate2String(posted_date) + "\n"
        "title: " + _yaml_double_quoted(title) + "\n"
        "originalUrl: " + _yaml_double_quoted(original_url) + "\n"
    )
    frontMatter += "---\n\n"
    return frontMatter

In [25]:
def save_blog_post_to_file(blog_post: Dict) -> None:
    """Write one blog post (front matter plus text) to its markdown file.

    The target path comes from build_filename; missing directories are
    created and an existing file is overwritten.
    """
    content = blog_post["text"]
    filename = build_filename(blog_post)
    frontmatter = build_frontmatter(blog_post)
    path = os.path.dirname(filename)
    if path:  # os.makedirs('') raises, so skip it for bare file names
        os.makedirs(path, exist_ok=True)
    # Explicit UTF-8: the posts contain umlauts and emoji, which the platform
    # default encoding may not be able to represent.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(frontmatter)
        file.write(content)

def save_blog_posts_to_file(blog_posts):
    """Write every blog post in ``blog_posts`` to its own markdown file."""
    for post in blog_posts:
        save_blog_post_to_file(post)

# Write all converted blog posts to markdown files (existing files are overwritten).
save_blog_posts_to_file(blog_posts)

2023-09-04 07:36:34,896 - build_filename - INFO - data/blogs/2023-09-03-Der_Einsatz_von_Modellen_in_der_.md
2023-09-04 07:36:34,897 - build_filename - INFO - data/blogs/2023-08-31-Skalierung_in_der_Softwareentwic.md
2023-09-04 07:36:34,899 - build_filename - INFO - data/blogs/2023-08-14-_Gut_getestet_aber_auch_sicher_i.md
2023-09-04 07:36:34,901 - build_filename - INFO - data/blogs/2023-08-21-Wesen_und_Starke_von_OpenSourceS.md
2023-09-04 07:36:34,903 - build_filename - INFO - data/blogs/2023-08-29-Enterprise_Low_Code-Plattformen_.md
2023-09-04 07:36:34,904 - build_filename - INFO - data/blogs/2023-08-28-Fur_unsere_A_Low_Code-Plattform_.md
2023-09-04 07:36:34,905 - build_filename - INFO - data/blogs/2023-08-28-In_einer_Software-getriebenen_Or.md
2023-09-04 07:36:34,907 - build_filename - INFO - data/blogs/2023-08-28-Die_modellbasierte_Softwareentwi.md
2023-09-04 07:36:34,908 - build_filename - INFO - data/blogs/2023-08-28-Die_Offenheit_der_mgm_A_Low-Code.md
2023-09-04 07:36:34,909 - bu