## Imports

In [1]:
from collections import defaultdict
import pandas as pd
import re

## 1. Load and process data

**Steps:**
1. Load the `.txt` file as a dataframe
2. Apply the regex pattern to the `word` column
3. Filter the dataframe to keep the rows that contain one or more trigrams
4. Sort the rows by the `frequency` column

## 2. Get trigrams of umlauts and eszett characters

In [2]:
# Read the tab-separated word-frequency list. Only the columns at positions
# 1 and 2 are loaded, and they are labelled `word` and `frequency`.
# NOTE(review): the first 48 lines of the file are skipped — presumably
# entries that are not wanted in the analysis; confirm against the file.
df = pd.read_csv(
    'deu_news_2022_1M-words.txt',
    sep='\t',
    header=None,
    names=['word', 'frequency'],
    usecols=[1, 2],
    skiprows=48,
    index_col=False,
    encoding='utf-8',
)

In [3]:
# An umlaut/eszett trigram is three consecutive letters with an umlaut or
# eszett either in the middle (first alternative) or at the start (second
# alternative). `findall` scans left to right and returns non-overlapping
# matches, so no character is counted in two trigrams of the same word.
pattern_umlaut = re.compile(r'([a-zA-ZäöüÄÖÜ][äöüÄÖÜß][a-zA-ZäöüÄÖÜß]|[äöüÄÖÜß][a-zA-ZäöüÄÖÜß]{2})')


def _umlaut_trigrams(value):
    """Return the umlaut/eszett trigrams found in `value`; `[]` for non-strings."""
    if not isinstance(value, str):
        return []
    return pattern_umlaut.findall(value)


df['trigram_umlaut'] = df['word'].apply(_umlaut_trigrams)

In [4]:
# Preview the first rows to check the new `trigram_umlaut` column.
df.head()

Unnamed: 0,word,frequency,trigram_umlaut
0,beiden,8131,[]
1,könnte,8103,[kön]
2,nicht mehr,8097,[]
3,Unternehmen,8047,[]
4,laut,8014,[]


In [5]:
# Keep only the rows whose word produced at least one umlaut/eszett trigram,
# then order them from most to least frequent with a fresh 0..n-1 index.
has_umlaut_trigram = df['trigram_umlaut'].apply(lambda found: found != [])
df_umlauts = df[has_umlaut_trigram].sort_values(
    by='frequency', ascending=False, ignore_index=True
)

**Steps:**
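1. Create an empty `defaultdict(int)`
2. Iterate over the dataframe rows and use each trigram of a row as a dictionary key
3. Add the row's frequency to the value stored for each of its trigrams
4. Convert the dictionary to a dataframe with 2 columns `['trigram', 'frequency']`
5. Add a `percentage` column with each trigram's share of the total frequency
6. Save the dataframe as a CSV file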

In [6]:
# Running total of word frequency per trigram; unseen keys start at 0.
trigrams_umlauts_dict = defaultdict(int)

In [7]:
# Sum, for every umlaut/eszett trigram, the frequencies of all words that
# contain it.
# The counter is re-created here so that re-running this cell starts from zero
# instead of adding the frequencies on top of the totals of a previous run.
trigrams_umlauts_dict = defaultdict(int)

# Iterating the two columns directly avoids building a Series per row, which
# is what `iterrows()` does; the row order (and therefore the key insertion
# order) is unchanged.
for trigrams, frequency in zip(df_umlauts['trigram_umlaut'], df_umlauts['frequency']):
    for trigram in trigrams:
        trigrams_umlauts_dict[trigram] += frequency

In [8]:
# Turn the trigram -> summed-frequency mapping into a two-column table and
# order it from the most to the least frequent trigram.
df_trigrams_umlauts = (
    pd.DataFrame(list(trigrams_umlauts_dict.items()), columns=['trigram', 'frequency'])
    .sort_values(by='frequency', ascending=False)
)

In [9]:
# Share of each trigram in the summed frequency of all umlaut/eszett trigrams.
# Note: despite the column name, the stored value is a fraction of the total
# (it is not multiplied by 100).
umlauts_total = df_trigrams_umlauts['frequency'].sum()
df_trigrams_umlauts['percentage'] = df_trigrams_umlauts['frequency'] / umlauts_total

In [10]:
# Write the umlaut/eszett trigram table to CSV without the dataframe index.
df_trigrams_umlauts.to_csv('umlaut_trigrams.csv', index=False, encoding='utf-8')

## 3. Get trigrams of word beginnings

In [11]:
# A word-beginning trigram is the first three word characters after a word
# boundary, so an entry with several words yields one trigram per run of word
# characters that is at least three characters long.
# NOTE(review): for str patterns in Python 3, `\w` is Unicode-aware, so it
# also matches digits and `_` and already covers the umlauts listed
# explicitly — confirm that numeric "beginnings" are wanted.
pattern_beginning = re.compile(r'\b([\wäöüÄÖÜß]{3})')


def _beginning_trigrams(value):
    """Return the word-beginning trigrams found in `value`; `[]` for non-strings."""
    if not isinstance(value, str):
        return []
    return pattern_beginning.findall(value)


df['trigram_beginning'] = df['word'].apply(_beginning_trigrams)

In [12]:
# Preview the first rows to check the new `trigram_beginning` column.
df.head()

Unnamed: 0,word,frequency,trigram_umlaut,trigram_beginning
0,beiden,8131,[],[bei]
1,könnte,8103,[kön],[kön]
2,nicht mehr,8097,[],"[nic, meh]"
3,Unternehmen,8047,[],[Unt]
4,laut,8014,[],[lau]


In [13]:
# Keep only the rows whose word produced at least one word-beginning trigram,
# then order them from most to least frequent with a fresh 0..n-1 index.
has_beginning_trigram = df['trigram_beginning'].apply(lambda found: found != [])
df_beginning = df[has_beginning_trigram].sort_values(
    by='frequency', ascending=False, ignore_index=True
)

In [14]:
# Running total of word frequency per trigram; unseen keys start at 0.
trigrams_beginning_dict = defaultdict(int)

In [15]:
# Sum, for every word-beginning trigram, the frequencies of all words that
# start with it.
# The counter is re-created here so that re-running this cell starts from zero
# instead of adding the frequencies on top of the totals of a previous run.
trigrams_beginning_dict = defaultdict(int)

# Iterating the two columns directly avoids building a Series per row, which
# is what `iterrows()` does; the row order (and therefore the key insertion
# order) is unchanged.
for trigrams, frequency in zip(df_beginning['trigram_beginning'], df_beginning['frequency']):
    for trigram in trigrams:
        trigrams_beginning_dict[trigram] += frequency

In [16]:
# Turn the trigram -> summed-frequency mapping into a two-column table and
# order it from the most to the least frequent trigram.
df_trigrams_beginning = (
    pd.DataFrame(list(trigrams_beginning_dict.items()), columns=['trigram', 'frequency'])
    .sort_values(by='frequency', ascending=False)
)

In [17]:
# Share of each trigram in the summed frequency of all word-beginning trigrams.
# Note: despite the column name, the stored value is a fraction of the total
# (it is not multiplied by 100).
beginnings_total = df_trigrams_beginning['frequency'].sum()
df_trigrams_beginning['percentage'] = df_trigrams_beginning['frequency'] / beginnings_total

In [18]:
# Write the word-beginning trigram table to CSV without the dataframe index.
df_trigrams_beginning.to_csv('beginning_trigrams.csv', index=False, encoding='utf-8')