The .txt vocabulary file belongs to a kind person on the internet: https://pastebin.com/BjcwEuuN

(found here https://www.reddit.com/r/German/comments/67lbwn/duolingo_german_vocabulary_list/)

In [53]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# German Snowball stemmer used by clean_text() to reduce tokens to their stems.
stemmer = SnowballStemmer("german")

# NLTK's German stop words, extended with proper names (apparently character,
# family and place names that occur in the episode texts) so that they are
# not counted as vocabulary.
stop_words = stopwords.words("german")
stop_words.extend([
    'winden', 'jonas', 'adam', 'eva', 'hanno', 'noah', 'sonja',
    'mikkel', 'ines', 'michael', 'magnus', 'bartosz', 'egon', 'mads', 'silja', 'agnes',
    'ulrich', 'katharina', 'hannah', 'martha', 'jana', 'tronte', 'jasmin', 'helene',
    'charlotte', 'regina', 'franziska', 'peter', 'helge', 'aleksander', 'boris', 'claudia', 'clausen',
    'elisabeth', 'elli', 'yasin', 'bernd', 'benni',
    'albers', 'kahnwald', 'nielsen', 'doppler', 'tiedemann', 'niewald', 'tannhaus', 'tauber', 'woller', 'kruger', 'obendorf',
    # clean_text() compares lower-cased tokens against this list *before*
    # stemming, and the umlauts only disappear during stemming. The ASCII
    # spellings above therefore cannot match names that are written with an
    # umlaut in the text (presumably "Wöller" / "Krüger" -- confirm against
    # the raw episode text), so the umlaut spellings are listed as well.
    'wöller', 'krüger',
])

In [8]:
# Read the Duolingo vocabulary list: one entry per line, no header row, the
# German and English parts separated by "; ". A multi-character separator
# needs pandas' python parsing engine.
duo_data = pd.read_csv(
    'duolingo_german_vocab.txt',
    sep="; ",
    header=None,
    engine='python',
)
duo_data

Unnamed: 0,0,1
0,der Mann,the man
1,die Frau,the woman
2,der Junge,the boy
3,ich,I
4,ich bin,I am
...,...,...
2050,Südamerika,South America
2051,die Pyramide,the pyramid
2052,Ägypten,Egypt
2053,Russland,Russia


In [9]:
# Replace the positional column labels 0 and 1 with descriptive names,
# modifying the existing frame in place.
column_names = {0: "ger", 1: "eng"}
duo_data.rename(columns=column_names, inplace=True)
duo_data

Unnamed: 0,ger,eng
0,der Mann,the man
1,die Frau,the woman
2,der Junge,the boy
3,ich,I
4,ich bin,I am
...,...,...
2050,Südamerika,South America
2051,die Pyramide,the pyramid
2052,Ägypten,Egypt
2053,Russland,Russia


"ich bin" and similar entries need to be removed: we don't want to see every verb conjugation. For the same reason we are not using https://www.duolingo.com/words, as it contains plural forms (it also does not include articles).

# just an exercise

In [54]:
# Patterns are built once at definition time instead of on every call.
# The pattern strings and flags are unchanged, so matching is identical.
# Anything that is not a letter from the A-Z / a-z / À-ž ranges or a space.
_RE_NON_LETTER = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)
# A single letter standing on its own between word boundaries.
_RE_SINGLE_CHAR = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE)
# Any run of whitespace.
_RE_WHITESPACE = re.compile(r"\s+", re.IGNORECASE)


def clean_text(text: str) -> str:
    """Return *text* reduced to a space-separated string of word stems.

    Steps, in order:

    1. Replace every character that is not a letter or a space with a space.
    2. Replace stand-alone single letters with a space.
    3. Collapse whitespace runs into one space.
    4. Tokenize with ``word_tokenize`` and lower-case each token.
    5. Drop tokens found in the module-level ``stop_words``.
    6. Stem the remaining tokens with the module-level ``stemmer``.

    Note that stop words are matched against the lower-cased but *unstemmed*
    tokens, so ``stop_words`` must contain the spelling used in the text.

    Args:
        text: The raw text to normalise.

    Returns:
        The stems joined by single spaces; an empty string when no token
        survives the filtering.
    """
    text = _RE_NON_LETTER.sub(" ", text)
    text = _RE_SINGLE_CHAR.sub(" ", text)
    text = _RE_WHITESPACE.sub(" ", text)

    # Snapshot the stop words as a set: membership tests become O(1) per token
    # instead of a scan of the whole list, which matters for long texts.
    # Building it per call keeps later changes to ``stop_words`` effective.
    stop_set = set(stop_words)

    tokens = (token.lower() for token in word_tokenize(text))
    stems = [stemmer.stem(token) for token in tokens if token not in stop_set]

    return " ".join(stems)

In [55]:
# Add the cleaned form of every German entry as a new column. Cells that are
# not strings are passed through unchanged.
duo_data["ger_clean"] = duo_data["ger"].map(
    lambda entry: clean_text(entry) if isinstance(entry, str) else entry
)

In [79]:
# Display the first rows to compare the original entries with their cleaned
# form; entries made up only of stop words end up with an empty ger_clean.
duo_data.head(30)

Unnamed: 0,ger,eng,ger_clean
0,der Mann,the man,mann
1,die Frau,the woman,frau
2,der Junge,the boy,jung
3,ich,I,
4,ich bin,I am,
5,ein,one,
6,eine,a,
7,du,you,
8,du bist,you are,
9,das Kind,the child,kind


In [78]:
# Load the episode texts. The columns used below are episode_text, episode,
# season and season00episode00 (one row per episode).
dark_data = pd.read_csv("texts.csv")

In [58]:
# Add the cleaned form of every episode text as a new column. Cells that are
# not strings are passed through unchanged.
dark_data["text_clean"] = dark_data["episode_text"].map(
    lambda raw_text: clean_text(raw_text) if isinstance(raw_text, str) else raw_text
)

In [59]:
# Display the first three rows to check the cleaned episode text.
dark_data[0:3]

Unnamed: 0,episode_text,episode,season,season00episode00,text_clean
0,- - Zieh dich um. -Martha? - Das kann nic...,2,3,S03E02,zieh lebst recht versproch wiedergutzumach pas...
1,Ich erinnere mich. Ich erinnere mich an ...,7,1,S01E07,erinn erinn mach jung gefund mhm passiert iden...
2,Das ist nicht mehr lustig! Katharina? Ulr...,6,1,S01E06,mehr lustig mehr lustig hilf hilf hilf albtrau...


In [62]:
# Preview the per-episode token lists obtained by splitting the cleaned text
# on single spaces.
dark_data.text_clean.str.split(' ')

0     [zieh, lebst, recht, versproch, wiedergutzumac...
1     [erinn, erinn, mach, jung, gefund, mhm, passie...
2     [mehr, lustig, mehr, lustig, hilf, hilf, hilf,...
3     [wusst, andert, imm, bleibt, spinnrad, dreht, ...
4     [wusst, ding, end, wohin, reis, fuhrt, trotzd,...
5     [schwarz, loch, gelt, hollenschlund, universum...
6     [gott, gib, gelass, unaband, hinzunehm, mut, v...
7     [kreis, neu, vermisstenfall, erschuttert, stad...
8     [dunkel, angezog, mott, licht, dunkel, gebor, ...
9     [vertrau, darauf, zeit, linear, verlauft, ewig...
10    [warum, sterb, tod, konnt, entkomm, wusst, wan...
11    [realitat, gibt, davon, einzig, existi, mehr, ...
12    [opf, mannlich, zehn, zeitpunkt, tod, etwa, st...
13    [mama, wer, elt, wer, weisst, tun, weh, mach, ...
14    [versprech, mach, gut, lieb, hast, versproch, ...
15    [anfang, end, end, anfang, miteinand, verbund,...
16    [anfang, end, seltsam, vorstell, beid, gleich,...
17    [geseh, aufwach, mann, ganz, kreislauf, le

In [73]:
# Collect the distinct stemmed tokens over all episodes.
all_words = set()
for tokens in dark_data.text_clean.str.split(' '):
    # Cells that were not strings were left uncleaned, so splitting yields a
    # missing value instead of a list for them; skip those rows.
    if not isinstance(tokens, list):
        continue
    # An empty cleaned text splits into [''], which is not a word (and would
    # match the empty ger_clean entries of the vocabulary), so drop it.
    all_words.update(token for token in tokens if token)

In [74]:
# Stemmed Duolingo vocabulary, built once as a set so that each lookup is
# O(1) rather than rebuilding and scanning a list for every word.
duo_vocab = set(duo_data.ger_clean)

# Episode tokens that also occur in the vocabulary, in the iteration order of
# all_words.
found = [word for word in all_words if word in duo_vocab]

In [77]:
# Percentage of the distinct stemmed episode tokens that also occur in the
# stemmed Duolingo vocabulary. Raises ZeroDivisionError if all_words is empty.
100*len(found)/len(all_words)

25.4933765882671