In [1]:
import os
from pathlib import Path
import pandas as pd
import spacy
from nltk.corpus import stopwords
nlp = spacy.load('de_core_news_sm')

def transcripts_to_df(path_to_transcripts, save_df=False):
    # Create dataframe out of transcripts
    df = pd.DataFrame(columns=['transcriptionName','content','year','month','day'])
    df = df.fillna(0)
    for transcription in os.listdir(path_to_transcripts):
        with open("{}/{}".format(path_to_transcripts, transcription), "r") as f:
            df.loc[transcription,'transcriptionName'] = str(transcription)
            df.loc[transcription,'content'] = f.read()
            date_string = str(transcription).split("_")[1]
            df.loc[transcription, 'year'] = date_string[4:]
            df.loc[transcription, 'month'] = date_string[2:4]
            df.loc[transcription, 'day'] = date_string[0:2]
    df = df.reset_index(drop=True)
    if save_df:
        df.to_csv("episodes.csv", index=False)
    return df

def preprocess_df(dataframe):
    non_nouns=[] # init array for all non-nouns found by spacy

    # preprocessing for topic modelling, in this case LDA (latent dirichlet analysis)
    print("next...")
    # remove "newline" and punctuation
    df["content"] = df["content"].apply(lambda x: x.replace("\n","").replace(".",""))
    # create token for spacy
    #doc = nlp(df.loc[transcript,'content'])
    # lemmatize with help of spacy (maybe we can try another library for this)
    df['content'] = df['content'].apply(lambda x: ((",".join([y.lemma_ for y in nlp(x)])).replace(","," ")))
    # Remove non-nouns
    #df["content"] = df["content"].apply(lambda x: [word for word in nlp(x) if word.pos_ == "NOUN"])
    # write everything lowercase
    df['content'] = df['content'].apply(lambda x: x.lower())
    df['content'] = df['content'].apply(lambda x: x.split())
    # stopword removal
    stop_words=set(stopwords.words('german'))
    df['content'] = df['content'].apply(lambda x: [word for word in x if word not in stop_words])
    # compensate structure for splitting
    df['content'] = df['content'].apply(lambda x: ",".join(x))
    df['content'] = df['content'].apply(lambda x: x.replace(",", " "))
    # convert to matching format
    df['content'] = df['content'].apply(lambda x: x.split())

In [2]:
df = transcripts_to_df("test_transcripts")
preprocess_df(df)

next...


In [3]:
print(df["content"][1])

['fernsehen', 'tagesschau', 'studio', 'tagesschau', 'stellung', 'islamist', 'luftschiff', 'präsident', 'kampf', 'extremist', 'koalition', 'incirlik', 'angriff', 'kurswechsel', 'anfang', 'woche', 'anschlag', 'tote', 'miliz', 'grenzgebiet', 'sicherheitsabstand', 'nachbarschaft', 'staat', 'norden', 'nacht', 'angriff', 'luftwaffe', 'luftraum', 'kampfjets', 'militär', 'islamist', 'regierung', 'luftwaffenstützpunkt', 'incirlik', 'kampfjets', 'luftschläge', 'verfügung', 'regierung', 'wunsch', 'einsatzmöglichkeiten', 'luftwaffe', 'bild', 'klare', 'kampf', 'innere', 'großaktion', 'morgen', 'person', 'anhänger', 'arbeiterpartei', 'polizist', 'einsatz', 'politik', 'hand', 'anschlag', 'anfang', 'woche', 'miris', 'mensch', 'tod', 'wort', 'zukunft', 'sache', 'angst', 'schwächung', 'stärkung', 'kurde', 'terror', 'land', 'ministerpräsident', 'äußerung', 'bundestagswahlkampf', 'kritik', 'spd-politiker', 'interview', 'frage', 'partei', 'beliebtheit', 'kanzlerin', 'kanzlerkandidaten', 'generalsekretärin'

In [4]:
#print([str(word).lower() for word in list(df["content"])[0]])
#for x in nlp("Guten Abend meine Damen und Herren"):
#    print(x.pos_)
print(df["content"][1])

['fernsehen', 'tagesschau', 'studio', 'tagesschau', 'stellung', 'islamist', 'luftschiff', 'präsident', 'kampf', 'extremist', 'koalition', 'incirlik', 'angriff', 'kurswechsel', 'anfang', 'woche', 'anschlag', 'tote', 'miliz', 'grenzgebiet', 'sicherheitsabstand', 'nachbarschaft', 'staat', 'norden', 'nacht', 'angriff', 'luftwaffe', 'luftraum', 'kampfjets', 'militär', 'islamist', 'regierung', 'luftwaffenstützpunkt', 'incirlik', 'kampfjets', 'luftschläge', 'verfügung', 'regierung', 'wunsch', 'einsatzmöglichkeiten', 'luftwaffe', 'bild', 'klare', 'kampf', 'innere', 'großaktion', 'morgen', 'person', 'anhänger', 'arbeiterpartei', 'polizist', 'einsatz', 'politik', 'hand', 'anschlag', 'anfang', 'woche', 'miris', 'mensch', 'tod', 'wort', 'zukunft', 'sache', 'angst', 'schwächung', 'stärkung', 'kurde', 'terror', 'land', 'ministerpräsident', 'äußerung', 'bundestagswahlkampf', 'kritik', 'spd-politiker', 'interview', 'frage', 'partei', 'beliebtheit', 'kanzlerin', 'kanzlerkandidaten', 'generalsekretärin'

In [5]:
from gensim.corpora.dictionary import Dictionary
from gensim import corpora

dictionary = corpora.Dictionary(list(df["content"]))

In [6]:
corpus = [dictionary.doc2bow(text) for text in list(df["content"])]

In [7]:
corpus

[[(0, 1),
  (1, 2),
  (2, 1),
  (3, 2),
  (4, 2),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 3),
  (13, 1),
  (14, 2),
  (15, 1),
  (16, 3),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 3),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 2),
  (26, 2),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 2),
  (31, 2),
  (32, 1),
  (33, 2),
  (34, 8),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 4),
  (48, 1),
  (49, 2),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 2),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 2),
  (68, 2),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 2),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 3),
  (78, 1),
  (79, 2),
  (80, 2),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 3),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 2),
  (91, 1)

In [9]:
import gensim
NUM_TOPICS = 4
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.013*"jahr" + 0.009*"mensch" + 0.009*"land" + 0.007*"flüchtling" + 0.007*"abend"')
(1, '0.011*"flüchtling" + 0.011*"jahr" + 0.011*"mensch" + 0.009*"land" + 0.007*"regierung"')
(2, '0.003*"flüchtling" + 0.002*"minute" + 0.001*"wirtschaft" + 0.001*"waffenruhe" + 0.001*"papst"')
(3, '0.011*"jahr" + 0.010*"mensch" + 0.010*"flüchtling" + 0.008*"land" + 0.006*"regierung"')
