In [1]:
# make dataframes from txt data
# each txt contains one tagesschau transcription
import os
import pandas as pd
import numpy as np
from wordcloud import WordCloud
from nltk.stem.snowball import SnowballStemmer
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import LatentDirichletAllocation as LDA
import warnings
import time
warnings.simplefilter("ignore", DeprecationWarning)
import matplotlib.pyplot as plt
import spacy
import de_core_news_lg
nlp = de_core_news_lg.load(disable=['parser'])
#nlp.add_pipe(nlp.create_pipe('sentencizer'))
import datetime
from joblib import Parallel, delayed
from functools import partial
from gensim import corpora,models
import re
import pyLDAvis.gensim
# Enable in-notebook visualizations
%matplotlib inline
pyLDAvis.enable_notebook()
pd.options.display.max_rows = 10


# load the s2t transcriptions
def load_transcriptions(path):
    """Make ``path`` the working directory and list what it contains.

    The change of working directory is a deliberate side effect: the
    returned names are bare file names that are later opened relative to
    the current directory.

    Parameters
    ----------
    path : str
        Directory holding one text file per transcription.

    Returns
    -------
    list of str
        Entry names of ``path`` in the (arbitrary) order of ``os.listdir``.
    """
    os.chdir(path)
    return os.listdir(os.curdir)

# preprocessing for topic modelling, in this case LDA (latent dirichlet allocation)
def lemmatize_pipe(doc):
    """Collect the ``lemma_`` of every token in ``doc``, in document order.

    No filtering or lower-casing is applied; punctuation and stop words
    are kept.
    """
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
    return lemmas

def preprocess_pipe(texts,nouns_array):#,non_nouns_array):
    """Run the spaCy pipeline over ``texts`` and collect lemmas and content words.

    Relies on three module-level names: ``nlp`` (the loaded spaCy
    pipeline), ``transcriptions`` (its length sizes the batches) and
    ``n_cpu_cores`` (number of worker processes).

    Parameters
    ----------
    texts : iterable of str
        Raw documents to process.
    nouns_array : list
        Receives one list per document; it is extended in place and also
        returned.

    Returns
    -------
    preproc_pipe : list of list of str
        Lemmas of every token, one list per document.
    nounss : list of list of str
        ``nouns_array`` after appending, per document, the surface forms
        of all tokens that are purely alphabetic and not spaCy stop words.
        Despite the name, no part-of-speech filter is applied.
    """
    preproc_pipe = []
    nounss = nouns_array
    n_workers = int(n_cpu_cores)
    # Integer division yields 0 when there are fewer files than workers;
    # a batch must hold at least one text, so clamp the size to 1.
    batch_size = max(1, int(len(transcriptions)/n_cpu_cores))
    for doc in nlp.pipe(texts, batch_size=batch_size, n_process=n_workers):
        preproc_pipe.append(lemmatize_pipe(doc))
        nounss.append([str(word) for word in doc if not word.is_stop and str(word).isalpha()])
    return preproc_pipe,nounss

def preprocess_transcriptions(transcripts_df,number_cpu_cores=os.cpu_count()-1):
    """Read transcription files and reduce each to a cleaned, space-separated token string.

    Processing steps:
      1. read every file into a DataFrame (one row per file),
      2. strip newlines and full stops from the raw text,
      3. run ``preprocess_pipe`` (spaCy) to get lemmas and content words,
      4. lower-case the words, transliterate umlauts/sharp s, and collapse
         weather terms, compass directions, weekdays, months and the
         'korona' misspelling onto one placeholder word each,
      5. drop every word that contains a manual stop word as a substring,
      6. store the remaining words, joined by spaces, in ``preprocessed``
         and parse the year out of the file name.

    Parameters
    ----------
    transcripts_df : sequence of str
        File names of the transcriptions, resolved against the current
        working directory. Despite the parameter name this is a plain
        sequence of names, not a DataFrame.
    number_cpu_cores : int
        Not used by this function; ``preprocess_pipe`` takes its worker
        count from the module-level ``n_cpu_cores``.

    Returns
    -------
    df : pandas.DataFrame
        Columns ``index``, ``transcriptionName``, ``content`` (lemma lists),
        ``preprocessed`` (cleaned token string), ``year``, ``month``, ``day``.
        ``month`` and ``day`` are never filled here.
    df1 : pandas.DataFrame
        Copy taken right after loading, i.e. with the raw file content.
    """
    manual_stopwords = ['wetter','wochentag','monat','ziel','zahl','stunden','weg','fernsehen','praesident',
                        'damen','herren','menschen','land','abend','grad','nacht','euro','geld','regierung',
                       'millionen','unternehmen','tagesthemen','angaben','himmelsrichtung',
                       'milliarden','tagesschau','woche','wochen','leben','%','welt','€','sueden',
                        'mitarbeiter','montag','welt','richtung','deutsche','deutschen','mal',
                        'moeglichkeit','fast','klar','meist','letzten','mehrere','steht','bleibt','sieht',
                        'deutlich','gestern','sehen','kritik','guten','folgen','neues','geben','bislang',
                       'eigentlich','teilweise','lassen','suedosten','gekommen','insgesamt','sagen','erneut',
                        'frauen','bringen','nachmittag','naechsten','mindestens','krise']

    # (placeholder, surface forms that are collapsed onto it)
    manual = [('wetter',['sonne','regen','wind','schnee','schauer','luft','wolken','gewitter','gewittern']),
              ('himmelsrichtung',['norden','sueden','osten','westen']),
              ('wochentag',['montag','dienstag','mittwoch','donnerstag','freitag','samstag','sonntag']),
              ('monat',['januar','februar','maerz','april','mai','juni','juli','august','september','oktober','november','dezember']),
              ('corona',['korona'])]
    # first group wins if a surface form were ever listed twice
    replacement_for = {}
    for placeholder, surface_forms in manual:
        for surface_form in surface_forms:
            replacement_for.setdefault(surface_form, placeholder)

    # The file list comes from the argument rather than from a same-named
    # global, so the function processes exactly what the caller hands in.
    file_names = list(transcripts_df)

    # init array for all words kept by spacy
    nounss = []
    df = pd.DataFrame(index=list(range(len(file_names))),
                      columns=['transcriptionName','content','preprocessed','year','month','day'])
    for row, file_name in enumerate(file_names):
        with open(file_name, "r") as f:
            df.loc[row,'transcriptionName'] = str(file_name)
            df.loc[row,'content'] = f.read()
    df = df.reset_index()
    df1 = df.copy()

    # remove "newline" and punctuation
    # NOTE(review): the newline is dropped without inserting a space, so the
    # words on both sides of a line break are fused unless the line already
    # ends in whitespace -- confirm this matches the transcript format.
    for row in range(len(df)):
        df.loc[row,'content'] = df.loc[row,'content'].replace("\n","").replace(".","")

    sta = datetime.datetime.now()
    df['content'],nounss = preprocess_pipe(df['content'],nounss)#,non_nouns)
    print("lemmatizing+filtering out nouns needs:",(datetime.datetime.now()-sta).total_seconds(),"seconds")

    start_manual=datetime.datetime.now()
    normalized_docs = []
    for doc_words in nounss:
        cleaned = []
        for word in doc_words:
            word = word.lower().replace("ä",'ae').replace("ü","ue").replace("ö","oe").replace("ß","ss")
            cleaned.append(replacement_for.get(word, word))
        normalized_docs.append(cleaned)
    end_manual=datetime.datetime.now()
    print("replacing e.g. 'sonne','regen'... by 'wetter' needs:",
          (end_manual-start_manual).total_seconds(),"seconds")

    # A word is dropped when ANY manual stop word occurs inside it (substring
    # match, so compounds go too). Each distinct word is judged once and the
    # result kept in a set, giving constant-time filtering per token.
    # NOTE(review): substring matching also removes unrelated words that merely
    # contain a short stop word such as 'weg' or 'mal' -- confirm this is wanted.
    vocabulary = {word for doc_words in normalized_docs for word in doc_words}
    remove_words = {word for word in vocabulary
                    if any(stop_word in word for stop_word in manual_stopwords)}

    for row in range(len(df)):
        kept = [word for word in normalized_docs[row] if word not in remove_words]
        # words are purely alphabetic, so a plain space join is unambiguous
        df.at[row,'preprocessed'] = " ".join(kept)

        # second '_'-separated field of the file name starts with DDMMYYYY
        name = df.loc[row,'transcriptionName']
        if 'tagesschau' in name and 'vor' not in name:
            df.at[row,'year'] = int(name.split('_')[1][4:8])
    return df,df1

def print_workcloud(processed_transcripts):
    """Draw one word cloud over all documents and show it in the notebook.

    ``processed_transcripts`` must expose ``.values`` yielding strings (for
    example the ``preprocessed`` column). The documents are concatenated
    with commas, the length of that string is printed, and the rendered
    image is passed to ``display``.
    """
    # adapted from a tutorial on creating word clouds
    cloud_settings = dict(width=800, height=400, background_color="black",
                          max_words=500, contour_width=3, contour_color='steelblue')
    joined_text = ','.join(list(processed_transcripts.values))
    print(len(joined_text))
    cloud = WordCloud(**cloud_settings)
    cloud.generate(joined_text)
    time.sleep(0.3)
    display(cloud.to_image())

def plot_25_most_common_words(count_data, count_vectorizer):
    """Bar-plot the 25 terms with the highest total count.

    Parameters
    ----------
    count_data : sparse document-term matrix
        Result of ``count_vectorizer.fit_transform(...)``; rows are
        documents, columns are vocabulary terms.
    count_vectorizer : fitted vectorizer
        Supplies the vocabulary; column ``j`` of ``count_data`` belongs to
        its ``j``-th feature name.
    """
    # get_feature_names() was removed in scikit-learn 1.2; keep it only as a
    # fallback for older installations.
    if hasattr(count_vectorizer, "get_feature_names_out"):
        words = list(count_vectorizer.get_feature_names_out())
    else:
        words = count_vectorizer.get_feature_names()

    # one column sum instead of densifying every row in a Python loop
    total_counts = np.asarray(count_data.sum(axis=0), dtype=float).ravel()

    top_terms = sorted(zip(words, total_counts), key=lambda x:x[1], reverse=True)[0:25]
    words = [term for term, _ in top_terms]
    counts = [count for _, count in top_terms]
    x_pos = np.arange(len(words))

    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='25 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    dfPlot = pd.DataFrame(data={'words':words,'counts':counts})
    sns.barplot(x=words,y=counts,data=dfPlot,palette='husl')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()


def print_topics(model, count_vectorizer, n_top_words):
    """Print the ``n_top_words`` highest-weighted terms of every topic.

    Parameters
    ----------
    model : fitted topic model
        Must expose ``components_`` with one row of term weights per topic.
    count_vectorizer : fitted vectorizer
        Supplies the term for each column of ``components_``.
    n_top_words : int
        Number of terms printed per topic, strongest first.
    """
    # get_feature_names() was removed in scikit-learn 1.2; keep it only as a
    # fallback for older installations.
    if hasattr(count_vectorizer, "get_feature_names_out"):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending: take the last n_top_words indices, reversed
        top_term_ids = topic.argsort()[:-n_top_words - 1:-1]
        print("Topic #%d:" % (int(topic_idx)+1)," ".join([words[i] for i in top_term_ids]))
    

def train_lda_sklearn(df,count_vectorizer,count_data,number_topics=10,number_words=3,ngram_tuple=(1,1),stop_words=None):#stopwords.words('german'))
    """Fit a scikit-learn LDA model on ``count_data`` and print its topics.

    Parameters
    ----------
    df : unused
        Kept for call compatibility.
    count_vectorizer : fitted vectorizer
        Supplies the vocabulary used when printing topics.
    count_data : document-term matrix
        Training data for the model.
    number_topics : int
        Number of topics to fit.
    number_words : int
        Number of terms printed per topic.
    ngram_tuple, stop_words : unused
        Kept for call compatibility.

    Returns
    -------
    The fitted ``LatentDirichletAllocation`` instance.
    """
    # Use the function's own arguments; relying on notebook globals of a
    # similar name made the keyword arguments silently ineffective.
    # n_jobs=-1 uses all processors.
    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(count_data)
    print("LDA Topics:")
    print_topics(lda, count_vectorizer, number_words)
    return lda

def train_model_gensim(df,num_topics=20,passes=10,workers=int((os.cpu_count()/2))-1):
    """Train a gensim ``LdaMulticore`` model on the ``preprocessed`` column of ``df``.

    Side effect: every string in ``df['preprocessed']`` is replaced in place
    by its list of space-separated tokens. Entries that are not strings
    (e.g. already tokenised by an earlier call) are left as they are, so
    the function can be run again on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``preprocessed`` column and labels ``0..len(df)-1``.
    num_topics : int
        Number of topics.
    passes : int
        Number of passes over the corpus.
    workers : int or None
        Worker processes for gensim; values below 1 are raised to 1,
        ``None`` is forwarded unchanged.

    Returns
    -------
    (lda_model, bow_corpus, dictionary)
    """
    for i in range(len(df['preprocessed'])):
        document = df.loc[i,'preprocessed']
        if isinstance(document, str):
            df.at[i,'preprocessed'] = document.split(" ")
    dictionary = corpora.Dictionary(df['preprocessed'])
    bow_corpus = [dictionary.doc2bow(doc) for doc in df['preprocessed']]
    # The default expression evaluates to 0 or less on machines with three
    # or fewer cores; a process pool needs at least one worker.
    if workers is not None:
        workers = max(1, int(workers))
    lda_model = models.LdaMulticore(bow_corpus, num_topics=num_topics, id2word=dictionary, passes=passes, workers=workers)
    return lda_model, bow_corpus, dictionary
    

# progress marker: reached once every statement above in this cell has run
print("done")

done


In [8]:
#%%time
############################### preprocess and train for each year separately for a given daterange ####################################

# NOTE(review): machine-specific absolute path -- adjust before running elsewhere
path = "/home/sim/all/Master/Forschungspraktikum/Tagesschau/transcripts/"
n_cpu_cores = 13 #int(os.cpu_count()-1)
print("number cpu_cores to use:",n_cpu_cores)

# load s2t transcriptions (this also makes `path` the working directory)
transcriptions = load_transcriptions(path)
keys = ['2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020']
years_dict = {}
for year in keys:
    years_dict[year] = []

# lda for certain dates
daterange = pd.date_range(datetime.datetime(2007, 1, 1), datetime.datetime(2020, 7, 31))
first_day, last_day = daterange[0].date(), daterange[-1].date()

# Parse the date of every file once instead of comparing each file against
# each day of the range. The second '_'-separated field of a file name
# starts with DDMMYYYY.
dated_names = []
for i in transcriptions:
    if 'tagesschau' in i and 'vor' not in i:
        stamp = i.split('_')[1]
        try:
            broadcast_day = datetime.date(int(stamp[4:8]), int(stamp[2:4]), int(stamp[0:2]))
        except ValueError:
            # not a valid calendar date, so it cannot fall on any day of the range
            continue
        if first_day <= broadcast_day <= last_day:
            dated_names.append((broadcast_day, i))

# stable sort: chronological, files of the same day keep their listing order
dated_names.sort(key=lambda dated: dated[0])
dates_of_interest = []
for broadcast_day, i in dated_names:
    years_dict[str(broadcast_day.year)].append(i)
    dates_of_interest.append(i)
print("There are ",len(dates_of_interest),"transcriptions in given daterange")

for year in years_dict.keys():
    print("\npreprocess transcriptions of year:",year,"quantity:",len(years_dict[str(year)]))
    if len(years_dict[str(year)]) == 0:
        print("no transcription for year:",year)
        continue
    start = datetime.datetime.now()
    # The preprocessing helpers read the module-level `transcriptions`,
    # so it has to be rebound to the files of the current year.
    transcriptions = years_dict[str(year)]
    # maybe: more threads -> more RAM so in case of low ram use less cores

    df_processed,df1 = preprocess_transcriptions(transcriptions,number_cpu_cores=n_cpu_cores)
    #print_workcloud(df_processed['preprocessed'])

    # Initialise count vectorizer
    count_vectorizer = CountVectorizer(ngram_range=(1,1))
    # Fit and transform the preprocessed documents; this must happen before
    # train_model_gensim, which replaces the strings by token lists
    count_data = count_vectorizer.fit_transform(df_processed['preprocessed'])
    # parameters for the (currently disabled) sklearn LDA below
    n_topics = 10 # number of topics
    n_words = 5 # number of words printed per topic

    # Visualise the 25 most common words
    #plot_25_most_common_words(count_data, count_vectorizer)

    # with gensim lda
    startLDA = datetime.datetime.now()
    lda_model,bow_corpus,dictionary = train_model_gensim(df=df_processed,num_topics=20,passes=10,workers=7)
    endLDA = datetime.datetime.now()
    print("lda training needs:",(endLDA-startLDA).total_seconds(),"seconds")
    for idx, topic in lda_model.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx+1, re.sub('[^A-Za-z ]+', '', topic).replace("  "," ")))

    # disabled alternative: LDA with sklearn
    # lda_model = train_lda_sklearn(df_processed,count_vectorizer,count_data,number_topics=n_topics,number_words=n_words,ngram_tuple=(2,3),stop_words=None)
    # print_topics(lda_model, count_vectorizer, n_words)
    # del df_processed
    # del count_data
    # del lda_model
    #print("loop needed:",(end-start).total_seconds(),"seconds")

    startVis = datetime.datetime.now()
    display(pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary))
    endVis = datetime.datetime.now()
    print((endVis-startVis).total_seconds(),"seconds")

number cpu_cores to use: 13
There are  3765 transcriptions in given daterange

preprocess transcriptions of year: 2007 quantity: 263
lemmatizing+filtering out nouns needs: 20.621717 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.677988 seconds
lda training needs: 22.693536 seconds
Topic: 1 
Words: berlin polizei gewerkschaft gericht treffen usa gdl bahn peking gipfel
Topic: 2 
Words: spd bahn merkel gipfel polizei heiligendamm cdu partei kanzlerin gdl
Topic: 3 
Words: treffen union cdu fdp bahn heftige olympischen risse beton anschlaege
Topic: 4 
Words: spd urlaub gilt hamburger gdl weiterbildung mindestlohn scheint frankreich bau
Topic: 5 
Words: spd bahn soldaten berlin merkel gdl kinder muenchen gipfel einsatz
Topic: 6 
Words: bahn kanzlerin spd berlin kinder trotz china laender fordern bekommen
Topic: 7 
Words: kinder merkel spd gewerkschaft oettinger stark polizei afghanistan cdu csu
Topic: 8 
Words: berlin spd polizei partei kinder merkel sarkozy union afghanistan

14.325941 seconds

preprocess transcriptions of year: 2008 quantity: 358
lemmatizing+filtering out nouns needs: 28.373154 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.952064 seconds
lda training needs: 31.465592 seconds
Topic: 1 
Words: maschine daten bolivien bank einigung treffen schwere piraten russischen buerger
Topic: 2 
Words: unicef urteil koalition kirche stellen union daten private bundesinnenminister forschung
Topic: 3 
Words: spd waffenruhe mieter spiel stadt kinder patientenverfuegung wasser vermieter loew
Topic: 4 
Words: berlin polizei soldaten spd mitte merkel politik hilfe bundestag israel
Topic: 5 
Words: spd cdu berlin obama fdp partei polizei koalition stadt afghanistan
Topic: 6 
Words: gazastreifen israel hamas palaestinenser aussenminister partei israelische georgien cdu kinder
Topic: 7 
Words: spd usa union berlin obama partei eu koalition merkel melden
Topic: 8 
Words: spd haelfte nordkorea beck piraten usa banken aufgabe partei kurt
Topic: 9 
W

17.429537 seconds

preprocess transcriptions of year: 2009 quantity: 358
lemmatizing+filtering out nouns needs: 28.768824 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 1.002378 seconds
lda training needs: 29.240532 seconds
Topic: 1 
Words: porsche laender internet strom windows berlin zukunft sri entschieden strasse
Topic: 2 
Words: spd eu partei internet usa entscheidung berlin israel afghanistan opel
Topic: 3 
Words: internet obama opposition getoetet usa afghanistan zb polizei faellt bericht
Topic: 4 
Words: winnenden schueler waffen polizei nato lehrer memorial frankreich besucher verhindern
Topic: 5 
Words: spd arbeit berlin kurzarbeit polizei obama cdu union vertrag fdp
Topic: 6 
Words: fdp berlin spd obama union laender cdu usa freiheit merkel
Topic: 7 
Words: opel motors general magna berlin bund entscheidung laender bahn csu
Topic: 8 
Words: porsche vw spd volkswagen nato ahmadinedschad regime ergebnis bundesrat schule
Topic: 9 
Words: israel gazastreifen hamas 

16.973536 seconds

preprocess transcriptions of year: 2010 quantity: 357
lemmatizing+filtering out nouns needs: 26.98892 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.881418 seconds
lda training needs: 31.50793 seconds
Topic: 1 
Words: fdp merkel afghanistan berlin soldaten spd usa partei koalition kanzlerin
Topic: 2 
Words: tuerkei merkel eu berlin frage westerwelle union polen kanzlerin fdp
Topic: 3 
Words: stadt christen bkk kirche steinbach sicherheit nordwesten sportler vertriebenen demonstration
Topic: 4 
Words: partei merkel arbeit obama internet lafontaine loki sicherungsverwahrung reform kabinett
Topic: 5 
Words: spd stadt new soldaten google daten arbeit polen treffen opposition
Topic: 6 
Words: frei hausarrest internationalen bleiben obama anfang mitte druck mainz politik
Topic: 7 
Words: berlin spd gruenen cdu fdp koalition opposition hartz iv kuenftig
Topic: 8 
Words: spd soldaten opfer berlin afghanistan merkel cdu eu haelfte offenbar
Topic: 9 
Words: wes

16.417245 seconds

preprocess transcriptions of year: 2011 quantity: 353
lemmatizing+filtering out nouns needs: 28.198861 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.909705 seconds
lda training needs: 30.067648 seconds
Topic: 1 
Words: usa rwe nationen vereinten suedsudan csu bank banken gazprom mitglied
Topic: 2 
Words: paar soldaten opposition christen wasser gaddafi gaddafis aufstaendischen wissen fest
Topic: 3 
Words: fdp spd polizei berlin bremen gruenen merkel kanzlerin stimmen behoerden
Topic: 4 
Words: fdp opposition eu partei parlament gewalt spd laender merkel waehrungsfonds
Topic: 5 
Words: gaddafi opposition polizei berlin demonstranten tunesien usa merkel stadt laender
Topic: 6 
Words: gaddafi tripolis soldaten papst aufstaendischen getoetet finden opposition stadt cdu
Topic: 7 
Words: berlin merkel spd opposition partei demonstranten cdu fdp gruenen usa
Topic: 8 
Words: uebergangsrat banken gaddafi tripolis usa gaddafis israel bank rebellen berlin
Topic

16.01481 seconds

preprocess transcriptions of year: 2012 quantity: 357
lemmatizing+filtering out nouns needs: 29.59586 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.971804 seconds
lda training needs: 30.017888 seconds
Topic: 1 
Words: usa porsche frankreich kairo costa vw spd romney verfassungsschutz clinton
Topic: 2 
Words: syrien berlin opposition gruenen partei spd cdu ezb entscheidung kinder
Topic: 3 
Words: syrien israel usa berlin opposition internationalen soldaten finden stadt politik
Topic: 4 
Words: spd thema obama syrien soldaten steinbrueck merkel laender afghanistan opposition
Topic: 5 
Words: partei banken opposition syrien putin costa concordia basel verhandlungen kinder
Topic: 6 
Words: fukushima truppen syrien soldaten kirche rebellen ackermann trier slowakei islamkonferenz
Topic: 7 
Words: syrien opposition fdp polizei merkel kanzlerin partei demonstranten botschaft csu
Topic: 8 
Words: syrien syrischen stadt spd cdu china opposition nordwesten finde

15.608071 seconds

preprocess transcriptions of year: 2013 quantity: 362
lemmatizing+filtering out nouns needs: 28.340817 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.99928 seconds
lda training needs: 33.60539 seconds
Topic: 1 
Words: erde mollath obama datenschuetzer militaerschlag aegypten asteroiden richter treffen putin
Topic: 2 
Words: muenchen bayern dortmund partei hoeness boston wulff zypern nordkorea internet
Topic: 3 
Words: us gewalt airways airlines american koennten laender stark pferdefleisch produkte
Topic: 4 
Words: soldaten spd demonstranten tuerkei finden berlin zeigt eu usa armee
Topic: 5 
Words: usa berlin partei syrien bleiben aegypten obama nsa daten spd
Topic: 6 
Words: kanzlerin polizei janukowitsch merkel papst koalition entscheidung wuensche schwer besuch
Topic: 7 
Words: ullrich gestaendnis jan demonstranten maessig aussenminister emscher tuerkischen us glaube
Topic: 8 
Words: prozess eu usa syrien schumacher entscheidung opposition union pa

16.735491 seconds

preprocess transcriptions of year: 2014 quantity: 366
lemmatizing+filtering out nouns needs: 28.705252 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.9821 seconds
lda training needs: 31.235783 seconds
Topic: 1 
Words: ukraine separatisten stadt usa eu finden tuerkei gruenen berlin israel
Topic: 2 
Words: fluechtlinge usa tuerkei erdogan google ukraine israel m zuvor waffen
Topic: 3 
Words: ukraine krim eu israel usa soldaten russische treffen berlin russischen
Topic: 4 
Words: soldaten ukraine kampf eu lufthansa taliban nordwesten un armee treffen
Topic: 5 
Words: ukraine russische fragen putin arbeitnehmer ezb pkk aktivisten separatisten einfach
Topic: 6 
Words: soldaten bundeswehr eu ukraine berlin nebel stadt waffenruhe tuerkei fluechtlinge
Topic: 7 
Words: ukraine separatisten berlin aussenminister russische treffen osze ukrainische lage russischen
Topic: 8 
Words: ukraine eu opposition nato finden kuenftig union kinder koalition nordwesten
Topic:

16.697884 seconds

preprocess transcriptions of year: 2015 quantity: 365
lemmatizing+filtering out nouns needs: 29.037727 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.92922 seconds
lda training needs: 31.903026 seconds
Topic: 1 
Words: iran abkommen obama griechischen banken sanktionen kongress einigung tsipras atomprogramm
Topic: 2 
Words: fluechtlinge finden spd kinder soldaten bundestag tsipras trotz frankreich merkel
Topic: 3 
Words: fluechtlinge merkel berlin eu grenze fluechtlingen kanzlerin union partei spd
Topic: 4 
Words: fluechtlinge berlin stadt paris thema eu ukraine finden syrien kampf
Topic: 5 
Words: finden ukraine berlin stadt vw edathy gabriel nordwesten spd verfahren
Topic: 6 
Words: fluechtlinge laender berlin bund griechischen cockpit kinder papst kamen usa
Topic: 7 
Words: stadion fluechtlinge einfach berlin frankreich finden usa terror entscheidung bleiben
Topic: 8 
Words: ukraine fluechtlinge spd polizei berlin hamburg merkel stadt eu finden
Top

16.725644 seconds

preprocess transcriptions of year: 2016 quantity: 367
lemmatizing+filtering out nouns needs: 29.127845 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.961787 seconds
lda training needs: 29.922716 seconds
Topic: 1 
Words: merkel aleppo syrischen stadt fluechtlinge trump berlin tuerkei bund konservativen
Topic: 2 
Words: aleppo tuerkei syrien eu erdogan berlin merkel polizei nordosten kanzlerin
Topic: 3 
Words: tuerkei de fluechtlinge stadt trump syrien paris vw merkel ankara
Topic: 4 
Words: tuerkei fluechtlinge polizei behoerden eu frankreich berlin merkel abkommen grenze
Topic: 5 
Words: tuerkei fluechtlinge eu spd merkel berlin syrien union cdu stadt
Topic: 6 
Words: kerber fluechtlinge merkel finale laender angelique syrien regnet treffen who
Topic: 7 
Words: fluechtlinge eu partei tuerkei gruenen usa grenze polizei merkel kandidaten
Topic: 8 
Words: kuba castro usa obama scheint internet finden thema lufthansa staaten
Topic: 9 
Words: amri anschlag

16.605192 seconds

preprocess transcriptions of year: 2017 quantity: 51
lemmatizing+filtering out nouns needs: 3.841191 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.146075 seconds
lda training needs: 5.672419 seconds
Topic: 1 
Words: cdu polen afd wahlkampf sicherheit usa wahlprogramm facebook trump holm
Topic: 2 
Words: trump eu usa fluechtlinge museum italien obama libyen opfer tunesien
Topic: 3 
Words: polizei steinmeier trump behoerden berlin amri koeln personen entscheidung wissen
Topic: 4 
Words: muenchen nato trump laender sicherheitskonferenz erdogan internationalen leipzig merkel fuehrung
Topic: 5 
Words: trump merkel spd afghanistan usa partei laender eu kinder behoerden
Topic: 6 
Words: wasser vw platz berlin gabriel trump neuer usa herzog spd
Topic: 7 
Words: npd trump nato eu usa fluechtlinge schulz zusammenarbeit partei dahlmeier
Topic: 8 
Words: trump berlin cdu donald steinbach amri urteil falschmeldungen bericht washington
Topic: 9 
Words: nawalny ver

5.217608 seconds

preprocess transcriptions of year: 2018 quantity: 0
no transcription for year: 2018

preprocess transcriptions of year: 2019 quantity: 0
no transcription for year: 2019

preprocess transcriptions of year: 2020 quantity: 208
lemmatizing+filtering out nouns needs: 17.150638 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.585062 seconds
lda training needs: 21.148904 seconds
Topic: 1 
Words: corona eu cdu union afd berlin usa coronavirus iran pandemie
Topic: 2 
Words: corona quarantaene italien coronavirus pandemie kunden polen bahn tuerkei lage
Topic: 3 
Words: corona pandemie union berlin polizei trump quarantaene partei app laender
Topic: 4 
Words: coronavirus virus berlin china union thueringen partei trump amt hertha
Topic: 5 
Words: china iran bleiben usa trump irak us putin parlament kraftwerke
Topic: 6 
Words: corona tuerkei eu fluechtlinge polizei bundeswehr trump frage china grenze
Topic: 7 
Words: corona pandemie coronavirus eu massnahmen virus b

11.94631 seconds


In [7]:
#%%time
################### preprocess data for given daterange in yearly portions for feasibility in terms of RAM usage #######################
################### train on whole data afterwards #############################################################################################

# NOTE(review): machine-specific absolute path -- adjust before running elsewhere
path = "/home/sim/all/Master/Forschungspraktikum/Tagesschau/transcripts/"
n_cpu_cores = 13 #int(os.cpu_count()-1)
print("number cpu_cores to use:",n_cpu_cores)
df_all_processed = pd.DataFrame(index=[], columns=['preprocessed'])

# load s2t transcriptions (this also makes `path` the working directory)
transcriptions = load_transcriptions(path)
keys = ['2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020']
years_dict = {}
for year in keys:
    years_dict[year] = []

# prepare daterange of interest for lda
daterange = pd.date_range(datetime.datetime(2007, 1, 1), datetime.datetime(2020, 12, 31))
first_day, last_day = daterange[0].date(), daterange[-1].date()

# Parse the date of every file once instead of comparing each file against
# each day of the range. The second '_'-separated field of a file name
# starts with DDMMYYYY.
dated_names = []
for i in transcriptions:
    if 'tagesschau' in i and 'vor' not in i:
        stamp = i.split('_')[1]
        try:
            broadcast_day = datetime.date(int(stamp[4:8]), int(stamp[2:4]), int(stamp[0:2]))
        except ValueError:
            # not a valid calendar date, so it cannot fall on any day of the range
            continue
        if first_day <= broadcast_day <= last_day:
            dated_names.append((broadcast_day, i))

# stable sort: chronological, files of the same day keep their listing order
dated_names.sort(key=lambda dated: dated[0])
dates_of_interest = []
for broadcast_day, i in dated_names:
    years_dict[str(broadcast_day.year)].append(i)
    dates_of_interest.append(i)
print("There are ",len(dates_of_interest),"transcriptions in given daterange")

# preprocess every year
yearly_documents = []
for year in years_dict.keys():
    print("\npreprocess transcriptions of year:",year,"quantity:",len(years_dict[str(year)]))
    if len(years_dict[str(year)]) == 0:
        print("no transcription for year:",year)
        continue
    start = datetime.datetime.now()
    # The preprocessing helpers read the module-level `transcriptions`,
    # so it has to be rebound to the files of the current year.
    transcriptions = years_dict[str(year)]
    df_processed,df1 = preprocess_transcriptions(transcriptions,number_cpu_cores=n_cpu_cores)
    yearly_documents.append(df_processed['preprocessed'])

# Combine all years in one step. Writing a grown Series back into the
# existing column aligns it to the frame's current index, which silently
# discards every row beyond the first non-empty year; Series.append is
# also no longer available in pandas 2.
if yearly_documents:
    df_all_processed = pd.DataFrame({'preprocessed': pd.concat(yearly_documents, ignore_index=True)})

# Initialise count vectorizer
count_vectorizer = CountVectorizer(ngram_range=(1,1))

# Fit and transform the preprocessed documents; this must happen before
# train_model_gensim, which replaces the strings by token lists
count_data = count_vectorizer.fit_transform(df_all_processed['preprocessed'])

# parameters for the (currently disabled) sklearn LDA below
n_topics = 10 # number of topics
n_words = 5 # number of words printed per topic

# Visualise the 25 most common words
#plot_25_most_common_words(count_data, count_vectorizer)

# with gensim lda
print("Train LDA...")
startLDA = datetime.datetime.now()
lda_model,bow_corpus,dictionary = train_model_gensim(df=df_all_processed,num_topics=20,passes=10,workers=7)
endLDA = datetime.datetime.now()
print("lda training needs:",(endLDA-startLDA).total_seconds(),"seconds")
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx+1, re.sub('[^A-Za-z ]+', '', topic).replace("  "," ")))

# disabled alternative: LDA with sklearn
# lda_model = train_lda_sklearn(df_processed,count_vectorizer,count_data,number_topics=n_topics,number_words=n_words,ngram_tuple=(2,3),stop_words=None)
# print_topics(lda_model, count_vectorizer, n_words)
# del df_processed
# del count_data
# del lda_model
#print("loop needed:",(end-start).total_seconds(),"seconds")

startVis = datetime.datetime.now()
display(pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary))
endVis = datetime.datetime.now()
print((endVis-startVis).total_seconds(),"seconds")

print("\ndone")

number cpu_cores to use: 13
There are  3878 transcriptions in given daterange

preprocess transcriptions of year: 2007 quantity: 263
lemmatizing+filtering out nouns needs: 20.721334 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.681707 seconds

preprocess transcriptions of year: 2008 quantity: 358
lemmatizing+filtering out nouns needs: 30.149627 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.995603 seconds

preprocess transcriptions of year: 2009 quantity: 358
lemmatizing+filtering out nouns needs: 29.487958 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.985858 seconds

preprocess transcriptions of year: 2010 quantity: 357
lemmatizing+filtering out nouns needs: 27.983322 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.912645 seconds

preprocess transcriptions of year: 2011 quantity: 353
lemmatizing+filtering out nouns needs: 28.276387 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.913597 seconds

preprocess 

15.630298 seconds

done


In [None]:
#%%time 
#lda_model = models.LdaMulticore(bow_corpus, num_topics=20, id2word=dictionary, passes=100, workers=7)
#for idx, topic in lda_model.print_topics(-1):
#    print('Topic: {} \nWords: {}'.format(idx+1, re.sub('[^A-Za-z ]+', '', topic).replace("  "," ")))

In [None]:
#%%time 
#display(pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary))