In [1]:
# make dataframes from txt data
# each txt contains one tagesschau transcription
import os
import pandas as pd
import numpy as np
from wordcloud import WordCloud
from nltk.stem.snowball import SnowballStemmer
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import LatentDirichletAllocation as LDA
import warnings
import time
warnings.simplefilter("ignore", DeprecationWarning)
import matplotlib.pyplot as plt
import spacy
import de_core_news_lg
nlp = de_core_news_lg.load(disable=['parser','ner'])
#nlp.add_pipe(nlp.create_pipe('sentencizer'))
import datetime
from joblib import Parallel, delayed
from functools import partial
from gensim import corpora,models
import re

# load the s2t transcriptions
def load_transcriptions(path):
    os.chdir(path)
    transcriptions = os.listdir()#[:30]
    return transcriptions

# preprocessing for topic modelling, in this case LDA (latent dirichlet allocation)
def lemmatize_pipe(doc):
    lemma_list = [tok.lemma_ for tok in doc]# if str(tok).isalpha] #usually str(tok.lemma_).lower()
    return lemma_list

def preprocess_pipe(texts,nouns_array):#,non_nouns_array):
    preproc_pipe = []
    #non_nouns = non_nouns_array
    nounss = nouns_array
    for doc in nlp.pipe(texts, batch_size=int(len(transcriptions)/n_cpu_cores),n_process=int(n_cpu_cores)):
        preproc_pipe.append(lemmatize_pipe(doc))
        #non_nouns.extend([word for word in doc if word.pos_!='NOUN'])
        #nounss.append([str(chunk.text) for chunk in doc.noun_chunks])
        nounss.append([str(word) for word in doc if not word.is_stop and word.pos_=='NOUN'])# or word.pos_=='VERB'])# or word.pos_=='ADJ'])
    return preproc_pipe,nounss#,non_nouns

def preprocess_transcriptions(transcripts_df,number_cpu_cores=os.cpu_count()-1,manual_stopwords=[]):
    # init array for all nnouns found by spacy
    nounss = []
    #print("Use: "+str(len(transcriptions))+" transcriptions, preprocess them (and perform LDA (Latent Dirichlet Allocation))")
    # use following line instead of the line after the folowing to use "tagesschau" only
    #df = pd.DataFrame(index=[i for i in range(len(transcriptions)) if "tagessschau" in transcriptions[i]], columns=['transcriptionName','content','year','month','day'])
    df = pd.DataFrame(index=[i for i in range(len(transcriptions))], columns=['transcriptionName','content','preprocessed','year','month','day'])
    for transcription in range(len(transcriptions)):
        # use following line to use "tagesschau" only
        #if "tagesschau" in str(transcriptions[transcription]):
        if transcription>-1: #placeholder for above line
            with open(transcriptions[transcription], "r") as f:
                df.loc[transcription,'transcriptionName'] = str(transcriptions[transcription])
                df.loc[transcription,'content'] = f.read()
    df = df.reset_index()
    df1=df.copy()
    for transcript in range(len(df)):
        # remove "newline" and punctuation
        #print(df.loc[transcript,'content'])
        df.loc[transcript,'content'] = df.loc[transcript,'content'].replace("\n","").replace(".","")

    sta = datetime.datetime.now()
    df['content'],nounss = preprocess_pipe(df['content'],nounss)#,non_nouns)
    print("lemmatizing+filtering out nouns needs:",(datetime.datetime.now()-sta).total_seconds(),"seconds")

    for i in range(len(nounss)):
        nounss[i] = [noun.lower() for noun in nounss[i]]

    manual = [[['wetter'],['sonne','regen','wind','schnee','schauer','luft','wolken','gewitter','gewittern']],
                 [['himmelsrichtung'],['norden','süden','osten','westen']]]
    start_manual=datetime.datetime.now()
    for i in range(len(nounss)):
        for j in range(len(nounss[i])):
            for k in manual:
                for l in range(len(k[1])):
                    if k[1][l]==nounss[i][j]:
                        nounss[i][j] = k[0][0]
    end_manual=datetime.datetime.now()
    print("replacing e.g. 'sonne','regen'... by 'wetter' needs:",
          (end_manual-start_manual).total_seconds(),"seconds")    
        
    manualStopWords = manual_stopwords
    removeWords = []
    for i in range(len(nounss)):
        for j in range(len(nounss[i])):
            for k in manualStopWords:
                if k in nounss[i][j]:
                    #print("nounss in manualS",nounss[i])
                    removeWords.append(nounss[i][j])
    removeWords = list(dict.fromkeys(removeWords))

    for i in range(len(nounss)):
        nounss[i] = [word for word in nounss[i] if word not in removeWords]
        
    for i in range(len(nounss)):
        nounss[i] = (','.join(nounss[i])).replace(','," ")
    
    #merge all docs into dataframe
    for i in range(len(df)):
        df.at[i,'preprocessed'] = nounss[i]
    return df,df1

def print_workcloud(processed_transcripts):
    
    # used from tutorial to create a workcloud
    all_documents_as_one_string = ','.join(list(processed_transcripts.values))
    print(len(all_documents_as_one_string))
    #all_documents_as_one_string = ','.join(list(df['content'].values))
    wordcloud = WordCloud(width=800,height=400,background_color="black", max_words=500, contour_width=3, contour_color='steelblue')
    wordcloud.generate(all_documents_as_one_string)
    time.sleep(0.3)
    display(wordcloud.to_image())

def plot_25_most_common_words(count_data, count_vectorizer):
    words = count_vectorizer.get_feature_names()
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts+=t.toarray()[0]
    
    count_dict = (zip(words, total_counts))
    count_dict = sorted(count_dict, key=lambda x:x[1], reverse=True)[0:25]
    #print((count_dict))
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words)) 
    #print(x_pos)
    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='25 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    dfPlot = pd.DataFrame(data={'words':words,'counts':counts})
    #display(dfPlot)
    sns.barplot(x=words,y=counts,data=dfPlot,palette='husl')
    plt.xticks(x_pos, words, rotation=90) 
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()


def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % (int(topic_idx)+1)," ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    

def train_lda(df,count_vectorizer,count_data,number_topics=10,number_words=3,ngram_tuple=(1,1),stop_words=None):#stopwords.words('german'))
    # n_jobs=-1 to use all processors, max_iter=50 randomly chosen, not sure about it
    lda = LDA(n_components=n_topics, n_jobs=-1)
    lda.fit(count_data)
    print("LDA Topics:")
    print_topics(lda, count_vectorizer, n_words)
    return lda
print("done")

done


In [None]:
%%time
path = "/home/sim/all/Master/Forschungspraktikum/Tagesschau/transcripts/"
n_cpu_cores = 13 #int(os.cpu_count()-1)
print("number cpu_cores to use:",n_cpu_cores)

#load s2t transcriptions
path = "/home/sim/all/Master/Forschungspraktikum/Tagesschau/transcripts/"
transcriptions = load_transcriptions(path)
keys = ['2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020']
years_dict = {}
for year in keys: 
    years_dict[year] = []

# lda for certain dates
daterange = pd.date_range(datetime.datetime(2007, 1, 1), datetime.datetime(2020, 12, 31))
dates_of_interest = []
for date in daterange:
    for i in transcriptions:
        if 'tagesschau' in i and 'vor' not in i:
            year = int((i.split('_')[1][4:8]))
            if year==date.year:
                month = int((i.split('_')[1][2:4]))
                if month == date.month:
                    day = int((i.split('_')[1][0:2]))
                    if day == date.day:
                        years_dict[str(year)].append(i)
                        dates_of_interest.append(i)
print("There are ",len(dates_of_interest),"transcriptions in given daterange")

#for year in years_dict.keys():
    #print(year)
    #print(len(years_dict[str(year)]))

#"""
for year in years_dict.keys():
    print("\npreprocess transcriptions of year:",year,"quantity:",len(years_dict[str(year)]))
    if len(years_dict[str(year)]) == 0:
        print("no transcription for year:",year)
        continue
    start = datetime.datetime.now()
    #transcriptions = transcriptions[i*50:(i+1)*50]
    transcriptions = years_dict[str(year)]
    # maybe: more threads -> more RAM so in case of low ram use less cores

    manual_stopwords = ['wetter','ziel','zahl','stunden','weg','fernsehen','präsident'
                        'damen','herren','menschen','land','abend','grad','nacht','euro','geld','regierung',
                       'millionen','unternehmen','tagesthemen','angaben','präsident','himmelsrichtung',
                       'milliarden','tagesschau','woche','wochen','leben']
    df_processed,df1 = preprocess_transcriptions(transcriptions,number_cpu_cores=n_cpu_cores,manual_stopwords=manual_stopwords)
    #print_workcloud(df_processed['preprocessed'])

    # Initialise count vectorizer
    count_vectorizer = CountVectorizer(ngram_range=(1,1))
    # Fit and transform the processed titles
    count_data = count_vectorizer.fit_transform(df_processed['preprocessed'])
    #params for LDA
    n_topics = 10 #alpha
    n_words = 5 #beta

    # Visualise the 10 most common words
    #plot_25_most_common_words(count_data, count_vectorizer)
    
    startLDA = datetime.datetime.now()
    """
    # with gensim lda
    for i in range(len(df_processed['preprocessed'])):
        df_processed.at[i,'preprocessed'] = df_processed.loc[i,'preprocessed'].split(" ")
    dictionary = corpora.Dictionary(df_processed['preprocessed'])
    bow_corpus = [dictionary.doc2bow(doc) for doc in df_processed['preprocessed']]
    lda_model = models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=20, workers=n_cpu_cores)
    endLDA = datetime.datetime.now()
    print("lda training needs:",(endLDA-startLDA).total_seconds(),"seconds")
    for idx, topic in lda_model.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx+1, re.sub('[^A-Za-z ]+', '', topic).replace("  "," ")))
        #print('Topic: {} \nWords: {}'.format(idx,topic))
    
    #"""
    # with sklearn lda
    lda_model = train_lda(df_processed,count_vectorizer,count_data,number_topics=n_topics,number_words=n_words,ngram_tuple=(2,3),stop_words=None)
    print_topics(lda_model, count_vectorizer, n_words)
    del df_processed
    del count_data
    del lda_model
    #print("loop needed:",(end-start).total_seconds(),"seconds")
#"""

number cpu_cores to use: 13
There are  2566 transcriptions in given daterange

preprocess transcriptions of year: 2007 quantity: 263
lemmatizing+filtering out nouns needs: 12.630225 seconds
replacing e.g. 'sonne','regen'... by 'wetter' needs: 0.134651 seconds
LDA Topics:
Topic #1: polizei mitte koalition opposition soldaten
Topic #2: gipfel polizei union doping entscheidung
Topic #3: partei soldaten union eu mitte
Topic #4: soldaten partei bahn einsatz arbeit
Topic #5: kritik us partei länder bahn
Topic #6: bahn mindestlohn union koalition kinder
Topic #7: koalition union mindestlohn soldaten thema
Topic #8: bahn gipfel union streik kanzlerin
Topic #9: kinder bahn polizei politik damen
Topic #10: kinder soldaten richtung familien nordosten
Topic #1: polizei mitte koalition opposition soldaten
Topic #2: gipfel polizei union doping entscheidung
Topic #3: partei soldaten union eu mitte
Topic #4: soldaten partei bahn einsatz arbeit
Topic #5: kritik us partei länder bahn
Topic #6: bahn mind