In [17]:
import os
import xml.etree.ElementTree as ET
import csv
import gensim
import nltk
from sklearn.utils import resample
import pathlib
from datetime import datetime
import re

## Laden der Bundestagsreden

Hilfsfunktion, die aus den einzelnen xml-Dateien die Wahlperiode, die Sitzungsnummer und das Datum herausfiltert

In [9]:
def get_meta_data(xml_file):
    """Read the session metadata from the TEI header of one protocol file.

    Parameters
    ----------
    xml_file
        File name or file object, passed unchanged to ``ET.parse``.

    Returns
    -------
    dict
        ``{"wahlperiode": ..., "session": ..., "date": ...}`` holding the text of
        the header elements ``legislativePeriod``, ``sessionNo`` and ``date``.

    Raises
    ------
    AttributeError
        If an element on the expected header path is missing (``find`` then
        returns ``None``).
    """
    # Both statements of interest live below teiHeader/fileDesc.
    file_desc = ET.parse(xml_file).getroot().find("teiHeader").find("fileDesc")
    title_stmt = file_desc.find("titleStmt")
    publication_stmt = file_desc.find("publicationStmt")
    return {
        "wahlperiode": title_stmt.find("legislativePeriod").text,
        "session": title_stmt.find("sessionNo").text,
        "date": publication_stmt.find("date").text,
    }

Die Bundestagsreden des PolMine-Projektes wurden unter folgendem Link heruntergeladen: https://github.com/PolMine/GermaParlTEI

Mit einem for-Loop werden die einzelnen Reden der xml-Dateien in einem dictionary abgespeichert.

In [10]:
corpus_directory = "GermaParlTEI-master"
corpus_dict = {}
count = 0

# Walk the corpus tree and store every <sp> (speech) element as one numbered record.
for path, dirs, files in os.walk(corpus_directory):
    # os.walk lists entries in an OS-dependent order; sorting (in place, so the
    # walk itself follows it) keeps the speech numbering identical between runs.
    dirs.sort()
    for file in sorted(files):
        # Only XML protocols can be parsed; any other file in the tree would make
        # ET.parse raise.
        if not file.lower().endswith(".xml"):
            continue
        xml_file = os.path.join(path, file)
        meta_data_dict = get_meta_data(xml_file)
        root = ET.parse(xml_file).getroot()
        for speaker in root.iter("sp"):
            # A <p> without leading text has .text == None, which str.join rejects.
            paragraphs = [paragraph.text for paragraph in speaker.iter("p") if paragraph.text is not None]
            corpus_dict[count] = {
                "name": speaker.get("name"),
                "role": speaker.get("role"),
                "party": speaker.get("party"),
                "text": " ".join(paragraphs),
                "wahlperiode": meta_data_dict["wahlperiode"],
                "session": meta_data_dict["session"],
                "date": meta_data_dict["date"],
            }
            count += 1

## Laden der Ländernamen

In [11]:
# Multi-word country names (lower-case); the sentence iterators join their parts
# with "_" so that each name ends up as a single token.
long_country_names = ["bosnien und herzegowina", "burkina faso", "costa rica","dominikanische republik",
                      "san marino","saudi arabien","sri lanka","vereinigte arabische emirate",
                      "sierra leone", "trinidad und tobago", "zentralafrikanische republik",
                      "demokratische republik kongo","vereinigtes königreich"]

countries_names_german = []

# newline="" is what the csv module expects from the file object it reads.
with open("Ländernamen.csv", encoding="utf-8-sig", newline="") as read_file:
    for row in csv.reader(read_file):
        # Blank lines come back as empty rows; an empty name is useless and would
        # later turn the genitive rule into "delete every s".
        if row and row[0].strip():
            countries_names_german.append(row[0])

## Trainieren der Wortvektormodelle

Klasse, um ein Wortvektormodell aus allen Bundestagsreden zu bilden.

In [20]:
class MySentences_Full():
    """Iterable over the tokenised sentences of every non-presidency speech.

    Each pass walks ``corpus_dict``, normalises the speech text (lower-casing,
    unifying country names) and yields one list of tokens per sentence. A new
    generator is created per pass, so the object can be iterated repeatedly.

    Parameters
    ----------
    corpus_dict : dict
        Maps a speech id to a record with at least the keys ``"role"`` and
        ``"text"``.
    """

    # Characters replaced by a blank before any other processing.
    _SEPARATORS = re.compile("[,'—–]")
    # Punctuation deleted from every token after tokenisation.
    _TOKEN_PUNCTUATION = re.compile("[-,.'!?;–:—\"„“/]")
    # "amerika"/"amerikas" as a stand-alone word only, so that compounds such as
    # "lateinamerika" or "amerikanisch" are left untouched.
    _AMERIKA = re.compile(r"\bamerikas?\b")
    # Literal substitutions on the lower-cased text, applied in this order.
    _PHRASE_ALIASES = (
        ("saudiarabien", "saudi_arabien"),
        ("vereinigte staaten von amerika", "usa"),
        ("vereinigten staaten von amerika", "usa"),
        ("vereinigte staaten", "usa"),
        ("vereinigten staaten", "usa"),
        ("botswana", "botsuana"),
        ("vereinigte königreich", "vereinigtes_königreich"),
        ("vereinigten königreichs", "vereinigtes_königreich"),
        ("großbritannien", "vereinigtes_königreich"),
    )

    def __init__(self,corpus_dict):
        self.corpus_dict = corpus_dict

    @staticmethod
    def _genitive_pattern(country_names):
        """Compile a pattern for "<country name>s" as a whole word.

        Returns ``None`` if ``country_names`` contains no non-empty name.
        """
        escaped = [re.escape(name.lower()) for name in country_names if name]
        if not escaped:
            return None
        # \b on both sides: only the genitive of the name itself is matched, not
        # the same letters inside another word ("malis" in "normalisierung").
        return re.compile(r"\b(" + "|".join(escaped) + r")s\b")

    @classmethod
    def _normalize_text(cls, text, long_names, genitive_pattern):
        """Lower-case ``text`` and rewrite country references to one token each."""
        text = cls._SEPARATORS.sub(" ", text)
        # Dropping hyphens joins compounds ("saudi-arabien" -> "saudiarabien").
        text = text.replace("-", "").lower()
        for phrase, alias in cls._PHRASE_ALIASES:
            text = text.replace(phrase, alias)
        # Must run after the "vereinigte staaten von amerika" aliases above.
        text = cls._AMERIKA.sub("usa", text)
        for name in long_names:
            text = text.replace(name, name.replace(" ", "_"))
        if genitive_pattern is not None:
            text = genitive_pattern.sub(r"\1", text)
        return text

    @classmethod
    def _clean_tokens(cls, tokens):
        """Strip punctuation from ``tokens`` and drop those that become empty."""
        stripped = (cls._TOKEN_PUNCTUATION.sub("", token) for token in tokens)
        return [token for token in stripped if token]

    def __iter__(self):
        # The pattern is the same for every speech, so it is built once per pass.
        genitive_pattern = self._genitive_pattern(countries_names_german)
        for speech in self.corpus_dict.values():
            if speech["role"] == "presidency":
                continue
            text = self._normalize_text(speech["text"], long_country_names, genitive_pattern)
            # The speeches are German, so the German tokenizer model is requested
            # instead of NLTK's English default.
            for raw_sentence in nltk.tokenize.sent_tokenize(text, language="german"):
                tokens = self._clean_tokens(nltk.tokenize.word_tokenize(raw_sentence, language="german"))
                if tokens:
                    yield tokens

Klasse, um ein Wortvektormodell aus Bundestagsreden eines bestimmten Zeitraums zu bilden.

In [21]:
class MySentences_Period():
    """Iterable over the tokenised sentences of the speeches of one period.

    Only non-presidency speeches dated in the four calendar years
    ``year - 3`` .. ``year`` (inclusive) are used. Each pass normalises the
    speech text (lower-casing, unifying country names) and yields one list of
    tokens per sentence. A new generator is created per pass, so the object can
    be iterated repeatedly.

    Parameters
    ----------
    corpus_dict : dict
        Maps a speech id to a record with at least the keys ``"date"``
        (``YYYY-MM-DD``), ``"role"`` and ``"text"``.
    year : int
        Last year of the period.
    """

    # Characters replaced by a blank before any other processing.
    _SEPARATORS = re.compile("[,'—–]")
    # Punctuation deleted from every token after tokenisation.
    _TOKEN_PUNCTUATION = re.compile("[-,.'!?;–:—\"„“/]")
    # "amerika"/"amerikas" as a stand-alone word only, so that compounds such as
    # "lateinamerika" or "amerikanisch" are left untouched.
    _AMERIKA = re.compile(r"\bamerikas?\b")
    # Literal substitutions on the lower-cased text, applied in this order.
    _PHRASE_ALIASES = (
        ("saudiarabien", "saudi_arabien"),
        ("vereinigte staaten von amerika", "usa"),
        ("vereinigten staaten von amerika", "usa"),
        ("vereinigte staaten", "usa"),
        ("vereinigten staaten", "usa"),
        ("botswana", "botsuana"),
        ("vereinigte königreich", "vereinigtes_königreich"),
        ("vereinigten königreichs", "vereinigtes_königreich"),
        ("großbritannien", "vereinigtes_königreich"),
    )

    def __init__(self,corpus_dict, year):
        self.corpus_dict = corpus_dict
        self.year = year

    def _in_period(self, date_text):
        """Return True if ``date_text`` (``YYYY-MM-DD``) lies in the period."""
        speech_year = datetime.strptime(date_text, "%Y-%m-%d").year
        return self.year - 3 <= speech_year <= self.year

    @staticmethod
    def _genitive_pattern(country_names):
        """Compile a pattern for "<country name>s" as a whole word.

        Returns ``None`` if ``country_names`` contains no non-empty name.
        """
        escaped = [re.escape(name.lower()) for name in country_names if name]
        if not escaped:
            return None
        # \b on both sides: only the genitive of the name itself is matched, not
        # the same letters inside another word ("malis" in "normalisierung").
        return re.compile(r"\b(" + "|".join(escaped) + r")s\b")

    @classmethod
    def _normalize_text(cls, text, long_names, genitive_pattern):
        """Lower-case ``text`` and rewrite country references to one token each."""
        text = cls._SEPARATORS.sub(" ", text)
        # Dropping hyphens joins compounds ("saudi-arabien" -> "saudiarabien").
        text = text.replace("-", "").lower()
        for phrase, alias in cls._PHRASE_ALIASES:
            text = text.replace(phrase, alias)
        # Must run after the "vereinigte staaten von amerika" aliases above.
        text = cls._AMERIKA.sub("usa", text)
        for name in long_names:
            text = text.replace(name, name.replace(" ", "_"))
        if genitive_pattern is not None:
            text = genitive_pattern.sub(r"\1", text)
        return text

    @classmethod
    def _clean_tokens(cls, tokens):
        """Strip punctuation from ``tokens`` and drop those that become empty."""
        stripped = (cls._TOKEN_PUNCTUATION.sub("", token) for token in tokens)
        return [token for token in stripped if token]

    def __iter__(self):
        # The pattern is the same for every speech, so it is built once per pass.
        genitive_pattern = self._genitive_pattern(countries_names_german)
        for speech in self.corpus_dict.values():
            if not self._in_period(speech["date"]):
                continue
            if speech["role"] == "presidency":
                continue
            text = self._normalize_text(speech["text"], long_country_names, genitive_pattern)
            # The speeches are German, so the German tokenizer model is requested
            # instead of NLTK's English default.
            for raw_sentence in nltk.tokenize.sent_tokenize(text, language="german"):
                tokens = self._clean_tokens(nltk.tokenize.word_tokenize(raw_sentence, language="german"))
                if tokens:
                    yield tokens

Trainieren des vollständigen Wortvektormodells für alle Bundestagsreden zwischen 1996 und 2016.

In [None]:
# Train one skip-gram model (sg=1) on the sentences of all speeches.
sentences = MySentences_Full(corpus_dict)
full_model = gensim.models.Word2Vec(sentences,size=300, iter=15,window=15, sg=1)
directory_path = pathlib.Path("Word2vec")
directory_path.mkdir(parents=True, exist_ok=True)
# NOTE: the file name says "iter30" although the model is trained with iter=15.
# The name is kept because the period-model cell loads the model under exactly
# this name.
model_name = pathlib.Path('word2vec_300dims_iter30_window15_skipgram_full_model')
# Pass a plain string, as every other save call in this notebook does.
# NOTE(review): some gensim releases derive companion file names from this value
# by string operations, which a Path object may not support - confirm against the
# installed version.
full_model.save(str(directory_path / model_name))

Trainieren einzelner Wortvektormodelle für Bundestagsreden vierjähriger Zeiträume. Für jeden Zeitraum werden 25 Modelle anhand unterschiedlicher, durch Bootstrapping veränderter Datensätze gebildet. Jedes Modell wird mittels des Modells des vorherigen Zeitraumes initialisiert. Das erste Modell wird mittels des vollständigen Modells initialisiert.

In [None]:
# Settings shared by all periods.
N_BOOTSTRAP_MODELS = 25
TRAINING_EPOCHS = 15
bootstrap_root = pathlib.Path("Word2vec/Bootstrapping_Chronological")
full_model_path = pathlib.Path("Word2vec/word2vec_300dims_iter30_window15_skipgram_full_model")
fixed_model_template = 'word2vec_300dims_iter15_window15_skipgram_timespan{}-{}_fixedModel'
bootstrap_model_template = 'word2vec_300dims_iter15_window15_skipgram_timespan{}-{}_bootstrap_iteration{}'

for year in range(1999,2017):
    year_path = bootstrap_root / str(year)
    year_path.mkdir(parents=True, exist_ok=True)

    # The first period starts from the full model, every later period from the
    # fixed (non-resampled) model of the preceding period.
    if year == 1999:
        initial_model_path = full_model_path
    else:
        initial_model_path = bootstrap_root / str(year - 1) / fixed_model_template.format(year-4, year-1)

    # Materialise the sentences once: they are needed for len() and for resampling.
    sentences = list(MySentences_Period(corpus_dict, year))

    # Fixed model: continue training on the unmodified sentences of the period.
    model = gensim.models.Word2Vec.load(str(initial_model_path))
    model.train(sentences, total_examples=len(sentences), epochs=TRAINING_EPOCHS)
    model_name = pathlib.Path(fixed_model_template.format(year-3, year))
    model.save(str(year_path / model_name))

    # Bootstrap models: each restarts from the same initial model and is trained
    # on a sample drawn with replacement that has as many sentences as the period.
    for bootstrap_iteration in range(N_BOOTSTRAP_MODELS):
        # A seed derived from period and iteration makes every sample repeatable
        # while still giving each model a different sample.
        bootstrap_sample = resample(sentences, replace=True, n_samples=None,
                                    random_state=year * 1000 + bootstrap_iteration)
        model = gensim.models.Word2Vec.load(str(initial_model_path))
        model.train(bootstrap_sample, total_examples=len(bootstrap_sample), epochs=TRAINING_EPOCHS)
        model_name = pathlib.Path(bootstrap_model_template.format(year-3, year, bootstrap_iteration))
        model.save(str(year_path / model_name))
        print("Timespan:{}-{}  Bootstrap_iteration:{}  Corpus Count: {}".format(year-3, year, bootstrap_iteration, model.corpus_count))

