In [51]:
import pandas as pd
from docx import Document
import numpy as np
import collections
import spacy
import nltk
from math import sqrt
import glob
import re
import json
from collections import Counter, OrderedDict

from wordcloud import WordCloud
from nltk.util import ngrams
from nltk.corpus import stopwords

from spacy_sentiws import spaCySentiWS
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer


# Stop words
# German stop-word list from NLTK; used further down as the filter in PrepareText.remove_stopwords.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
german_stop_words = stopwords.words('german')

# German lemmatizer
# NOTE(review): absolute, user-specific path to a de_core_news_sm 2.3.0 model directory;
# it will only resolve on a machine with this exact layout — confirm before running elsewhere.
nlp = spacy.load(r'/home/tina/anaconda3/lib/python3.8/site-packages/de_core_news_sm/de_core_news_sm-2.3.0')

# Sentiment analysis lexicon
# A second pipeline (nlp2) gets the SentiWS component appended, so `nlp` itself stays unchanged.
# NOTE(review): spacy.load('de') and add_pipe(<component object>) look like spaCy v2 conventions,
# and the SentiWS data path is again machine-specific — confirm against the pinned spaCy version.
nlp2 = spacy.load('de')
sentiws = spaCySentiWS(sentiws_path=r'/home/tina/anaconda3/lib/python3.8/site-packages/spacy_sentiws/data')
nlp2.add_pipe(sentiws)

# Pandas options
# Show up to 100 rows / 100 columns when a frame is displayed.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [52]:
# Convert docx to dataframe.
# Manifesto files, one .docx per party; the order here fixes the row order of the frame.
parties = [
    'fdp.docx',
    'gruene.docx',
    'spd.docx',
    'linke.docx',
    'afd.docx',
    'cdu.docx',
]

def text_to_df(party_list):
    """Read each .docx file and return a one-column DataFrame of their texts.

    Parameters
    ----------
    party_list : iterable of str
        Paths handed one by one to ``Document``.

    Returns
    -------
    pandas.DataFrame
        A frame with a single ``'Text'`` column; row i holds all paragraph
        texts of the i-th file joined by single spaces.
    """
    # One string per document: its paragraphs glued together with spaces.
    texts = [
        ' '.join(paragraph.text for paragraph in Document(path).paragraphs)
        for path in party_list
    ]
    return pd.DataFrame({'Text': texts})

In [53]:
# Display names for the parties, listed in the same order as the files in `parties`.
party_names = ['FDP', 'Die Grünen', 'SPD', 'Linke', 'AfD', 'CDU']

# Load the manifestos and label every row with its party.
df = text_to_df(parties)
df['Party'] = party_names
print(df)

                                                Text       Party
0    Nie gab es mehr zu tun Wie es ist, darf es n...         FDP
1   DEUTSCHLAND. ALLES IST DRIN. Programmentwurf ...  Die Grünen
2   DAS ZUKUNFTSPROGRAMM DER SPD –– WOFÜR WIR STE...         SPD
3  Zeit zu handeln: Für soziale Sicherheit, Fried...       Linke
4  Deutschland.Aber normal. Programm der Alternat...         AfD
5  Das Programm für Stabilität und Erneuerung.  G...         CDU


In [54]:
# Let's do some preliminary wrangling and cleaning.
class DataCleaner:
    """String-cleaning helpers for a single text.

    Despite the attribute name, ``df_column`` holds one string (a single cell
    of the text column), not a whole pandas column. Every method returns a new
    value and leaves ``df_column`` untouched.
    """

    # An asterisk directly between two word characters, e.g. the gender star
    # in 'Bürger*innen', 'Ärzt*in' or 'jede*r'.
    _GENDER_STAR = re.compile(r'(?<=\w)\*(?=\w)')

    def __init__(self, df_column):
        """Store the string to be cleaned."""
        self.df_column = df_column

    def remove_chars(self):
        """Return the text without leading and trailing whitespace."""
        return self.df_column.strip()

    def remove_punct(self):
        """Replace punctuation with spaces and merge gender-star forms.

        Every character that is not a word character, whitespace, ``*`` or
        ``-`` becomes a single space. Afterwards an asterisk sitting between
        two word characters is dropped so that gendered forms count as one
        word ('Bürger*innen' -> 'Bürgerinnen', 'Ärzt*in' -> 'Ärztin').
        Free-standing asterisks and hyphens are kept.

        Returns
        -------
        str
        """
        no_punct = re.sub(r'[^\w\s*-]', ' ', self.df_column)

        # Previously only the literal '*innen' was handled, which left the
        # asterisk in singular forms such as 'Ärzt*in'.
        gendered = self._GENDER_STAR.sub('', no_punct)

        # Kept for backward compatibility: '*innen' that is not attached to a
        # preceding word character (e.g. after a space) is still rewritten.
        return gendered.replace('*innen', 'innen')

    def remove_digits(self):
        """Return the text with every ASCII digit (0-9) removed."""
        return re.sub(r'[0-9]', '', self.df_column)

    def get_word_count(self):
        """Return the number of whitespace-separated tokens in the text."""
        return len(self.df_column.split())

        
        
    

In [55]:
def _clean_text(raw):
    """Trim the text, then strip punctuation, then drop digits (in that order)."""
    trimmed = DataCleaner(str(raw)).remove_chars()
    unpunctuated = DataCleaner(trimmed).remove_punct()
    return DataCleaner(unpunctuated).remove_digits()


# Overwrite the raw text with its cleaned version.
df['Text'] = df['Text'].apply(_clean_text)

# Number of whitespace-separated words in each cleaned text.
df['Word count'] = df['Text'].apply(lambda cleaned: DataCleaner(cleaned).get_word_count())


In [56]:
# Display the frame: cleaned text, party label and word count per manifesto.
df

Unnamed: 0,Text,Party,Word count
0,Nie gab es mehr zu tun Wie es ist darf es nic...,FDP,29405
1,DEUTSCHLAND ALLES IST DRIN Programmentwurf z...,Die Grünen,47794
2,DAS ZUKUNFTSPROGRAMM DER SPD WOFÜR WIR STEH...,SPD,23428
3,Zeit zu handeln Für soziale Sicherheit Fried...,Linke,53162
4,Deutschland Aber normal Programm der Alternat...,AfD,23415
5,Das Programm für Stabilität und Erneuerung G...,CDU,44402


In [57]:
# PrepareText tokenizes a text, removes stop words, extracts lemmas, and builds bag-of-words and POS-tag counts.

class PrepareText:
    """Tokenising, lemmatising and counting helpers for one manifesto.

    The constructor argument is stored unchanged in ``df_column``. Which type
    it must have depends on the method that is called afterwards: a string for
    ``remove_stopwords``, ``get_pos_bow`` and ``get_pos_dict``; an iterable of
    token strings for ``get_lemmas`` and ``get_bow``.
    """

    def __init__(self, df_column):
        """Store the text (or token list) the other methods operate on."""
        self.df_column = df_column

    def remove_stopwords(self):
        """Split the text into tokens and drop German stop words.

        Splitting uses ``str.split()`` without an argument, so tabs and line
        breaks separate tokens as well as spaces. This matches the way
        ``DataCleaner.get_word_count`` counts words and prevents stop words
        glued to a tab/newline from slipping through. Empty tokens cannot
        occur with this form of ``split``.

        Returns
        -------
        list of str
            Tokens in original order and original casing; the stop-word
            comparison itself is case-insensitive.
        """
        # A set makes each membership test constant-time instead of scanning the list.
        stop_words = set(german_stop_words)
        return [word for word in self.df_column.split() if word.lower() not in stop_words]

    def get_lemmas(self):
        """Return the spaCy lemma of every token, in order.

        Each entry of ``df_column`` is processed as its own document, so an
        entry that spaCy splits into several tokens contributes several lemmas.

        Returns
        -------
        list of str
        """
        # nlp.pipe batches the entries instead of invoking the pipeline once per token.
        return [token.lemma_ for doc in nlp.pipe(self.df_column) for token in doc]

    def get_bow(self):
        """Count the items in ``df_column``.

        Works on whatever token sequence was passed in (lemmas or plain tokens).

        Returns
        -------
        collections.OrderedDict
            Item -> number of occurrences, most frequent first.
        """
        return OrderedDict(Counter(self.df_column).most_common())

    def get_pos_bow(self):
        """Count the coarse POS tags spaCy assigns to the text.

        Returns
        -------
        collections.OrderedDict
            POS tag -> number of tokens with that tag, most frequent first.
        """
        pos_tags = [token.pos_ for token in nlp(self.df_column)]
        return OrderedDict(Counter(pos_tags).most_common())

    def get_pos_dict(self):
        """Count ``(POS tag, token text)`` pairs in the text.

        Returns
        -------
        collections.OrderedDict
            ``(pos, text)`` tuple -> number of occurrences, most frequent first.
        """
        tagged_words = [(token.pos_, token.text) for token in nlp(self.df_column)]
        return OrderedDict(Counter(tagged_words).most_common())

In [58]:
# Per manifesto: split the cleaned text into tokens and filter out German stop words.
df['Text: No Stopwords'] = df['Text'].apply(
    lambda text: PrepareText(text).remove_stopwords()
)

In [59]:
# Per manifesto: lemmatize the stop-word-free token list with spaCy.
df['Lemmas'] = df['Text: No Stopwords'].apply(
    lambda tokens: PrepareText(tokens).get_lemmas()
)

In [60]:
# Per manifesto: token frequencies, computed on the stop-word-free tokens (not on 'Lemmas').
df['BoW'] = df['Text: No Stopwords'].apply(
    lambda tokens: PrepareText(tokens).get_bow()
)

In [61]:
# Per manifesto: how often each POS tag occurs in the cleaned text.
df['Pos BoW'] = df['Text'].apply(
    lambda text: PrepareText(text).get_pos_bow()
)


In [62]:
# Per manifesto: frequencies of (POS tag, word) pairs in the cleaned text.
df['Pos Word Dict'] = df['Text'].apply(
    lambda text: PrepareText(text).get_pos_dict()
)

In [63]:
# Display the frame including the token, lemma, bag-of-words and POS columns added above.
df

Unnamed: 0,Text,Party,Word count,Text: No Stopwords,Lemmas,BoW,Pos BoW,Pos Word Dict
0,Nie gab es mehr zu tun Wie es ist darf es nic...,FDP,29405,"[Nie, gab, mehr, tun, darf, bleiben, darf, ble...","[Nie, geben, mehr, tun, dürfen, bleiben, dürfe...","{'Freie': 346, 'Demokraten': 342, 'fordern': 9...","{'NOUN': 7849, 'DET': 4089, 'ADJ': 3916, 'VERB...","{('SPACE', ' '): 3218, ('CCONJ', 'und'): 1388,..."
1,DEUTSCHLAND ALLES IST DRIN Programmentwurf z...,Die Grünen,47794,"[DEUTSCHLAND, DRIN, Programmentwurf, Bundestag...","[DEUTSCHLAND, DRIN, Programmentwurf, Bundestag...","{'mehr': 163, 'Menschen': 156, 'sollen': 126, ...","{'NOUN': 11735, 'DET': 6316, 'SPACE': 5965, 'A...","{('SPACE', ' '): 5585, ('CCONJ', 'und'): 2437,..."
2,DAS ZUKUNFTSPROGRAMM DER SPD WOFÜR WIR STEH...,SPD,23428,"[ZUKUNFTSPROGRAMM, SPD, WOFÜR, STEHEN, ANTREIB...","[ZUKUNFTSPROGRAMM, SPD, WOFÜR, STEHEN, ANTREIB...","{'müssen': 73, 'mehr': 66, 'dafür': 53, 'unter...","{'NOUN': 5891, 'DET': 3417, 'SPACE': 2985, 'AD...","{('SPACE', ' '): 2537, ('CCONJ', 'und'): 1197,..."
3,Zeit zu handeln Für soziale Sicherheit Fried...,Linke,53162,"[Zeit, handeln, soziale, Sicherheit, Frieden, ...","[Zeit, handeln, soziale, Sicherheit, Friede, K...","{'müssen': 391, 'Menschen': 219, 'mehr': 170, ...","{'NOUN': 14246, 'SPACE': 7760, 'DET': 7273, 'V...","{('SPACE', ' '): 6729, ('CCONJ', 'und'): 2707,..."
4,Deutschland Aber normal Programm der Alternat...,AfD,23415,"[Deutschland, normal, Programm, Alternative, D...","[Deutschland, normal, Programm, Alternative, D...","{'AfD': 152, 'Deutschland': 93, 'deutschen': 5...","{'NOUN': 6344, 'DET': 3826, 'SPACE': 3158, 'AD...","{('SPACE', ' '): 2493, ('CCONJ', 'und'): 950, ..."
5,Das Programm für Stabilität und Erneuerung G...,CDU,44402,"[Programm, Stabilität, Erneuerung, GEMEINSAM, ...","[Programm, Stabilität, Erneuerung, GEMEINSAM, ...","{'Deutschland': 148, 'mehr': 139, 'müssen': 13...","{'NOUN': 10430, 'SPACE': 8254, 'DET': 5863, 'A...","{('SPACE', ' '): 6132, ('CCONJ', 'und'): 2360,..."


In [65]:
# Word use: Who talks the most about what? How do parties use language?

class MostCommon():
    """Top-N extraction from the frequency structures built per manifesto.

    ``df_column`` holds one cell value. Depending on the method this must be a
    frequency-ordered mapping (most frequent first) or, for
    ``most_common_bigrams``, the text as a string. All methods take an
    optional ``n`` (default 10) limiting the number of entries returned.
    """

    def __init__(self, df_column):
        """Store the cell value the other methods operate on."""
        self.df_column = df_column

    def _top_words_with_pos(self, pos, n):
        """Return the first ``n`` words whose POS tag equals ``pos``.

        Expects ``df_column`` to map ``(pos_tag, word)`` tuples to counts.
        """
        top = {}
        for (tag, word), count in self.df_column.items():
            if len(top) >= n:
                # Limit reached; no need to scan the rest of the mapping.
                break
            # Compare the tag position only. A tuple-membership test would also
            # match a token whose *text* happens to be e.g. 'NOUN'.
            if tag == pos:
                top[word] = count
        return top

    def most_common_words(self, n=10):
        """Return the first ``n`` entries of a word -> count mapping as a dict."""
        return dict(list(self.df_column.items())[:n])

    def most_common_pos_tags(self, n=10):
        """Return the first ``n`` POS tag -> count entries, skipping 'SPACE'."""
        most_common_pos = {}
        for tag, count in self.df_column.items():
            if len(most_common_pos) >= n:
                break
            if tag != 'SPACE':
                most_common_pos[tag] = count
        return most_common_pos

    def get_most_common_nouns(self, n=10):
        """Return the first ``n`` words tagged 'NOUN' with their counts."""
        return self._top_words_with_pos('NOUN', n)

    def get_most_common_verbs(self, n=10):
        """Return the first ``n`` words tagged 'VERB' with their counts."""
        return self._top_words_with_pos('VERB', n)

    def get_most_common_adjectives(self, n=10):
        """Return the first ``n`` words tagged 'ADJ' with their counts."""
        return self._top_words_with_pos('ADJ', n)

    def most_common_bigrams(self, n=10):
        """Return the ``n`` most frequent bigrams of the text.

        Expects ``df_column`` to be a string. It is tokenized with
        ``nltk.word_tokenize`` and adjacent token pairs are counted.

        Returns
        -------
        dict
            ``'first second'`` -> count, most frequent first.
        """
        nltk_tokens = nltk.word_tokenize(self.df_column)
        top_bigrams = Counter(nltk.bigrams(nltk_tokens)).most_common(n)
        return {' '.join(pair): count for pair, count in top_bigrams}
    

In [66]:
# Summary columns to derive: (new column, source column, MostCommon method to call).
_summary_specs = [
    ('Most common words', 'BoW', 'most_common_words'),
    ('Most common POS tags', 'Pos BoW', 'most_common_pos_tags'),
    ('Most common nouns', 'Pos Word Dict', 'get_most_common_nouns'),
    ('Most common verbs', 'Pos Word Dict', 'get_most_common_verbs'),
    ('Most common adj', 'Pos Word Dict', 'get_most_common_adjectives'),
    ('Most common bigrams', 'Text', 'most_common_bigrams'),
]

# Build each summary column row-wise, in the order listed above.
for _target, _source, _method in _summary_specs:
    df[_target] = df[_source].apply(
        lambda cell, method=_method: getattr(MostCommon(cell), method)()
    )

In [67]:
# Keep only the party label, word count and summary columns for the export.
intermediate_columns = [
    'Text',
    'Text: No Stopwords',
    'Lemmas',
    'Pos Word Dict',
    'Pos BoW',
    'BoW',
]
df_new = df.drop(columns=intermediate_columns)

In [68]:
# Display the reduced frame that is exported below.
df_new

Unnamed: 0,Party,Word count,Most common words,Most common POS tags,Most common nouns,Most common verbs,Most common adj,Most common bigrams
0,FDP,29405,"{'Freie': 346, 'Demokraten': 342, 'fordern': 9...","{'NOUN': 7849, 'DET': 4089, 'ADJ': 3916, 'VERB...","{'Demokraten': 342, 'Menschen': 61, 'Unternehm...","{'wollen': 434, 'muss': 113, 'fordern': 99, 'm...","{'Freie': 346, 'europäischen': 32, 'neue': 30,...","{'Freie Demokraten': 340, 'Wir Freie': 329, 'D..."
1,Die Grünen,47794,"{'mehr': 163, 'Menschen': 156, 'sollen': 126, ...","{'NOUN': 11735, 'DET': 6316, 'ADJ': 5912, 'VER...","{'Menschen': 156, 'Unternehmen': 60, 'Frauen':...","{'wollen': 489, 'können': 170, 'sollen': 126, ...","{'neue': 67, 'europäische': 43, 'europäischen'...","{'wollen wir': 279, 'Wir wollen': 175, 'in der..."
2,SPD,23428,"{'müssen': 73, 'mehr': 66, 'dafür': 53, 'unter...","{'NOUN': 5891, 'DET': 3417, 'ADJ': 2908, 'ADP'...","{'Menschen': 50, 'Arbeit': 44, 'Gesellschaft':...","{'wollen': 104, 'müssen': 73, 'können': 63, 'u...","{'neue': 36, 'neuen': 22, 'digitalen': 21, 'so...","{'Wir werden': 167, 'werden wir': 159, 'und di..."
3,Linke,53162,"{'müssen': 391, 'Menschen': 219, 'mehr': 170, ...","{'NOUN': 14246, 'DET': 7273, 'VERB': 6726, 'AD...","{'Menschen': 219, 'Beschäftigten': 97, 'Arbeit...","{'wollen': 583, 'müssen': 391, 'muss': 254, 'k...","{'soziale': 91, 'öffentlichen': 79, 'gute': 74...","{'Wir wollen': 318, 'wollen wir': 246, 'in der..."
4,AfD,23415,"{'AfD': 152, 'Deutschland': 93, 'deutschen': 5...","{'NOUN': 6344, 'DET': 3826, 'ADJ': 2980, 'ADP'...","{'Bürger': 31, 'Familien': 30, 'Kinder': 27, '...","{'muss': 81, 'wollen': 69, 'müssen': 54, 'ford...","{'deutschen': 54, 'deutsche': 24, 'europäische...","{'Die AfD': 107, 'in der': 58, 'für die': 57, ..."
5,CDU,44402,"{'Deutschland': 148, 'mehr': 139, 'müssen': 13...","{'NOUN': 10430, 'DET': 5863, 'ADJ': 5482, 'VER...","{'Menschen': 111, 'Sicherheit': 67, 'Land': 61...","{'wollen': 483, 'können': 174, 'müssen': 138, ...","{'neue': 78, 'digitale': 50, 'stärker': 50, 'b...","{'Wir wollen': 266, 'Wir werden': 203, 'wollen..."


In [71]:
# Write the summary frame to ./export.json, keyed by row index.
# Because a target path is given, to_json writes the file and returns None,
# so `result` does not hold the JSON text.
result = df_new.to_json(
    path_or_buf='./export.json',
    orient='index',
)