# Text analysis election programs for NRW 2017

In [1]:
import re
import sys
import json
from pprint import pprint
import snowballstemmer
import pandas as pd
import glob

In [2]:
path =r'data/'
files = glob.glob(path + "/*.txt")

In [3]:
programs = []
paragraphs = {}

for file in files:
    with open(file) as f:
        programs.append(file)
        container = ''
        for line in f:
            line = line.replace('\n', '')
            if len(line) == 0:
                continue
            if container == '' and len(line) <70:
                paragraphs.setdefault('paragraphs', [])
                paragraphs['paragraphs'].append(line) 
                continue
            container += line + ' '
            if line.endswith('.'):
                paragraphs.setdefault('paragraphs', [])
                paragraphs['paragraphs'].append(container[:-1])
                container = ''
        programs.append(paragraphs)

In [4]:
def splitting_into_words(text):
    return re.split('[\s\.\,\?\!\"\']+', text)

def test_stopwords(word):
    stopwords = ('Die', 'Der', 'Mit', 'Diese', 'Deshalb', 'Mit', 'Für',
                 'Eine', 'Unsere', 'Ein', 'Das', 'Sie', 'In', 'Dazu',
                 'Mio', 'Gut', 'Kein', 'Nur', 'FÜR', 'Dafür', 'Denn', 'Darum',
                'Es', 'Im', 'DIE', 'Auch', 'Den', 'Daher', 'Damit', 'Um', 'Dies',
                'Ihnen', 'Als', 'UND', 'Und', 'Wir', 'Durch', 'So', 'Dem', 'Zudem', 'Viel', 'Dabei',
                'Darüber', 'Unser', 'Neben', 'Außerdem', 'Auf', 'Bei', 'Viele', 'Gleichzeitig',
                'Hier', 'Gerade', 'Hierzu', 'Zur', 'Aus', 'Nr', 'Seit', 'Nicht', 'An', 'Doch',
                'Wenn', 'Nach', 'Dadurch', 'Alle', 'Hierfür', 'Vor', 'Ebenso', 'Deswegen', 'Ohne',
                'Wie', 'Zusätzlich','Bis','Uns','Zum','Immer','Außer', 'Einen', 'Einige', 'Bisher',
                'Daran', 'Nachdem', 'Beim', 'Oft', 'Hinzu', 'Darin', 'Des', 'Weiteren', 'Bereits',
                'Dennoch', 'Noch', 'Keine', 'Vom', 'Jetzt', 'Diejenigen', 'Gegen', 'Unter', 'Einzelnen',
                'Jeder', 'Zweiten', 'Leider', 'Kurz', 'Vielerorts', 'Unterschiedliche', 'Eines', 'Bislang',
                'Somit', 'Sowohl', 'Zuvor', 'Während', 'Jedem', 'Gleiches', 'Drei', 'Einer', 'Solche',
                'Dritten', 'Klein', 'Stattdessen', 'Gute', 'Ver', 'Innen', 'Erstes', 'Ihrem', 'Groß')
    return word in stopwords

In [36]:
counted_words = {}
word_count = 0
stemmer = snowballstemmer.stemmer('german')

for index, paragraph in enumerate(paragraphs.values()):
    current_paragraph = paragraph
    counter = 0
    end = 0
    while len(current_paragraph) > 0 and counter < 1000:
        current_paragraph = current_paragraph[end:]
        pos = re.search(r'[\s\.\,\?\!\:\"\'\“]+', current_paragraph)
        if pos is None:
            end = len(current_paragraph)
            word = current_paragraph
        else:
            word = current_paragraph[:pos.start()]
            end = pos.end()
        if not re.match('^[A-Z]', word):
            counter += end
            continue
        if test_stopwords(word):
            counter += end
            continue
        word_count += 1
        stemmed = stemmer.stemWord(word)
        if word in counted_words.keys():
            counted_words[word]['count'] += 1
        else:
            counted_words[word] = {'count': 1, 'stem': stemmed, 'stem_count': 1}
        if stemmed in counted_words[word]['stem']:
            counted_words[word]['stem_count'] += 1
        if 'occurence' not in counted_words[word].keys():
            counted_words[word]['occurence'] = []
        counted_words[word]['occurence'].append({'paragraph_index': index, 'position': counter})
        counter += end

TypeError: expected string or bytes-like object

In [33]:
df = pd.DataFrame(counted_words)

In [34]:
df

In [7]:
df = df.transpose()

In [8]:
df.head()

Unnamed: 0,count,occurence,stem,stem_count
A,1,"[{'position': 115, 'paragraph_index': 1026}]",A,2
A2/B1,1,"[{'position': 868, 'paragraph_index': 473}]",A2/B1,2
A3,1,"[{'position': 543, 'paragraph_index': 870}]",A3,2
A4,1,"[{'position': 550, 'paragraph_index': 870}]",A4,2
AG,1,"[{'position': 407, 'paragraph_index': 338}]",AG,2


In [10]:
df['stem_share'] = (df['stem_count'] / len(df)*100)

In [None]:
# df['global_share'] = 

In [13]:
df.sort_values('stem_share', ascending=False).head(20)

Unnamed: 0,count,occurence,stem,stem_count,stem_share
NRW,395,"[{'position': 4, 'paragraph_index': 3}, {'posi...",NRW,396,5.70194
Menschen,251,"[{'position': 42, 'paragraph_index': 5}, {'pos...",Mensch,252,3.62851
GRÜNE,187,"[{'position': 543, 'paragraph_index': 4}, {'po...",GRÜNE,188,2.70698
Kommunen,117,"[{'position': 299, 'paragraph_index': 14}, {'p...",Kommun,118,1.69906
Land,101,"[{'position': 83, 'paragraph_index': 4}, {'pos...",Land,102,1.46868
Kinder,92,"[{'position': 355, 'paragraph_index': 15}, {'p...",Kind,93,1.33909
Schulen,91,"[{'position': 359, 'paragraph_index': 14}, {'p...",Schul,92,1.32469
Unternehmen,84,"[{'position': 184, 'paragraph_index': 18}, {'p...",unternehm,85,1.2239
Arbeit,77,"[{'position': 353, 'paragraph_index': 21}, {'p...",Arbeit,78,1.12311
Entwicklung,74,"[{'position': 431, 'paragraph_index': 20}, {'p...",Entwicklung,75,1.07991
