## MDS Thesis
#### 01 Pre-processing

<hr style="opacity: 0.5">

### Setup

In [45]:
# install if needed
#!pip install transformers

# load libraries
import os
import pandas as pd
import numpy as np
import nltk
import re
from transformers import pipeline
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm

In [43]:
# manually downloaded german stopwords (in terminal)
#python -m nltk.downloader stopwords punkt

# find location of nltk data
#print(nltk.data.path)

# set path to nltk data
nltk.data.path.append("/Users/varvarailyina/nltk_data/corpora/stopwords")

# import german stopwords
stop_words = set(stopwords.words("german"))

# output first 10
list(stop_words)[:10]

['könnte',
 'unsere',
 'einem',
 'ins',
 'ihre',
 'aber',
 'unser',
 'hin',
 'manchen',
 'um']

In [44]:
# set wd
os.getcwd()

'/Users/varvarailyina/hertie/mds_thesis/scripts'

In [60]:
# load monthly data
df_monthly = pd.read_csv("../data/in/partypress/csv/monthly_agendas.csv")
#df_monthly = df_monthly[df_monthly["issue"] == 9]

# load partypress data
df_partypress = pd.read_csv("../data/in/partypress/csv/partypress.csv")

# load texts data
df_texts = pd.read_csv("../data/in/partypress/csv/partypress_texts.csv")

<hr style="opacity: 0.2">

### Data pre-processing

#### 1. clean data, merge data

In [21]:
# filter data for germany
df_issues = df_partypress[df_partypress["country_name"] == "germany"]
df_texts = df_texts[df_texts["country_name"] == "germany"]

In [22]:
# merge data
df_DE = df_issues.merge(df_texts, on=["country_name", "id"], how="left")

df_DE = df_DE[[
    "country_name", "id", "party", "date", "month_start", "month_end", "month", "calendar_week",
    "week_start", "week_end", "issue_ridge", "issue_super", "header", "text"
]]

df_DE.head()

Unnamed: 0,country_name,id,party,date,month_start,month_end,month,calendar_week,week_start,week_end,issue_ridge,issue_super,header,text
0,germany,18020,FDP,2010-01-04,2010-01-01,2010-01-31,1001,1,2010-01-04,2010-01-10,98,98,Marc Jungnickel neuer Pressesprecher der FDP-B...,. Die Vorsitzende der FDP-Bundestagsfraktion B...
1,germany,18021,FDP,2010-01-04,2010-01-01,2010-01-31,1001,1,2010-01-04,2010-01-10,5,20,Steinmeier leidet an Politikamnesie,. Zu den heutigen Äußerungen von SPD-Fraktions...
2,germany,18042,FDP,2010-01-05,2010-01-01,2010-01-31,1001,1,2010-01-04,2010-01-10,1,1,Nur Entlastung garantiert Erfolg bei Bekämpfun...,. Zu den aktuellen Arbeitslosenzahlen erklärt ...
3,germany,18065,FDP,2010-01-06,2010-01-01,2010-01-31,1001,1,2010-01-04,2010-01-10,192,192,Westbalkan und Türkei müssen ihre Beitrittsper...,". Zu dem Vorstoß der CSU-Landesgruppe, die Dis..."
4,germany,18083,FDP,2010-01-07,2010-01-01,2010-01-31,1001,1,2010-01-04,2010-01-10,6,6,Die Sechsjährige Grundschule ist kein Erfolgsm...,. Zu den Äußerungen des Hamburger Bürgermeiste...


#### 2. pre-process the text

In [48]:
# define function to preprocess text
def preprocess_text(text):
    text = text.lower()  # lowercase
    tokens = word_tokenize(text, language="german")  # tokenize
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]  # remove stopwords
    return " ".join(tokens)

In [49]:
# apply preprocessing
df_DE["processed_text"] = df_DE["text"].apply(preprocess_text)

# output results
print(df_DE[["text", "processed_text"]].iloc[:5])

                                                text  \
0  . Die Vorsitzende der FDP-Bundestagsfraktion B...   
1  . Zu den heutigen Äußerungen von SPD-Fraktions...   
2  . Zu den aktuellen Arbeitslosenzahlen erklärt ...   
3  . Zu dem Vorstoß der CSU-Landesgruppe, die Dis...   
4  . Zu den Äußerungen des Hamburger Bürgermeiste...   

                                      processed_text  
0  vorsitzende birgit homburger teilt marc jungni...  
1  heutigen äußerungen steinmeier erklärt stellve...  
2  aktuellen arbeitslosenzahlen erklärt arbeitsma...  
3  vorstoß diskussion erweiterung eu länder westb...  
4  äußerungen hamburger bürgermeisters ole beust ...  


In [61]:
# save as .csv
df_DE.to_csv("../data/out/df_clean.csv", index=False)

#### 3. document-feature matrix (DFM)

_DFM_: a structured representation of text data in numerical form, used in NLP to analyze word frequencies across documents

In [52]:
vectorizer = CountVectorizer()
dfm = vectorizer.fit_transform(df_DE["processed_text"])

# convert to df for analysis
df_dfm = pd.DataFrame(dfm.toarray(), columns=vectorizer.get_feature_names_out())

# output top words
print(df_dfm.head())

   00  000  0000  000mal  001  001im  001über  007  008  00pressekonferenz  \
0   0    0     0       0    0      0        0    0    0                  0   
1   0    0     0       0    0      0        0    0    0                  0   
2   0    0     0       0    0      0        0    0    0                  0   
3   0    0     0       0    0      0        0    0    0                  0   
4   0    0     0       0    0      0        0    0    0                  0   

   ...  ürgerten  üringen  ürümqi  šefcovic  šemeta  šešelj  šešeljevci  \
0  ...         0        0       0         0       0       0           0   
1  ...         0        0       0         0       0       0           0   
2  ...         0        0       0         0       0       0           0   
3  ...         0        0       0         0       0       0           0   
4  ...         0        0       0         0       0       0           0   

   šešeljs  štefan  žatec  
0        0       0      0  
1        0       0      

#### 4. remove most frequent and rare words

In [58]:
# choose words in at least 20% and at most 70% of documents
min_docfreq = 0.2 * len(df_dfm)
max_docfreq = 0.7 * len(df_dfm)

# sum up word occurances across documents
word_counts = df_dfm.sum(axis=0)

# filter out frequent / rare words
df_mft = df_dfm.loc[:, (word_counts >= min_docfreq) & (word_counts <= max_docfreq)]

# output top words
print(df_mft.head())

   bereits  dafür  deutschen  deutschland  endlich  erklärt  euro  \
0        0      0          0            0        0        0     0   
1        0      0          0            0        0        1     0   
2        0      0          0            0        0        1     0   
3        0      0          0            0        0        1     0   
4        0      0          0            0        0        1     0   

   europäischen  frage  fraktion  ...  mehr  menschen  millionen  müssen  \
0             0      0         0  ...     0         0          0       0   
1             0      0         0  ...     0         1          0       0   
2             0      0         0  ...     0         0          0       0   
3             3      1         0  ...     0         0          0       1   
4             0      0         0  ...     0         0          0       0   

   prozent  schon  seit  spd  sprecher  sprecherin  
0        0      0     0    0         0           0  
1        0      0     

In [62]:
# save as .csv
df_mft.to_csv("../data/out/df_mft.csv", index=False)