## MDS Thesis
#### 01. Pre-process the PARTYPRESS data

<br>
<hr style="opacity: 0.5">

### Setup

In [2]:
# load libraries
import os
import pandas as pd
import pickle
import nltk
nltk.download('punkt')

from datasets import Dataset
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/varvarailyina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# check wd
# NOTE(review): the value of os.getcwd() is discarded here because it is not
# the last expression of the cell -- nothing is displayed by this line.
os.getcwd()
# presumably the notebook is run from the project's `scripts/` folder, since
# the paths below are relative (`../data/...`) -- TODO confirm

# load labels data
df_partypress = pd.read_csv("../data/in/partypress/csv/partypress.csv")

# load text data
df_texts = pd.read_csv("../data/in/partypress/csv/partypress_texts.csv")

<hr style="opacity: 0.5">

### Pre-process data

-- *Merge PARTYPRESS datasets*

In [3]:
# check that the row index of each frame is unique
for frame in (df_partypress, df_texts):
    print(frame.index.is_unique)

True
True


In [4]:
# set merging index to be 'id'
# The guard keeps this cell re-runnable: once 'id' has been moved from the
# columns into the index, a second `set_index('id')` would raise a KeyError.
if 'id' in df_partypress.columns:
    df_partypress = df_partypress.set_index('id')
if 'id' in df_texts.columns:
    df_texts = df_texts.set_index('id')

In [5]:
# inner join of the labels and the texts; rows are matched on `id`
# together with `country_name`
df = pd.merge(
    df_partypress,
    df_texts,
    how='inner',
    on=['id', 'country_name'],
)

In [6]:
# check var names: list the columns of the merged frame
print(df.columns)

Index(['country_name', 'parlgov_id', 'party', 'party_name',
       'party_name_english', 'family_name', 'date', 'month', 'month_start',
       'month_end', 'calendar_week', 'week_start', 'week_end', 'header',
       'issue_multi', 'issue_mono', 'issue', 'issue_coder2', 'position',
       'position_coder2', 'cv_sample', 'issue_ridge', 'issue_super', 'text'],
      dtype='object')


-- *Filter data*

In [38]:
# filter for germany
is_german = df['country_name'] == 'germany'

# filter out press releases with no issue (use Monolingual Transformer);
# `~` negates the boolean mask instead of comparing it with `== False`
has_issue = ~df['issue_mono'].isin([98, 99])

# apply both conditions in one pass; .copy() detaches the result from `df`
# before new columns are added to it further down
df_de = df[is_german & has_issue].copy()

In [8]:
# inspect row count, column dtypes and non-null counts of the filtered frame
df_de.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43685 entries, 18021 to 10366
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country_name        43685 non-null  object 
 1   parlgov_id          43685 non-null  float64
 2   party               43685 non-null  object 
 3   party_name          43685 non-null  object 
 4   party_name_english  43685 non-null  object 
 5   family_name         43685 non-null  object 
 6   date                43685 non-null  object 
 7   month               43685 non-null  int64  
 8   month_start         43685 non-null  object 
 9   month_end           43685 non-null  object 
 10  calendar_week       43685 non-null  int64  
 11  week_start          43685 non-null  object 
 12  week_end            43685 non-null  object 
 13  header              43685 non-null  object 
 14  issue_multi         43685 non-null  int64  
 15  issue_mono          43685 non-null  int64  
 16  

-- *Add issue labels*

In [39]:
# specify issue labels as (numeric issue code, readable label) pairs
_issue_pairs = [
    (1, "Macroeconomics"),
    (2, "Civil Rights"),
    (3, "Health"),
    (4, "Agriculture"),
    (5, "Labor"),
    (6, "Education"),
    (7, "Environment"),
    (8, "Energy"),
    (9, "Immigration"),
    (10, "Transportation"),
    (12, "Law and Crime"),
    (13, "Social Welfare"),
    (14, "Housing"),
    (15, "Domestic Commerce"),
    (16, "Defense"),
    (17, "Technology"),
    (18, "Foreign Trade"),
    (20, "Government Operations"),
    (23, "Culture"),
    (191, "International Affairs"),
    (192, "European Integration"),
]

# lookup table: issue code -> label (insertion order follows the list above)
issue_mapping = dict(_issue_pairs)

In [40]:
# add `issue_label` column: the readable label for each `issue_mono` code
issue_labels = df_de['issue_mono'].map(issue_mapping)
df_de = df_de.assign(issue_label=issue_labels)

In [42]:
# columns to keep for the rest of the pipeline
needed_vars = [
    'country_name',
    'party',
    'party_name',
    'family_name',
    'date',
    'month',
    'calendar_week',
    'issue_mono',
    'issue_label',
    'header',
    'text',
]

# clean `df_de`: drop every column that is not listed above
df_de = df_de.loc[:, needed_vars]

-- *Remove outliers (based on word count in press releases)*

In [43]:
# word count per press release (number of whitespace-separated tokens)
word_counts = df_de['text'].str.split().apply(len)
df_de['n_words'] = word_counts

# cutoffs at the 2.5% and 97.5% quantiles of the word count
lower_bound, upper_bound = (word_counts.quantile(q) for q in (0.025, 0.975))

# keep only press releases whose length lies within the cutoffs (inclusive)
within_range = df_de['n_words'].between(lower_bound, upper_bound)
df_de = df_de[within_range].copy()

In [44]:
# make sure the output folder exists; `to_csv` does not create missing
# directories and would otherwise fail on a fresh checkout
os.makedirs("../data/out", exist_ok=True)

# save as .csv (row index is not written)
df_de.to_csv("../data/out/df_de.csv", index=False)

# load `df_de` data
#df_de = pd.read_csv("../data/out/df_de.csv")

-- *Sample data (remove this later)*

In [45]:
# number of press releases to draw (1000 was used for quicker test runs)
N_SAMPLE = 8800

# fixed seed so that the same subset is drawn on every run
SAMPLE_SEED = 42

# randomly select `N_SAMPLE` press releases
df_sample = df_de.sample(n=N_SAMPLE, random_state=SAMPLE_SEED)

In [46]:
# look at a press release: print the full text of the first sampled row
print(df_sample['text'].iloc[0])

Anton Hofreiter, Fraktionsvorsitzender, ruft zur Teilnahme an der Demonstration „Der Agrarindustrie die Stirn bieten! Wir haben es satt!“ auf und erklärt zum Start der Internationalen Grünen Woche:  Minister Schmidt verpasst eine Chance. Die Internationale Grüne Woche wäre der richtige Anlass, um die notwendige Agrarwende anzukündigen. Doch der Minister setzt lieber auf ein Weiterso, mit dem das Bauernhofsterben befördert, die Natur weiter zerstört und die Gesundheit von Mensch und Tier vernachlässigt wird. Es ist nicht akzeptabel, dass Minister Schmidt den Glyphosat-Ausstieg ganz offensichtlich weiter aussitzen und das Tierleid bloß mit einem staatlichen Alibi-Labelchen überkleben will. Auch das Sondierungsergebnis von Union und SPD beweist: Schwarz-Rot will ein grundlegendes Umsteuern verhindern.  Dabei wäre eine echte Agrarwende dringend nötig, damit Artensterben, Hofaufgaben, Tierleid und nitratbelastetes Grundwasser endlich Vergangenheit sind. Statt eines Wischi-Waschi-Labels von 

-- *Split press releases into sentences*

In [47]:
# define function to split press releases into sentences
def explode_sentences(df, text_col='text', language='german', tokenizer=None):
    """Split each text into sentences and return one row per sentence.

    The input frame is left untouched; all of its columns are repeated for
    every sentence of the corresponding text, and the sentence itself is
    stored in a new `sentence` column. The result has a fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts to split.
    text_col : str, default 'text'
        Name of the column that contains the texts.
    language : str, default 'german'
        Language of the Punkt model passed to NLTK's `sent_tokenize`.
        German is the default because the notebook filters the corpus to
        German press releases; without this argument NLTK falls back to its
        English model. Ignored when `tokenizer` is given.
    tokenizer : callable, optional
        Function mapping one text to a list of sentence strings. When
        omitted, `sent_tokenize(text, language=language)` is used.

    Returns
    -------
    pandas.DataFrame
        One row per sentence, with the original columns plus `sentence`.
    """
    if tokenizer is None:
        def tokenizer(text):
            return sent_tokenize(text, language=language)

    # list of sentences for every text, aligned with the rows of `df`
    sentence_lists = df[text_col].apply(tokenizer)

    # `assign` builds a new frame, so the caller's `df` does not gain a
    # helper column as a side effect; each sentence then gets its own row
    df_exploded = (
        df.assign(sentence=sentence_lists)
        .explode('sentence')
        .reset_index(drop=True)
    )

    return df_exploded

In [48]:
# run function on the sample
# (swap in the commented line to process the full `df_de` instead)
#df_sentences = explode_sentences(df_de)
df_sentences = explode_sentences(df_sample)

In [49]:
# write the sentence-level table to disk as .csv (row index omitted)
sentences_csv = "../data/out/df_sentences.csv"
df_sentences.to_csv(sentences_csv, index=False)

-- *Extract only sentence text*

In [50]:
# pull the sentence strings out of the frame into a plain Python list
sentences = df_sentences['sentence'].tolist()

In [51]:
# serialise the list of sentences to a .pkl file
sentences_pkl = '../data/out/sentences.pkl'
with open(sentences_pkl, 'wb') as pkl_file:
    pickle.dump(sentences, pkl_file)