## MDS Thesis
#### 01. Pre-process the PARTYPRESS data

<br>
<hr style="opacity: 0.5">

### Setup

In [2]:
# load libraries
# -- standard library
import os
import pickle

# -- third-party
import nltk
import pandas as pd
from datasets import Dataset
from nltk.tokenize import sent_tokenize

# fetch the Punkt models that `sent_tokenize` relies on
# (skipped by NLTK when the package is already up to date)
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/varvarailyina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# check wd
# The relative "../data/..." paths below are resolved against the working
# directory, so actually display it (a bare `os.getcwd()` in the middle of a
# cell is evaluated and thrown away).
print(os.getcwd())

# load labels data
df_partypress = pd.read_csv("../data/in/partypress/csv/partypress.csv")

# load text data
df_texts = pd.read_csv("../data/in/partypress/csv/partypress_texts.csv")

<hr style="opacity: 0.5">

### Pre-process data

-- *Merge PARTYPRESS datasets*

In [3]:
# check indices: one line per frame, labels first, then texts
# NOTE(review): on a freshly loaded frame the index is the default row
# counter, so this is True by construction -- confirm whether the intent
# was to test the `id` key for duplicates instead.
print(
    df_partypress.index.is_unique,
    df_texts.index.is_unique,
    sep="\n",
)

True
True


In [4]:
# set merging index to be 'id'
# Guarded and non-inplace so the cell can be re-run: once `id` is the index
# it is no longer a column, and a second `set_index('id')` raises KeyError.
if df_partypress.index.name != 'id':
    df_partypress = df_partypress.set_index('id')
if df_texts.index.name != 'id':
    df_texts = df_texts.set_index('id')

In [5]:
# merge datasets based on id and country_name
# inner join: only press releases present in both the labels and the texts
# frame survive
df = pd.merge(
    left=df_partypress,
    right=df_texts,
    how='inner',
    on=['id', 'country_name'],
)

In [6]:
# check var names of the merged frame
# (in the recorded output `id` is not listed: it is carried as the index)
print(df.columns)

Index(['country_name', 'parlgov_id', 'party', 'party_name',
       'party_name_english', 'family_name', 'date', 'month', 'month_start',
       'month_end', 'calendar_week', 'week_start', 'week_end', 'header',
       'issue_multi', 'issue_mono', 'issue', 'issue_coder2', 'position',
       'position_coder2', 'cv_sample', 'issue_ridge', 'issue_super', 'text'],
      dtype='object')


-- *Filter data*

In [7]:
# keep a row only if both conditions hold:
#   1. it is a German press release
#   2. its `issue_mono` code (Monolingual Transformer label) is neither 98
#      nor 99, i.e. the release was assigned an issue
df_de = df.loc[
    df['country_name'].eq('germany')
    & ~df['issue_mono'].isin([98, 99])
]

In [8]:
# summary of the filtered German subset: entries, columns, non-null counts, dtypes
df_de.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43685 entries, 18021 to 10366
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country_name        43685 non-null  object 
 1   parlgov_id          43685 non-null  float64
 2   party               43685 non-null  object 
 3   party_name          43685 non-null  object 
 4   party_name_english  43685 non-null  object 
 5   family_name         43685 non-null  object 
 6   date                43685 non-null  object 
 7   month               43685 non-null  int64  
 8   month_start         43685 non-null  object 
 9   month_end           43685 non-null  object 
 10  calendar_week       43685 non-null  int64  
 11  week_start          43685 non-null  object 
 12  week_end            43685 non-null  object 
 13  header              43685 non-null  object 
 14  issue_multi         43685 non-null  int64  
 15  issue_mono          43685 non-null  int64  
 16  

In [12]:
# save as .csv
# `id` lives in the index of `df_de` (it was made the index before the merge
# and is not among the columns), so the index has to be written out;
# with index=False the press-release ids were silently dropped from the file.
df_de.to_csv("../data/out/df_de.csv", index=True, index_label="id")

# load `df_de` data (restores `id` as the index)
#df_de = pd.read_csv("../data/out/df_de.csv", index_col="id")

-- *Sample data (TODO: remove this step once the pipeline is run on the full `df_de`)*

In [9]:
# size of the random development subset and the seed that makes it repeatable
# set N_SAMPLE = None to carry every row of `df_de` through the notebook
N_SAMPLE = 8800
SEED = 42

if N_SAMPLE is None:
    # independent copy so later steps cannot alter `df_de` through `df_sample`
    df_sample = df_de.copy()
else:
    # `sample` raises ValueError when asked for more rows than exist
    df_sample = df_de.sample(n=min(N_SAMPLE, len(df_de)), random_state=SEED)

In [10]:
# look at a press release: full text of the first row of the sample
print(df_sample['text'].iloc[0])

Anlässlich der aktuellen Entwicklungen in der Syrien-Krise erklärt der Vorsitzende SPD-Bundestagsfraktion Frank-Walter Steinmeier:   Es ist ein Hoffnungsschimmer in letzter Minute. Innerhalb von nur 24 Stunden haben Äußerungen von US-Außenminister Kerry, seinem russischen Kollegen Lawrow und dem syrischen Außenminister al-Muallim einen Ausweg aus der scheinbar unausweichlichen militärischen Logik gewiesen. Nachdem inzwischen auch Präsident Obama sich offen für den russischen Vorstoß gezeigt hat, muss die syrische Führung jetzt ihre ernsthafte Kooperationsbereitschaft unter Beweis stellen. Dabei ist die russische Regierung gefragt, ihren gesamten Einfluss geltend zu machen, um die syrischen Chemiewaffen unter internationale Kontrolle zu stellen. Dazu muss sich die internationale Staatengemeinschaft unter Führung der USA und Russlands jetzt auf ein konkretes und verifizierbares Verfahren mit verbindlichen Zeitplänen und Sanktionsmechanismen verständigen. Der UN-Sicherheitsrat muss hierüb

-- *Split press releases into sentences*

In [18]:
# define function to split press releases into sentences
def explode_sentences(df, text_col='text', language='german', tokenizer=None):
    """Split every text into sentences and return one row per sentence.

    Parameters
    ----------
    df : pandas.DataFrame
        One document per row. The frame is left untouched; a new frame is
        returned.
    text_col : str, default 'text'
        Name of the column holding the document text.
    language : str, default 'german'
        Language of the Punkt model handed to ``sent_tokenize``. The press
        releases processed in this notebook are German, and the English
        model (NLTK's default) does not know German abbreviations.
        Ignored when ``tokenizer`` is given.
    tokenizer : callable, optional
        Function mapping one text (str) to a list of sentences. Defaults to
        NLTK's ``sent_tokenize`` for ``language``.

    Returns
    -------
    pandas.DataFrame
        All columns of ``df`` repeated for each sentence, plus a ``sentence``
        column, on a fresh 0..n-1 index. If the index of ``df`` is named
        (e.g. ``id``) and no column of that name exists, it is kept as a
        column so each sentence can be traced back to its document; an
        unnamed index is dropped. Rows whose text is missing are skipped,
        because there is nothing to tokenize.
    """
    if tokenizer is None:
        def split(text):
            return sent_tokenize(text, language=language)
    else:
        split = tokenizer

    # missing values are floats (NaN) and would make the tokenizer raise
    with_text = df.dropna(subset=[text_col])

    # `assign` builds a new frame, so the caller's `df` gains no extra column
    exploded = (
        with_text
        .assign(sentences=with_text[text_col].apply(split))
        .explode('sentences')
        .rename(columns={'sentences': 'sentence'})
    )

    # a named index carries information (the document id); keep it as a column
    index_name = df.index.name
    keep_index = index_name is not None and index_name not in exploded.columns

    return exploded.reset_index(drop=not keep_index)

In [19]:
# run function on the sampled press releases
# (swap in `df_de` to process the full German subset)
df_sentences = explode_sentences(df=df_sample, text_col='text')

In [20]:
# write the sentence-level frame to disk, without the row index
df_sentences.to_csv(path_or_buf="../data/out/df_sentences.csv", index=False)

-- *Extract only sentence text*

In [21]:
# plain Python list holding just the sentence strings, in row order
sentences = df_sentences['sentence'].tolist()

In [22]:
# serialise the list of sentences to a .pkl file
with open('../data/out/sentences.pkl', mode='wb') as pkl_file:
    pickle.dump(sentences, pkl_file)

-- *ARCHIVE (not executed: export of the sampled texts as a Hugging Face `Dataset`)*

In [None]:
# pre-process texts df: keep the text of the sampled press releases,
# leaving out rows where it is missing (use `df_de` for the full subset)
texts = df_sample['text'].dropna()

# single-column frame, as `Dataset.from_pandas` takes a DataFrame
text_df = pd.DataFrame(texts)

# convert df to dataset
dataset = Dataset.from_pandas(text_df)

In [None]:
# save dataset to disk, into the directory "../data/out/dataset/"
dataset.save_to_disk("../data/out/dataset/")