## MDS Thesis
#### 01. Pre-process the PARTYPRESS data

<br>
<hr style="opacity: 0.5">

### Setup

In [1]:
# load libraries
import pandas as pd
import os

from datasets import Dataset

In [2]:
# show the working directory; every relative path below resolves against it
# NOTE(review): the "../data/..." paths presumably expect the project's
# `scripts/` folder as working directory — confirm and os.chdir() there if needed
print(os.getcwd())

# load labels data
df_partypress = pd.read_csv("../data/in/partypress/csv/partypress.csv")

# load text data
df_texts = pd.read_csv("../data/in/partypress/csv/partypress_texts.csv")

<hr style="opacity: 0.5">

### Pre-process data

In [3]:
# Verify that the merge key (id, country_name) is unique in each table.
# The default index of a freshly read CSV is unique by construction, so
# checking `.index.is_unique` at this point would always print True.
# Prints True when no key combination is repeated.
print(not df_partypress.duplicated(subset=['id', 'country_name']).any())
print(not df_texts.duplicated(subset=['id', 'country_name']).any())

True
True


In [4]:
# move the 'id' column into the index of both tables ahead of the merge
df_partypress = df_partypress.set_index('id')
df_texts = df_texts.set_index('id')

In [5]:
# inner-join labels and texts on the shared keys `id` (index level) and `country_name`
df = pd.merge(
    df_partypress,
    df_texts,
    how='inner',
    on=['id', 'country_name'],
)

In [6]:
# print the column labels of the merged frame `df` to check the variable names
print(df.columns)

Index(['country_name', 'parlgov_id', 'party', 'party_name',
       'party_name_english', 'family_name', 'date', 'month', 'month_start',
       'month_end', 'calendar_week', 'week_start', 'week_end', 'header',
       'issue_multi', 'issue_mono', 'issue', 'issue_coder2', 'position',
       'position_coder2', 'cv_sample', 'issue_ridge', 'issue_super', 'text'],
      dtype='object')


In [7]:
# keep German press releases only, then discard those whose Monolingual
# Transformer label (`issue_mono`) is 98 or 99, i.e. releases with no issue
df_de = (
    df
    .loc[lambda frame: frame['country_name'].eq('germany')]
    .loc[lambda frame: ~frame['issue_mono'].isin([98, 99])]
)

In [8]:
# summarise `df_de`: number of entries plus per-column non-null counts and dtypes
df_de.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43685 entries, 18021 to 10366
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country_name        43685 non-null  object 
 1   parlgov_id          43685 non-null  float64
 2   party               43685 non-null  object 
 3   party_name          43685 non-null  object 
 4   party_name_english  43685 non-null  object 
 5   family_name         43685 non-null  object 
 6   date                43685 non-null  object 
 7   month               43685 non-null  int64  
 8   month_start         43685 non-null  object 
 9   month_end           43685 non-null  object 
 10  calendar_week       43685 non-null  int64  
 11  week_start          43685 non-null  object 
 12  week_end            43685 non-null  object 
 13  header              43685 non-null  object 
 14  issue_multi         43685 non-null  int64  
 15  issue_mono          43685 non-null  int64  
 16  issue

In [9]:
# save as .csv
# `id` was moved into the index before the merge, so the index has to be
# written out; with index=False the press-release identifier would be lost
df_de.to_csv("../data/out/df_de.csv", index=True)

# load `df_de` data (restores `id` as the index)
#df_de = pd.read_csv("../data/out/df_de.csv", index_col="id")

In [10]:
# randomly select 1000 press releases (fixed random_state keeps the draw reproducible)
df_sample = df_de.sample(n=1000, random_state=42)

# randomly select 4400 press releases
#df_sample = df_de.sample(n=4400, random_state=42)

# save as .csv
# `id` sits in the index, so write the index as well; otherwise the sampled
# rows could no longer be linked back to their source press releases
df_sample.to_csv("../data/out/df_sample.csv", index=True)

In [11]:
# reduce the sample to its `text` column and drop rows without text
#texts = df_de['text'].dropna()
text_df = df_sample[['text']].dropna()
texts = text_df['text']

# build a `datasets.Dataset` from the one-column frame
dataset = Dataset.from_pandas(text_df)

In [12]:
# persist `dataset` under ../data/out/dataset/ via its `save_to_disk` method
dataset.save_to_disk("../data/out/dataset/")

Saving the dataset (0/1 shards):   0%|          | 0/8800 [00:00<?, ? examples/s]