In [50]:
import pandas as pd


In [51]:
# Read the preprocessed article tables (before/after sets for Gaza and Ukraine).
def _read_articles(csv_path):
    """Load one preprocessed CSV, using its first column as the row index."""
    return pd.read_csv(csv_path, index_col=0)


gaza_after = _read_articles('../Data/gaza_textcontain_after_new_preprocessed.csv')
gaza_before = _read_articles('../Data/gaza_textcontain_before_new_preprocessed.csv')
ukraine_after = _read_articles('../Data/ukraine_textcontain_after_new_preprocessed.csv')
ukraine_before = _read_articles('../Data/ukraine_textcontain_before_new_preprocessed.csv')
# Last expression of the cell: display one of the loaded frames as a sanity check.
ukraine_before

Unnamed: 0,article_title,author,published_time,article_text,article_category_one,article_category_two,picture_description,author_title,author_description
0,"All sides should give Russia, Ukraine some spa...",Global Times,2022-02-23,Russian President Vladimir Putin on Monday sig...,OPINION,EDITORIAL,United Nations Security Council meets after Ru...,Author details not found,Author details not found
1,Ukrainian president expects no conflict with R...,Xinhua,2022-02-23,Ukrainian President Volodymyr Zelensky said o...,WORLD,EUROPE,Ukrainian President Volodymyr Zelensky attends...,Author details not found,Author details not found
2,British PM announces sanctions against Russia ...,Xinhua,2022-02-23,British Prime Minister Boris Johnson announce...,WORLD,EUROPE,British Prime Minister Boris Johnson Photo: VCG,Author details not found,Author details not found
3,NATO chief urges Russia to choose diplomacy in...,Xinhua,2022-02-23,The chief of the North Atlantic Treaty Organi...,WORLD,EUROPE,NATO Secretary General Jens Stoltenberg speaks...,Author details not found,Author details not found
4,"EU agrees Russia sanctions package, warns of f...",Xinhua,2022-02-23,The European Union (EU) is ready to take furt...,WORLD,EUROPE,European Commission President Ursula von der L...,Author details not found,Author details not found
...,...,...,...,...,...,...,...,...,...
478,Russia's upper house approves use of military ...,Xinhua,2022-02-23,"The Russian Federation Council, or the upper ...",WORLD,EUROPE,Russian President Vladimir Putin signs documen...,Author details not found,Author details not found
479,US sanctions Russia after Putin sends troops t...,Xinhua,2022-02-23,U.S. President Joe Biden on Tuesday announced...,WORLD,AMERICAS,U.S. President Joe Biden walks out from the So...,Author details not found,Author details not found
480,DPP's so-called sympathy for Ukraine 'self-ser...,Global Times,2022-02-23,"The secessionist DPP authority's so-called ""sy...",CHINA,SOCIETY,Tsai Ing-wen Photo:AFP,Author details not found,Author details not found
481,"US hardly leads the West out of helplessness, ...",Zhao Minghao,2022-02-23,The Munich Security Conference 2022 was held u...,OPINION,VIEWPOINT,Outgoing Munich Security Conference Chairman W...,Author details not found,Author details not found


In [52]:
# Drop photo-only rows and rows without article text, then renumber the index.
def _keep_text_articles(articles):
    """Return the rows that are not in the PHOTO category and have article text.

    The returned frame gets a fresh 0..n-1 index (the old index is discarded).
    """
    is_photo = articles['article_category_one'] == 'PHOTO'
    text_missing = articles['article_text'].isna()
    return articles[~(is_photo | text_missing)].reset_index(drop=True)


gaza_after = _keep_text_articles(gaza_after)
gaza_before = _keep_text_articles(gaza_before)
ukraine_after = _keep_text_articles(ukraine_after)
ukraine_before = _keep_text_articles(ukraine_before)

In [53]:
# Helper that turns arbitrary text into a string usable inside a file name.
def sanitize_filename(filename: str, max_length: int = 50):
    """Return ``filename`` made safe for use as part of a file name.

    Each of the characters ``< > : " / \\ | ? * '`` as well as newline,
    carriage return and tab is replaced by an underscore. The result is then
    cut to at most ``max_length`` characters and surrounding whitespace is
    removed (so the returned string may be shorter than ``max_length``).
    """
    forbidden = '<>:"/\\|?*\'\n\r\t'
    # Map every forbidden character to a single underscore.
    table = str.maketrans(dict.fromkeys(forbidden, '_'))
    cleaned = filename.translate(table)
    truncated = cleaned[:max_length]
    return truncated.strip()

In [54]:
import nltk, os
# export each row of a DataFrame as a nicely formatted text file plus a BRAT .ann file
def export_text_files(df: pd.DataFrame, folder_name: str, annotator_name: str):
    """Write one ``.txt``/``.ann`` file pair per row of ``df``.

    Files are written to ``../BRAT_Data/<folder_name>/<annotator_name>/``
    (created if missing) and named
    ``<published_time>_<article_category_two>_<article_title>``, where the
    category and title are passed through ``sanitize_filename``.

    The ``.txt`` file contains a metadata header (published time, title,
    categories, author, author title, author description) followed by the
    article text with one sentence per line, split by ``nltk.sent_tokenize``.
    The ``.ann`` file is created empty when it does not exist; an existing
    ``.ann`` file is left untouched so that re-running the export does not
    erase annotations that were already made.

    Args:
        df: Articles to export. The columns read are ``article_title``,
            ``article_category_one``, ``article_category_two``,
            ``published_time``, ``author``, ``author_title``,
            ``author_description`` and ``article_text``.
        folder_name: Sub-folder of ``../BRAT_Data`` for this data set.
        annotator_name: Sub-folder of ``folder_name`` for the annotator.
    """
    output_folder = f'../BRAT_Data/{folder_name}/{annotator_name}'
    os.makedirs(output_folder, exist_ok=True)
    for _, row in df.iterrows():
        title = sanitize_filename(row["article_title"])
        category = sanitize_filename(row["article_category_two"])
        # NOTE(review): rows sharing date, category and the truncated title map
        # to the same file name, so a later row would overwrite an earlier
        # .txt file -- confirm this cannot happen in the data.
        filename = f'{row["published_time"]}_{category}_{title}'
        txt_file_path = f'{output_folder}/{filename}.txt'
        ann_file_path = f'{output_folder}/{filename}.ann'
        with open(txt_file_path, 'w', encoding='utf-8') as f:
            f.write(f'Published Time: {row["published_time"]}\n')
            f.write(f'Title: {row["article_title"]}\n')
            f.write(f'Category 1: {row["article_category_one"]}; Category 2: {row["article_category_two"]}\n')
            f.write(f'Author: {row["author"]}\n')
            f.write(f'Author title: {row["author_title"]}\n')
            f.write(f'Author description: {row["author_description"]}\n')
            f.write('Text: \n')
            # One sentence per line.
            for sentence in nltk.sent_tokenize(row["article_text"]):
                f.write(sentence.strip() + '\n')
        # Make sure an .ann file exists next to the .txt file. Append mode
        # creates the file when it is missing but, unlike 'w', does not
        # truncate an existing file, so annotations survive a re-export.
        with open(ann_file_path, 'a', encoding='utf-8'):
            pass

In [55]:
# Helper that divides the articles between the two annotators.
def split_articles(df):
    """Split ``df`` into two frames by alternating rows.

    Rows are assigned by position, not by index label: the 1st, 3rd, 5th, ...
    rows go to the first frame and the 2nd, 4th, 6th, ... rows to the second.

    Returns:
        A tuple ``(agnieszka_df, ania_df)``.
    """
    rows_from_first = slice(0, None, 2)   # positions 0, 2, 4, ...
    rows_from_second = slice(1, None, 2)  # positions 1, 3, 5, ...
    return df.iloc[rows_from_first], df.iloc[rows_from_second]

In [56]:
# Split every article set between the annotators and export the halves.
def export_for_annotators():
    """Split the four article sets in two and export each half for its annotator.

    Reads the module-level frames ``gaza_after``, ``gaza_before``,
    ``ukraine_after`` and ``ukraine_before``, splits each with
    ``split_articles`` and writes the halves with ``export_text_files``.
    """
    # Output folder name -> (half for Agnieszka, half for Ania).
    halves_by_folder = {
        "Gaza_after": split_articles(gaza_after),
        "Gaza_before": split_articles(gaza_before),
        "Ukraine_after": split_articles(ukraine_after),
        "Ukraine_before": split_articles(ukraine_before),
    }

    # All of Agnieszka's folders are exported first, then all of Ania's.
    annotator_folders = ("Articles_for_Agnieszka", "Articles_for_Ania")
    for position, annotator_folder in enumerate(annotator_folders):
        for folder_name, halves in halves_by_folder.items():
            export_text_files(halves[position], folder_name, annotator_folder)


In [57]:
# Run the export: splits each of the four article sets between the two
# annotators and writes their .txt/.ann files under ../BRAT_Data/.
export_for_annotators()