In [1]:
from google.colab import drive

import regex as re
import pandas as pd
import numpy as np

import glob
import os.path
import numpy as np
import sys
import codecs

In [2]:
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
train_folder = "/content/drive/MyDrive/NLP/project_5_data/datasets/train-articles" 
dev_folder = "/content/drive/MyDrive/NLP/project_5_data/datasets/dev-articles"    
train_labels_file = "/content/drive/MyDrive/NLP/project_5_data/datasets/train-labels-task-si/"
dev_labels_file = "/content/drive/MyDrive/NLP/project_5_data/datasets/dev-labels-task-si"


In [4]:
def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
    """
    Read articles from files matching patterns <file_pattern> from  
    the directory <folder_name>. 
    The content of the article is saved in the dictionary whose key
    is the id of the article (extracted from the file name).
    Each element of <sentence_list> is one line of the article.
    """
    file_list = glob.glob(os.path.join(folder_name, file_pattern))
    articles = {}
    article_id_list, sentence_id_list, sentence_list = ([], [], [])
    for filename in sorted(file_list):
        article_id = os.path.basename(filename).split(".")[0][7:]
        with codecs.open(filename, "r", encoding="utf8") as f:
            articles[article_id] = f.read()
    return articles

In [5]:
file_list = glob.glob(os.path.join(train_folder, "*.txt"))
train_articles_content, train_articles_id = ([], [])
for filename in file_list:
    with open(filename, "r", encoding="utf-8") as f:
        train_articles_content.append(' '.join([line.strip() for line in f]))
        train_articles_id.append(os.path.basename(filename).split(".")[0][7:])


In [6]:
train_articles = read_articles_from_file_list(train_folder)
dev_articles = read_articles_from_file_list(dev_folder)

len(train_articles), len(dev_articles)

(371, 75)

In [7]:
TASK_3_ARTICLE_ID_COL = 0
#TASK_3_TECHNIQUE_NAME_COL = 1
TASK_3_FRAGMENT_START_COL = 1
TASK_3_FRAGMENT_END_COL = 2

def extract_article_id_from_file_name(fullpathfilename):

    regex = re.compile("article([0-9]+).*")
    return regex.match(os.path.basename(fullpathfilename)).group(1)

   
def load_annotation_list_from_folder(folder_name, techniques_names):

    file_list = glob.glob(os.path.join(folder_name, "*.labels"))
    if len(file_list)==0:
        print("Cannot load file list in folder " + folder_name)
        sys.exit()
    annotations = {}
    for filename in file_list:
        annotations[extract_article_id_from_file_name(filename)] = []
        with open(filename, "r") as f:
            for row_number, line in enumerate(f.readlines()):
                row = line.rstrip().split("\t")
                annotations[row[TASK_3_ARTICLE_ID_COL]].append((row[TASK_3_FRAGMENT_START_COL], row[TASK_3_FRAGMENT_END_COL]))

    return annotations

In [22]:
techniques_names = [ "propaganda" ]
train_annotation = load_annotation_list_from_folder(train_labels_file, techniques_names)

In [28]:
train_labels = {}
for article in train_articles.keys():
    labels = [0] * len(train_articles[article])
    for annot in train_annotation[article]:
        labels[int(annot[0]):int(annot[1])+1] = [1] * (int(annot[1]) - int(annot[0]) + 1)
    train_labels[article] = labels

In [29]:
train_labels_str = {}
with open("/content/drive/MyDrive/NLP/train_sentence_classification.txt", "w") as fout:
        
    count = 0
    sentence_count = 0
    for article_id in train_articles.keys():

        index = 0
        word_index = 0
        # labels = [0] * len(train_articles[article_id].replace('\n\n',' ').split(' '))
        labels = [0] * len(train_articles[article_id].replace('\n\n',' ').replace('\n', ' ').strip().split(' '))
        labels_str = ['O'] * len(labels)


        for sentence in train_articles[article_id].replace('\n\n', '\n').strip().split('\n'):
            sentence_is_prop = False
            sentence_count += 1
            for word in sentence.split(' '):
                if train_labels[article_id][index] == 1:
                    labels[word_index] = 1
                    labels_str[word_index] = 'I-Prop'
                    sentence_is_prop = True
                word_index += 1
                index += len(word) + 1
            # index += 1
            if sentence_is_prop:
                count += 1
                fout.write("%s\t%s\t%s\n" % (article_id, sentence, 1))
            else:
                fout.write("%s\t%s\t%s\n" % (article_id, sentence, 0))
        train_labels[article_id] = labels
        train_labels_str[article_id] = labels_str
print("Propaganda sentences: {}.  Total sentences: {} ".format(count, sentence_count))

Propaganda sentences: 4832.  Total sentences: 16690 


In [30]:
dev_annotation = load_annotation_list_from_folder(dev_labels_file, techniques_names)

dev_labels = {}
for article in dev_articles.keys():
    labels = [0] * len(dev_articles[article])
    for annot in dev_annotation[article]:
        labels[int(annot[0]):int(annot[1])] = [1] * (int(annot[1]) - int(annot[0]))
    dev_labels[article] = labels


dev_labels_str = {}

In [31]:
with open("/content/drive/MyDrive/NLP/dev_sentence_classification.txt", "w") as fout:

    count = 0
    sentence_count = 0
    for article_id in dev_articles.keys():
        index = 0
        word_index = 0

        labels = [0] * len(dev_articles[article_id].replace('\n\n','\n').replace('\n', ' ').strip().split(' '))
        labels_str = ['O'] * len(labels)
        
        first_sentence = True
        for sentence in dev_articles[article_id].replace('\n\n', '\n').strip().split('\n'):
            sentence_is_prop = False
            sentence_count += 1
            for word in sentence.split(' '):
                if dev_labels[article_id][index] == 1:
                    labels[word_index] = 1
                    labels_str[word_index] = 'I-Prop'
                    sentence_is_prop = True
                word_index += 1
                index += len(word) + 1
            if first_sentence:
                first_sentence = False
                index += 1
            if sentence_is_prop:
                count += 1
                fout.write("%s\t%s\t%s\n" % (article_id, sentence, 1))
            else:
                fout.write("%s\t%s\t%s\n" % (article_id, sentence, 0))
            # index += 1

        dev_labels[article_id] = labels
        dev_labels_str[article_id] = labels_str
print("Propaganda sentences: {}.  Total sentences: {} ".format(count, sentence_count))

Propaganda sentences: 787.  Total sentences: 3177 
