In [None]:
! pip install pandas tqdm nltk matplotlib

In [3]:
import pandas as pd
import nltk
import matplotlib as mpl 
import matplotlib.pyplot as plt 
from tqdm import tqdm

# Fetch the 'punkt' resource for NLTK and register tqdm's progress_apply
# on pandas objects (used by the count helper further down).
nltk.download('punkt')
tqdm.pandas()

# Training split: stance table and article bodies, both indexed by "Body ID".
train_stances, train_bodies = (
    pd.read_csv(csv_path, index_col="Body ID")
    for csv_path in ('train_stances.csv', 'train_bodies.csv')
)

# Competition test split, loaded the same way.
test_stances, test_bodies = (
    pd.read_csv(csv_path, index_col="Body ID")
    for csv_path in ('competition_test_stances.csv', 'competition_test_bodies.csv')
)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/thomasvant/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# XSum
# Summarised article bodies, one train/test pair per Pegasus checkpoint.
# Every file is indexed by "Body ID" so it can be compared row-by-row with
# the original bodies. Plain string literals are used for the paths because
# none of them contains a placeholder.

# XSum
train_bodies_xsum = pd.read_csv('train_bodies_google_pegasus-xsum.csv', index_col="Body ID")
test_bodies_xsum = pd.read_csv('test_bodies_google_pegasus-xsum.csv', index_col="Body ID")

# CNN/DailyMail
train_bodies_cnn_dailymail = pd.read_csv('train_bodies_google_pegasus-cnn_dailymail.csv', index_col="Body ID")
test_bodies_cnn_dailymail = pd.read_csv('test_bodies_google_pegasus-cnn_dailymail.csv', index_col="Body ID")

# Newsroom
train_bodies_newsroom = pd.read_csv('train_bodies_google_pegasus-newsroom.csv', index_col="Body ID")
test_bodies_newsroom = pd.read_csv('test_bodies_google_pegasus-newsroom.csv', index_col="Body ID")

# Multi-News
train_bodies_multi_news = pd.read_csv('train_bodies_google_pegasus-multi_news.csv', index_col="Body ID")
test_bodies_multi_news = pd.read_csv('test_bodies_google_pegasus-multi_news.csv', index_col="Body ID")

# Reduction calculation

In [5]:
def count(df, column="articleBody"):
    """Return the number of NLTK word tokens in each row of a text column.

    Args:
        df: DataFrame holding the text to measure.
        column: Name of the text column to tokenize. Defaults to
            "articleBody", the column every call in this notebook relies on.

    Returns:
        A Series with the same index as ``df`` whose values are
        ``len(nltk.word_tokenize(text))`` for the corresponding row.

    Note:
        Uses ``progress_apply``, so ``tqdm.pandas()`` must have been called
        beforehand; one progress bar is printed per call.
    """
    def _n_tokens(text):
        # Token count of a single document.
        return len(nltk.word_tokenize(text))

    return df[column].progress_apply(_n_tokens)

## Train data

In [6]:
# Token counts per Body ID for the original train bodies and for each of the
# four summarised variants, computed in that order.
(
    count_df_train,
    count_df_train_xsum,
    count_df_train_cnn_dailymail,
    count_df_train_newsroom,
    count_df_train_multi_news,
) = map(
    count,
    (
        train_bodies,
        train_bodies_xsum,
        train_bodies_cnn_dailymail,
        train_bodies_newsroom,
        train_bodies_multi_news,
    ),
)

100%|██████████████████████████████████████| 1683/1683 [00:02<00:00, 692.27it/s]
100%|████████████████████████████████████| 1683/1683 [00:00<00:00, 12305.69it/s]
100%|█████████████████████████████████████| 1683/1683 [00:00<00:00, 6160.91it/s]


In [7]:
# Length ratio (summary tokens / original tokens) per Body ID.
# A value above 1.0 means the summary has more tokens than the article it
# was generated from.
reduction_df_train_xsum = count_df_train_xsum.div(count_df_train)
reduction_df_train_cnn_dailymail = count_df_train_cnn_dailymail.div(count_df_train)
reduction_df_train_newsroom = count_df_train_newsroom.div(count_df_train)
reduction_df_train_multi_news = count_df_train_multi_news.div(count_df_train)

In [8]:
# Tally of XSum train summaries that are longer than their source (True) vs. not (False).
reduction_df_train_xsum.gt(1.0).value_counts()

False    1649
True       34
Name: articleBody, dtype: int64

In [9]:
# Tally of CNN/DailyMail train summaries that are longer than their source (True) vs. not (False).
reduction_df_train_cnn_dailymail.gt(1.0).value_counts()

False    1622
True       61
Name: articleBody, dtype: int64

In [None]:
# Tally of Newsroom train summaries that are longer than their source (True) vs. not (False).
reduction_df_train_newsroom.gt(1.0).value_counts()

In [None]:
# Tally of Multi-News train summaries that are longer than their source (True) vs. not (False).
reduction_df_train_multi_news.gt(1.0).value_counts()

In [None]:
# Wherever a generated summary has more tokens than the article it came from,
# fall back to the original article text. The assignment goes through .loc on
# the "articleBody" column so rows are matched by their "Body ID" label and
# only the text column is touched, instead of relying on how whole-row
# boolean-mask assignment aligns a DataFrame value.
# NOTE: this mutates the summary frames in place.
for summaries, ratio in (
    (train_bodies_xsum, reduction_df_train_xsum),
    (train_bodies_cnn_dailymail, reduction_df_train_cnn_dailymail),
    (train_bodies_newsroom, reduction_df_train_newsroom),
    (train_bodies_multi_news, reduction_df_train_multi_news),
):
    too_long = ratio > 1.0
    summaries.loc[too_long, "articleBody"] = train_bodies.loc[too_long, "articleBody"]

In [None]:
# Persist the train summary frames as CSV text.
# The target names have no ".csv" suffix; they are kept exactly as they are.
train_bodies_xsum.to_csv(path_or_buf="train_bodies_xsum_cleaned")
train_bodies_cnn_dailymail.to_csv(path_or_buf="train_bodies_cnn_dailymail_cleaned")
train_bodies_newsroom.to_csv(path_or_buf="train_bodies_newsroom_cleaned")
train_bodies_multi_news.to_csv(path_or_buf="train_bodies_multi_news_cleaned")

## Test data

In [None]:
# Token counts per Body ID for the original test bodies and for each of the
# four summarised variants, computed in that order.
(
    count_df_test,
    count_df_test_xsum,
    count_df_test_cnn_dailymail,
    count_df_test_newsroom,
    count_df_test_multi_news,
) = map(
    count,
    (
        test_bodies,
        test_bodies_xsum,
        test_bodies_cnn_dailymail,
        test_bodies_newsroom,
        test_bodies_multi_news,
    ),
)

In [None]:
# Length ratio (summary tokens / original tokens) per Body ID on the test set.
# A value above 1.0 means the summary has more tokens than its source article.
reduction_df_test_xsum = count_df_test_xsum.div(count_df_test)
reduction_df_test_cnn_dailymail = count_df_test_cnn_dailymail.div(count_df_test)
reduction_df_test_newsroom = count_df_test_newsroom.div(count_df_test)
reduction_df_test_multi_news = count_df_test_multi_news.div(count_df_test)

In [None]:
# Tally of XSum test summaries that are longer than their source (True) vs. not (False).
reduction_df_test_xsum.gt(1.0).value_counts()

In [None]:
# Tally of CNN/DailyMail test summaries that are longer than their source (True) vs. not (False).
reduction_df_test_cnn_dailymail.gt(1.0).value_counts()

In [None]:
# Tally of Newsroom test summaries that are longer than their source (True) vs. not (False).
reduction_df_test_newsroom.gt(1.0).value_counts()

In [None]:
# Tally of Multi-News test summaries that are longer than their source (True) vs. not (False).
reduction_df_test_multi_news.gt(1.0).value_counts()

In [None]:
# Wherever a generated test summary has more tokens than the article it came
# from, fall back to the original article text. The originals must come from
# test_bodies: the masks are indexed by the test "Body ID" values, so taking
# rows from train_bodies would index the wrong frame.
# The assignment goes through .loc on "articleBody" so rows are matched by
# label and only the text column is touched.
# NOTE: this mutates the summary frames in place.
for summaries, ratio in (
    (test_bodies_xsum, reduction_df_test_xsum),
    (test_bodies_cnn_dailymail, reduction_df_test_cnn_dailymail),
    (test_bodies_newsroom, reduction_df_test_newsroom),
    (test_bodies_multi_news, reduction_df_test_multi_news),
):
    too_long = ratio > 1.0
    summaries.loc[too_long, "articleBody"] = test_bodies.loc[too_long, "articleBody"]

In [None]:
# Persist the test summary frames as CSV text.
# The target names have no ".csv" suffix; they are kept exactly as they are.
test_bodies_xsum.to_csv(path_or_buf="test_bodies_xsum_cleaned")
test_bodies_cnn_dailymail.to_csv(path_or_buf="test_bodies_cnn_dailymail_cleaned")
test_bodies_newsroom.to_csv(path_or_buf="test_bodies_newsroom_cleaned")
test_bodies_multi_news.to_csv(path_or_buf="test_bodies_multi_news_cleaned")

# Boxplot

In [None]:
# Recount tokens for the test bodies and their summaries. When the notebook is
# run top to bottom, the summary frames here already contain the originals
# substituted by the replacement cell in the "Test data" section.
(
    count_df_test,
    count_df_test_xsum,
    count_df_test_cnn_dailymail,
    count_df_test_newsroom,
    count_df_test_multi_news,
) = (
    count(bodies)
    for bodies in (
        test_bodies,
        test_bodies_xsum,
        test_bodies_cnn_dailymail,
        test_bodies_newsroom,
        test_bodies_multi_news,
    )
)

In [None]:
# Absolute relative change in token count, |summary - original| / original,
# per Body ID. These overwrite the plain ratios computed in the "Test data" section.
reduction_df_test_xsum = count_df_test_xsum.sub(count_df_test).div(count_df_test).abs()
reduction_df_test_cnn_dailymail = count_df_test_cnn_dailymail.sub(count_df_test).div(count_df_test).abs()
reduction_df_test_newsroom = count_df_test_newsroom.sub(count_df_test).div(count_df_test).abs()
reduction_df_test_multi_news = count_df_test_multi_news.sub(count_df_test).div(count_df_test).abs()

In [11]:
# Render every figure created from here on at 300 dots per inch.
plt.rc("figure", dpi=300)

In [None]:
# reduction_df_test_xsum is a Series; .boxplot() only exists on DataFrame and
# the column is named "articleBody", so draw through the Series plot accessor.
fig, ax = plt.subplots()
reduction_df_test_xsum.plot.box(ax=ax)
ax.set(title="XSum (test)", ylabel="|summary - original| / original tokens");

In [None]:
# reduction_df_test_cnn_dailymail is a Series; .boxplot() only exists on
# DataFrame and the column is named "articleBody", so draw through the
# Series plot accessor.
fig, ax = plt.subplots()
reduction_df_test_cnn_dailymail.plot.box(ax=ax)
ax.set(title="CNN/DailyMail (test)", ylabel="|summary - original| / original tokens");

In [None]:
# Plot the relative length change, like the XSum and CNN/DailyMail cells in
# this section, rather than the raw token counts. The value is a Series, which
# has no .boxplot() method, so draw through the Series plot accessor.
fig, ax = plt.subplots()
reduction_df_test_newsroom.plot.box(ax=ax)
ax.set(title="Newsroom (test)", ylabel="|summary - original| / original tokens");

In [None]:
# Plot the relative length change, like the XSum and CNN/DailyMail cells in
# this section, rather than the raw token counts. The value is a Series, which
# has no .boxplot() method, so draw through the Series plot accessor.
fig, ax = plt.subplots()
reduction_df_test_multi_news.plot.box(ax=ax)
ax.set(title="Multi-News (test)", ylabel="|summary - original| / original tokens");