In [None]:
# September 2023
# Data exploration and cleaning
# Violeta Berdejo-Espinola & Ákos Hájas

In [None]:
%pip install levenshtein pandas

In [None]:
import re
import os
import pandas as pd
from random import sample
from Levenshtein import ratio # string similarity metric that measures the difference between two sequences
os.getcwd()

In [None]:
# Allow up to 150 rows to be shown when a frame or series is displayed.
pd.set_option("display.max_rows", 150)

# Column labels applied to the repository export (read with header=None);
# the first of them, "id", becomes the index.
repo_columns = [
    "id",
    "title_spa",
    "journal_name",
    "pub_year",
    "country",
    "abstract_eng",
    "main_text_eng",
    "ci_eng",
    "abstract_spa",
    "main_text_spa",
    "ci_spa",
]
df_repo = pd.read_csv(
    "../datasets/from_repo/majom_september_pos_added.csv.gz",
    header=None,
    names=repo_columns,
    index_col=0,
)

# Column labels applied to the positives file; its first row is skipped
# (presumably the file's own header row - confirm against the CSV).
pos_columns = ["title_spa", "Publication_type", "journal_name", "abstract_spa", "label"]
df_pos = pd.read_csv(
    "../datasets/from_translate/translatE_spanish_positives_71.csv",
    encoding="utf-8",
    skiprows=1,
    names=pos_columns,
).drop(columns=["Publication_type"])  # publication type is not used below

# Row counts of both corpora right after loading.
print(len(df_repo), len(df_pos))

# clean data

In [None]:
# remove unwanted documents in repo corpus

# Drop rows with missing text fields FIRST, so the boolean masks below are
# built on exactly the rows they are later used to filter (no NaN entries,
# no index misalignment).
df_repo = df_repo.dropna(subset=["title_spa", "abstract_spa", "main_text_spa", "journal_name"])

# Titles that mark non-article items (obituaries, editorials, errata, ...).
# na=False keeps the mask strictly boolean so it can be inverted safely.
bad_title = df_repo["title_spa"].str.contains(
    "In Memoriam|Editorial|Fe de erratas|FE DE ERRATA|ERRATA|aniversario|ARTÍCULO RETRACTADO",
    na=False,
)
# Records whose body is only a "full text available in PDF" placeholder.
bad_body = df_repo["main_text_spa"].str.contains(
    "Texto completo disponible sólo en PDF|Full text available only in PDF format Texto completo disponible sólo en PDF",
    na=False,
)

# Keep a row only when NEITHER filter matches. The previous
# `~bad_title | ~bad_body` kept every row unless both filters matched,
# so editorials/errata with a real body were never removed.
df_repo = df_repo[~(bad_title | bad_body)]

print(f"duplicated titles: {df_repo.title_spa.duplicated().sum()}")
print(f"duplicated indeces: {df_repo.index.has_duplicates}")
print(len(df_repo), len(df_pos))

In [None]:
# normalize titles and journal name in repo and pos

def normalize_title(title):
    """Return a canonical form of a title for fuzzy comparison.

    Steps, in order: remove every period, collapse each run of whitespace
    into a single space, trim both ends, then capitalise (first character
    upper-case, the rest lower-case).

    Parameters
    ----------
    title : str
        Raw title text.

    Returns
    -------
    str
        The normalised title.

    Raises
    ------
    TypeError
        If ``title`` is not a string (e.g. a missing value).
    """
    # Periods go first so that the whitespace pass below also cleans up any
    # gap or trailing space their removal leaves behind.
    title = title.replace(".", "")
    # For str patterns `\s` already covers newlines and non-breaking spaces
    # (\xa0), so no separate substitutions are needed for those.
    title = re.sub(r"\s+", " ", title)
    # Trim BEFORE capitalising: capitalising a string that starts with a
    # space upper-cases nothing and leaves the whole title lower-case.
    return title.strip().capitalize()

# Apply the title normalisation to both corpora. `.assign` builds a new frame
# instead of writing a column into a frame obtained by filtering, which avoids
# pandas' SettingWithCopyWarning.
df_pos = df_pos.assign(title_spa=df_pos["title_spa"].apply(normalize_title))
df_repo = df_repo.assign(
    title_spa=df_repo["title_spa"].apply(normalize_title),
    # Trim before capitalising so a name with leading whitespace still gets
    # its first letter upper-cased (the previous order left it lower-case).
    journal_name=df_repo["journal_name"].str.strip().str.capitalize(),
)

# Title columns compared against each other in the similarity step.
repo_titles = df_repo["title_spa"]
pos_titles = df_pos["title_spa"]

print(len(df_repo), len(df_pos))

# similarity between titles

In [None]:
# find similarity between titles from df_repo and df_pos

# Two titles count as the same document when their similarity ratio is above this value.
SIMILARITY_THRESHOLD = 0.82


def _scores_against_positives(title_repo):
    """Similarity ratio of one repo title against every positive title."""
    return pos_titles.map(lambda title_pos: ratio(title_pos, title_repo))


# One entry per repo title; each entry is a Series of ratios, one per positive title.
ratios = repo_titles.map(_scores_against_positives)

# Same layout as `ratios`, with each ratio turned into a pass/fail flag.
similarity = ratios.map(lambda scores: scores > SIMILARITY_THRESHOLD)

# Collapse each flag Series to a single boolean: True when the repo title
# matches at least one positive title.
matches = similarity.map(lambda flags: flags.any())

print(len(similarity), len(similarity.iloc[0]))
print(len(matches))
print(len(df_repo), len(df_pos))

In [None]:
# checking if all positive titles are in matches

# Each element of `similarity` is a boolean Series with one flag per positive
# title. Put them side by side and OR across the columns: a positive title is
# "found" when at least one repo title matched it.
# This builds a new Series. The previous `pos_matches = similarity.iloc[0]`
# followed by `+=` accumulated into the Series object stored inside
# `similarity`, silently overwriting its first element.
pos_matches = pd.concat(similarity.tolist(), axis=1).any(axis=1)

# Positive titles that no repo title matched.
print(len(df_pos[~pos_matches]))
df_pos[~pos_matches]

# please read
# after inspection:
# indices 12, 17, 45, 48, 61, 66 are documents that are also available in English
# indices 21, 66 are not available online
# index 23 is a thesis
# thus these documents can't be included

# merge data

In [None]:
# combine positives and repo dfs, drop duplicates, label positives and negatives

text_columns = ["title_spa", "abstract_spa", "main_text_spa", "journal_name"]

# Repo rows whose title matched a positive title. `.copy()` makes this an
# independent frame, so adding the label column does not write into a slice
# of df_repo (SettingWithCopyWarning).
df_match = df_repo.loc[matches, text_columns].copy()
df_match["label"] = "1"

# Whole repo corpus with an empty placeholder label (turned into "0" below).
df_repo = df_repo[text_columns].copy()
df_repo["label"] = ""

# Matched rows are placed first so that keep="first" retains the copy
# labelled "1" and drops its unlabelled twin coming from df_repo.
df_combined = pd.concat([df_match, df_repo])
df_combined = df_combined.drop_duplicates(subset=["title_spa"], keep="first")

# Exact match on the empty placeholder; regex mode is not needed for this.
df_combined["label"] = df_combined["label"].replace("", "0", regex=False)
# Strip the literal word "Resumen" wherever it appears in the abstract.
df_combined["abstract_spa"] = df_combined["abstract_spa"].str.replace(r"Resumen", "", regex=True)

# Labels are the strings "1" (positive) and "0" (negative); the previous
# comparison against "positive"/"negative" always printed 0.
print(len(df_combined[df_combined["label"] == "1"]))
print(len(df_combined[df_combined["label"] == "0"]))
print(len(df_combined))

df_combined.to_csv('../data/pos_neg.csv', encoding='utf-8')