In [None]:
# September 2023
# Data exploration and cleaning
# Violeta Berdejo-Espinola & Ákos Hájas

In [None]:
# linting 
# !nbqa pylint 1.pre_process_main_text.ipynb

In [None]:
from IPython.display import display, HTML
# Widen the notebook container so wide DataFrames are easier to read.
container_css = "<style>.container { width:90% !important; }</style>"
display(HTML(container_css))

In [None]:
import re
import os
import pandas as pd
from random import sample
from Levenshtein import ratio # string similarity metric that measures the difference between two sequences
# Display the current working directory; the dataset paths used when loading the CSVs are relative to it.
os.getcwd()

In [None]:
# Allow up to 150 rows to be rendered when a frame or series is displayed.
pd.set_option("display.max_rows", 150)

# Column names for the repository export. The file is read with header=None,
# so the names are applied positionally and "id" (column 0) becomes the index.
repo_columns = [
    "id",
    "title_spa",
    "journal_name",
    "pub_year",
    "country",
    "abstract_eng",
    "main_text_eng",
    "ci_eng",
    "abstract_spa",
    "main_text_spa",
    "ci_spa",
]
df_repo = pd.read_csv(
    "../datasets/from_repo/majom_september_pos_added.csv",
    header=None,
    names=repo_columns,
    index_col=0,
)

# Column names for the known-positive papers. The first row of the file is
# skipped (presumably its own header row) and replaced by these names.
pos_columns = [
    "title_spa",
    'Publication_type',
    "journal_name",
    "abstract_spa",
    "label",
]
df_pos = pd.read_csv(
    "../datasets/from_translate/translatE_spanish_positives_71.csv",
    encoding="utf-8",
    names=pos_columns,
    skiprows=1,
).drop(columns=["Publication_type"])  # publication type is not used downstream

In [None]:
# Number of rows currently in df_repo.
df_repo.shape[0]

# Clean dataframes

In [None]:
# Drop records that lack any field needed downstream *before* building the
# masks, so the masks share df_repo's index and never contain NaN.
df_repo = df_repo.dropna(subset=["title_spa", "abstract_spa", "main_text_spa", "journal_name"])

# Titles that mark non-research items (obituaries, editorials, errata,
# anniversary notes, retracted articles).
bad_title = df_repo["title_spa"].str.contains(
    "In Memoriam|Editorial|Fe de erratas|FE DE ERRATA|ERRATA|aniversario|ARTÍCULO RETRACTADO",
    na=False,
)
# Bodies that are only a "full text available in PDF" placeholder.
bad_body = df_repo["main_text_spa"].str.contains(
    "Texto completo disponible sólo en PDF|Full text available only in PDF format Texto completo disponible sólo en PDF",
    na=False,
)

# Keep a row only when NEITHER filter matches. The previous condition
# (`~bad_title | ~bad_body`) discarded a row only when both matched, so almost
# every editorial and PDF-only stub was kept.
# .copy() makes the result an independent frame so later column assignments
# do not raise SettingWithCopyWarning.
df_repo = df_repo[~(bad_title | bad_body)].copy()

# Number of repeated titles still present (shown as the cell output).
df_repo.title_spa.duplicated().sum()
# df_repo.index.has_duplicates

In [None]:
# Number of rows currently in df_repo.
df_repo.shape[0]

In [None]:
# Normalize titles and journal name from df_repo and df_pos

def normalize_title(title):
    """Return ``title`` in a canonical form suitable for fuzzy title matching.

    Periods are removed, every run of whitespace (spaces, newlines,
    non-breaking spaces, ...) is collapsed to a single space, surrounding
    whitespace is stripped, and the result is capitalised (first character
    upper-case, everything else lower-case).

    Parameters
    ----------
    title : str
        Raw title text.

    Returns
    -------
    str
        The normalised title.
    """
    # Remove periods first so that any gap they leave behind is collapsed below.
    without_periods = title.replace(".", "")
    # For str patterns `\s` already matches "\n" and "\xa0", so a single pass
    # is enough; the raw string avoids the invalid-escape warning of "\s+".
    collapsed = re.sub(r"\s+", " ", without_periods)
    # Strip BEFORE capitalising: str.capitalize() only upper-cases the very
    # first character, so a leading space used to leave the whole title
    # lower-case and inconsistent with titles that had no leading space.
    return collapsed.strip().capitalize()

# Run the same title normalisation over both frames (positives first, then
# the repository) so the titles can be compared like-for-like.
for frame in (df_pos, df_repo):
    frame["title_spa"] = frame["title_spa"].apply(normalize_title)

# Journal names in the repository frame: capitalise, then trim surrounding whitespace.
def _tidy_journal_name(name):
    return name.capitalize().strip()

df_repo["journal_name"] = df_repo["journal_name"].apply(_tidy_journal_name)

# Title columns used by the similarity search.
repo_titles = df_repo["title_spa"]
pos_titles = df_pos["title_spa"]

In [None]:
# A notebook cell only displays its LAST expression, so the two lengths are
# returned together as a tuple instead of silently discarding the first one.
# They are expected to be equal because repo_titles is a column of df_repo.
len(repo_titles), len(df_repo)

In [None]:
# Find similarity between titles from df_repo and df_pos

def _ratios_against_positives(title_repo):
    """Levenshtein ratio of one repo title against every positive title (a Series aligned with pos_titles)."""
    return pos_titles.map(lambda title_pos: ratio(title_pos, title_repo))

# One entry per repo title; each entry is itself a Series holding that title's
# similarity to every positive title.
ratios = repo_titles.map(_ratios_against_positives)

# Turn every inner Series of ratios into booleans: True where the ratio exceeds 0.82.
similarity = ratios.map(lambda scores: scores > 0.82)

# Collapse each inner boolean Series to one flag: a repo title counts as a
# match when it is similar enough to at least one positive title.
matches = similarity.map(lambda flags: flags.any())

# Sanity check: there is one flag per repo row, so both numbers should agree.
print(len(matches))
print(len(df_repo))

# df_match = df_repo[
#     df_repo["title_spa"].map(lambda title_repo: df_pos["title_spa"].map(lambda title: ratio(title, title_repo)).map(lambda ratio: ratio > 0.92).any())
# ]["title_spa"]

# ratios["S2007-11322019000600238"]
# similarity["S2007-11322019000600238"]

In [None]:
# Only the last expression of a cell is rendered, so the type is shown
# explicitly instead of being silently discarded.
display(type(ratios))
# Second entry of ratios, selected by position. `.iloc` makes the positional
# intent explicit; the former `[1:2:4]` selected the same single element
# (the step of 4 had no effect on a one-element window).
ratios.iloc[1:2]

In [None]:
# Titles in pos that are not in matches

# OR together the per-repo-title boolean rows: the result has one flag per
# positive title, True when at least one repo title was similar enough.
# Starting from a fresh all-False Series (instead of `similarity[0]` followed
# by `+=`) avoids two problems:
#   * `similarity[0]` is a positional lookup on a label-indexed Series, which
#     pandas has deprecated;
#   * `+=` updated the first row of `similarity` in place, silently corrupting it.
pos_matches = pd.Series(False, index=df_pos.index)
for row in similarity:
    pos_matches = pos_matches | row

# Positives for which no repository title matched.
unmatched_pos = df_pos[~pos_matches]
print(len(unmatched_pos))
unmatched_pos

In [None]:
# Combine dfs, drop duplicates, label pos and neg

keep_cols = ["title_spa", "abstract_spa", "main_text_spa", "journal_name"]

# .loc + .assign build new frames, so no column is ever set on a slice
# (which would raise SettingWithCopyWarning).
df_match = df_repo.loc[matches, keep_cols].assign(label="positive")
df_repo = df_repo[keep_cols].assign(label="")

# Positives come first, so keep="first" retains the labelled copy of every
# matched title and drops its unlabelled twin from df_repo.
df_combined = pd.concat([df_match, df_repo]).drop_duplicates(subset=["title_spa"], keep="first")

# Everything still unlabelled is a negative. This must be an exact (non-regex)
# replacement: with regex=True the empty pattern matches at every position of
# every string, which also rewrites the "positive" labels.
df_combined = df_combined.assign(label=df_combined["label"].replace("", "negative"))

print(len(df_combined[df_combined['label'] == "positive"]))
print(len(df_combined[df_combined['label'] == "negative"]))
print(len(df_combined))
df_combined.head(100)

In [None]:
# Write the full labelled frame (df_combined) to CSV.
# NOTE(review): this is an absolute, machine-specific Windows path, while the
# inputs are read from the relative ../datasets/ folder — consider a relative
# output path as well (confirm the intended location before changing it).
df_combined.to_csv('C:\\Users\\uqvberde\\Dropbox\\TRANSLATE\\ML\\classifier_spanish\\datasets\\py_outputs\\pos_neg.csv')

In [None]:
# Sample negatives and create final dfs

# Pool of negatives: every row of df_combined labelled "negative".
is_negative = df_combined["label"] == "negative"
df_neg = df_combined.loc[is_negative]

# Base seed for the negative sampling; sample set i uses DEFAULT_SAMPLE_SEED + i,
# so the sets differ from each other but are identical on every run.
DEFAULT_SAMPLE_SEED = 2023


def sample_negs(neg, n=None, random_state=None):
    """Draw a random sample of rows from ``neg`` without replacement.

    Parameters
    ----------
    neg : pandas.DataFrame
        Pool of negative rows.
    n : int, optional
        Number of rows to draw. Defaults to ``len(df_match)`` (the notebook's
        frame of positives), which is what this function always used.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample``; ``None`` gives a different
        sample on every call.

    Returns
    -------
    pandas.DataFrame
        The sampled rows.

    Raises
    ------
    ValueError
        Raised by pandas when ``n`` is larger than ``len(neg)``.
    """
    if n is None:
        n = len(df_match)
    return neg.sample(n=n, random_state=random_state)


def final_df(pos, neg, random_state=None):
    """Return ``pos`` followed by an equally sized random sample of ``neg``.

    The sample size is taken from ``pos`` itself rather than from the global
    ``df_match``, so the result is balanced for whatever positives are passed.

    Parameters
    ----------
    pos : pandas.DataFrame
        Positive rows; all of them are kept.
    neg : pandas.DataFrame
        Pool of negative rows to sample from.
    random_state : int, optional
        Seed for the negative sample; ``None`` means unseeded.

    Returns
    -------
    pandas.DataFrame
        Concatenation of ``pos`` and the sampled negatives.
    """
    return pd.concat([pos, sample_negs(neg, n=len(pos), random_state=random_state)])


def generate_sample_sets(count, random_state=DEFAULT_SAMPLE_SEED):
    """Build ``count`` balanced sample sets from ``df_match`` and ``df_neg``.

    Parameters
    ----------
    count : int
        Number of sample sets to build.
    random_state : int, optional
        Base seed; set ``i`` is drawn with ``random_state + i``. Pass ``None``
        to restore unseeded (non-reproducible) sampling.

    Returns
    -------
    list of pandas.DataFrame
        One frame per sample set.
    """
    seeds = [None if random_state is None else random_state + i for i in range(count)]
    return [final_df(df_match, df_neg, random_state=seed) for seed in seeds]

# Build ten sample sets.
sample_sets = generate_sample_sets(count=10)

In [None]:
# Write every sample set to its own CSV, numbered by its position in sample_sets.
# The loop variable is deliberately not called `sample`: that name is imported
# from `random` in the imports cell and would be overwritten by a DataFrame.
for i, sample_set in enumerate(sample_sets):
    sample_set.to_csv('C:\\Users\\uqvberde\\Dropbox\\TRANSLATE\\ML\\classifier_spanish\\datasets\\py_outputs\\data\\71\\pos_neg_{}.csv'.format(i))

In [None]:
# for i, sample in enumerate(sample_sets):
#     sample.to_csv(f'/Users/uqvberde/Dropbox/TRANSLATE/Objective 2 - Machine Learning/classifier_spanish/datasets/py_outputs/pos_neg/pos_neg_{i}.csv')

In [None]:
# Show the container type of sample_sets.
type(sample_sets)