In [1]:
# September 2023
# Data exploration and cleaning
# Violeta Berdejo-Espinola & Ákos Hájas

In [2]:
# linting 
# !nbqa pylint 1.pre_process_main_text.ipynb

In [3]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [4]:
import re
import os
import pandas as pd
from random import sample
from Levenshtein import ratio # string similarity metric that measures the difference between two sequences
os.getcwd()

'/Users/uqvberde/Projects/classifier_spanish/scripts'

In [5]:
pd.set_option("display.max_rows", 150)

df_repo = pd.read_csv(
    "../datasets/from_repo/majom_september_pos_added.csv",
    header=None,
    index_col=0,
    names=[
        "id",
        "title_spa",
        "journal_name",
        "pub_year",
        "country",
        "abstract_eng",
        "main_text_eng",
        "ci_eng",
        "abstract_spa",
        "main_text_spa",
        "ci_spa",
    ],
)

df_pos = pd.read_csv(
    "../datasets/from_translate/translatE_spanish_positives_71.csv", 
    encoding="utf-8",
    names=[
        "title_spa",
        'Publication_type',
        "journal_name",
        "abstract_spa",
        "label"
    ],
    skiprows=1
)


df_pos = df_pos.drop(["Publication_type"], axis=1)

In [6]:
len(df_repo)

5771

# Clean dataframes

In [7]:
bad_title = df_repo["title_spa"].str.contains("In Memoriam|Editorial|Fe de erratas|FE DE ERRATA|ERRATA|aniversario|ARTÍCULO RETRACTADO")
bad_body = df_repo["main_text_spa"].str.contains("Texto completo disponible sólo en PDF|Full text available only in PDF format Texto completo disponible sólo en PDF")
             
df_repo = df_repo.dropna(subset=["title_spa", "abstract_spa", "main_text_spa", "journal_name"])
df_repo = df_repo[~bad_title | ~bad_body]

df_repo.title_spa.duplicated().sum()
# df_repo.index.has_duplicates

  df_repo = df_repo[~bad_title | ~bad_body]


0

In [8]:
len(df_repo)

5019

In [9]:
# Normalize titles and journal name from df_repo and df_pos

def normalize_title(title):
    title = re.sub("\s+", " ", title).capitalize().strip().replace(".", "")
    title = re.sub("\xa0", " ", title)
    return re.sub("\n{1,}", " ", title)

df_pos["title_spa"] = df_pos["title_spa"].apply(normalize_title)
df_repo["title_spa"] = df_repo["title_spa"].apply(normalize_title)
df_repo["journal_name"] = df_repo["journal_name"].apply(lambda name: name.capitalize().strip())

repo_titles = df_repo["title_spa"] #5019
pos_titles = df_pos["title_spa"] #69

In [10]:
len(repo_titles)
len(df_repo)

5019

In [11]:
# Find similarity between titles from df_repo and df_pos

# each item in ratios is one title from repo mapped to similarity of all titles from the positives: 1 title -> 69 titles x 5000 titles
ratios = repo_titles.map(lambda title_repo: pos_titles.map(lambda title_pos: ratio(title_pos, title_repo)))

# for the 5019 sets of 69 ratios, map the 69 numbers to a boolean by checking if it's high enough
similarity = ratios.map(lambda ratio: ratio > 0.82)

# reduce the 69 boolean to one by checking if _any_ of them is true -> a match between repo title and pos title
matches = similarity.map(lambda ratios: ratios.any()) #if any is TRUE

print(len(matches)) # 5019
print(len(df_repo)) # 5019

# df_match = df_repo[
#     df_repo["title_spa"].map(lambda title_repo: df_pos["title_spa"].map(lambda title: ratio(title, title_repo)).map(lambda ratio: ratio > 0.92).any())
# ]["title_spa"]

# ratios["S2007-11322019000600238"]
# similarity["S2007-11322019000600238"]

5019
5019


In [12]:
type(ratios)
ratios[1:2:4]

id
S2007-11322019000600065    0     0.423913
1     0.347222
2     0.326923
3...
Name: title_spa, dtype: object

In [13]:
# Titles in pos that are not in matches

pos_matches = similarity[0]
for row in similarity[1:]:
    pos_matches += row

print(len(df_pos[~pos_matches]))
df_pos[~pos_matches]

9


Unnamed: 0,title_spa,journal_name,abstract_spa,label
12,La liebre amenazada lepus flavigularis prefier...,Therya,Lepus flavigularis Wagner 1844 (Liebre de Tehu...,Positive
17,Murciélagos (mammalia: chiroptera) en áreas na...,Revista de Biología Tropical,"En general, los ambientes naturales se han tra...",Positive
21,Comprobación de un nuevo método para reducir l...,Ardeola,En las poblaciones naturales de aves los pollu...,Positive
22,Biología de la conservación del cisne coscorob...,Ardeola,Al final del resumen se explica la implementac...,Positive
45,Almacenamiento en frío del esperma de la truch...,Revista Mexicana de Biodiversidad,Con el objetivo de determinar un protocolo apr...,Positive
48,Éxito o fracaso: el papel de la restauración e...,Revista Mexicana de Biodiversidad,Estudios recientes han mostrado que la restaur...,Positive
61,Incubación artificial de huevo del pavón cornu...,Acta Zoológica Mexicana,La incubación artificial es una técnica favora...,Positive
65,Primera prueba de control de chaqueta amarilla...,BioScriba,La avispa exótica Vespula germanica (F.) es un...,Positive
66,Germinación de tres especies de fabaceae de in...,Quebracho,Grandes áreas del Monte Austral han sido sever...,Positive


In [14]:
# Combine dfs, drop duplicates, label pos and neg

df_match = df_repo[matches][["title_spa","abstract_spa","main_text_spa","journal_name"]]
df_match['label'] = "positive"
df_repo = df_repo[["title_spa","abstract_spa","main_text_spa","journal_name"]]
df_repo['label'] = ""
df_combined = pd.concat([df_match, df_repo])
df_combined = df_combined.drop_duplicates(subset=["title_spa"], keep="first")
df_combined['label'] = df_combined.label.replace('','negative',regex = True)

print(len(df_combined[df_combined['label'] == "positive"]))
print(len(df_combined[df_combined['label'] == "negative"]))
print(len(df_combined))
df_combined.head(100)

62
4957
5019


Unnamed: 0_level_0,title_spa,abstract_spa,main_text_spa,journal_name,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dialnet-7087777,Manejo forestal comunitario en el sur de méxic...,"Evaluamos los cambios en la diversidad, estruc...",Las concesiones de aprovechamiento foresta...,Revista mexicana de biodiversidad,positive
S1870-34532018000501245,La conservación de mamíferos medianos en dos r...,Resumen La fauna es un elemento particularment...,"La fauna, y en particular los mamíferos median...",Revista mexicana de biodiversidad,positive
S1870-34532018000100268,Efecto del pastoreo sobre una comunidad de roe...,Resumen: La ganadería a escala global ocupa la...,"A escala global, el pastoreo de ganado domésti...",Revista mexicana de biodiversidad,positive
S1870-34532018000200553,Potencial reproductivo de stenocerus queretaro...,Resumen: Se estudió el potencial reproductivo ...,El potencial reproductivo indica la capacidad ...,Revista mexicana de biodiversidad,positive
dialnet-6524365,Metodo simple y economico para minimizar la mo...,La minería extractiva es vital para el desarro...,Los productos derivados de la minería ex...,Biodiversity and natural history,positive
S0327-93832017000100005,Manejo del conflicto entre carnívoros y ganade...,El conflicto entre carnívoros nativos y ganado...,introducción la conservaci ón de carnívoros qu...,Mastozoología neotropical,positive
barbastella-4,Conservación de colonias reproductoras de murc...,Los murciélagos requieren refugios específicos...,Los murciélagos utilizan los refugios que encu...,Barbastella,positive
dialnet-7144275,La importancia del agrosistema tradicional fax...,La importancia del agrosistema tradicional Fax...,El sistema Faxinal es una forma de organizació...,Revista ecosistemas,positive
S0327-93832016000200021,Mortalidad de mamíferos y medidas de mitigació...,Recopilamos información sobre mortalidad de ma...,"las infraestructuras lineales de transporte, c...",Mastozoología neotropical,positive
S0327-93832016000200009,Abundancia poblacional y manejo del jabalí (su...,El jabalí (Sus scrofa) es una de las especies ...,"introduccion el jabalí . en europa, el jabalí ...",Mastozoología neotropical,positive


In [15]:
df_combined.to_csv('C:\\Users\\uqvberde\\Dropbox\\TRANSLATE\\ML\\classifier_spanish\\datasets\\py_outputs\\pos_neg.csv')

In [16]:
# Sample negatives and create final dfs

df_neg = df_combined.loc[df_combined.label == "negative"]

def sample_negs(neg):
    sampled = neg.sample(n = len(df_match))
    return sampled

def final_df(pos, neg):
    df = pd.concat([pos, sample_negs(neg)])
    return df

def generate_sample_sets(count):
    return list(map(lambda _: final_df(df_match, df_neg), range(count)))

sample_sets = generate_sample_sets(10)

In [17]:
for i, sample in enumerate(sample_sets):
    sample.to_csv('C:\\Users\\uqvberde\\Dropbox\\TRANSLATE\\ML\\classifier_spanish\\datasets\\py_outputs\\data\\71\\pos_neg_{}.csv'.format(i))

In [18]:
# for i, sample in enumerate(sample_sets):
#     sample.to_csv(f'/Users/uqvberde/Dropbox/TRANSLATE/Objective 2 - Machine Learning/classifier_spanish/datasets/py_outputs/pos_neg/pos_neg_{i}.csv')

In [19]:
type(sample_sets)

list