In [1]:
import os
import pandas as pd
import string
import re
import langid

import numpy as np

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from IPython.display import clear_output
from langid.langid import LanguageIdentifier, model

# Input CSV; the load cell reads it with sep='|' and uses its first column as the index.
BASE_FILE = 'data/data.csv'
# NOTE(review): OUT_FILE is not referenced by any visible cell -- the export
# cell writes to a hard-coded 'data/all_data_clean.csv'. Confirm which path
# is the intended output.
OUT_FILE = 'data/data_pp.csv'

## Read Data

In [2]:
# Load the '|'-separated table; the first CSV column becomes the index.
data = pd.read_csv(BASE_FILE, sep='|', index_col=0)

# Project-specific stop-word list, handed to NLTK's stopwords reader through
# its absolute path.
sw_path = os.path.abspath("./stopwords.txt")
sw = stopwords.words(sw_path)

# Result columns start empty (None) and are filled by the preprocessing loop.
# 'pp_tile_rm_sw_lem' (sic) is spelled this way everywhere in the notebook.
for new_column in ('pp_title', 'pp_title_rm_sw', 'pp_tile_rm_sw_lem'):
    data[new_column] = None

In [3]:
# Preview the first rows to check the parse and the freshly added (empty) columns.
data.head()

Unnamed: 0,author_id,author_name,author_ref,doctorate,paper_issn,paper_ano,paper,paper_home_page,pp_title,pp_title_rm_sw,pp_tile_rm_sw_lem
0,9179745776599946,Vanderlei Moraes Rodrigues,"RODRIGUES, V. M.",Computação,1034308,2000,Using the ACL2 Theorem Prover to Reason about ...,,,,
1,9103101956717062,Rafael Santos Coelho,"COELHO, R. S.;COELHO, RAFAEL S.",Ciência da Computação,15710653,2015,The k-hop connected dominating set problem: ha...,[doi:10.1016/j.endm.2015.07.011],,,
2,9120445622048393,Luiz Henrique Gomes,"GOMES, L. H.;GOMES, LUIZ H.",Ciências da Computação,2195259,2009,QUANTIFYING SOCIAL AND OPPORTUNISTIC BEHAVIOR ...,[doi:10.1142/s0219525909002088],,,
3,9120445622048393,Luiz Henrique Gomes,"GOMES, L. H.;GOMES, LUIZ H.",Ciências da Computação,1665316,2007,Workload models of spam and legitimate e-mails,,,,
4,9175591364526313,Lisane Brisolara de Brisolara,"BRISOLARA, Lisane;Brisolara, Lisane;BRISOLARA,...",Computação,18071953,2004,Supporting Collaboration in Distributed Design...,,,,


## Remove punctuation, stopwords and invalid text, and apply lemmatization

In [4]:
def to_lower(text):
    """Lower-case `text` and normalise its whitespace.

    The text is split on any whitespace, each piece is lower-cased, and the
    pieces are re-joined with single spaces, so leading/trailing whitespace
    and repeated separators disappear.
    """
    return " ".join(map(str.lower, text.split()))

In [5]:
def remove_punctuation(text):
    """Return `text` with every character of ``string.punctuation`` deleted.

    Characters are removed, not replaced, so hyphenated words are fused
    ('k-hop' becomes 'khop').
    """
    # A translation table mapping a code point to None deletes that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

In [6]:
def remove_stopwords(text, sw):
    """Drop every whitespace-separated token of `text` that is contained in `sw`.

    `sw` is any container supporting ``in``; matching is exact and
    case-sensitive. The remaining tokens are joined with single spaces.
    """
    kept = []
    for token in text.split():
        if token in sw:
            continue
        kept.append(token)
    return " ".join(kept)

In [7]:
def lemmatizer_text(text):
    """Tokenize `text` with NLTK and return its lemmatized tokens joined by spaces.

    Each token produced by ``word_tokenize`` is passed through
    ``WordNetLemmatizer.lemmatize`` with its default arguments.
    """
    wordnet = WordNetLemmatizer()
    return ' '.join(wordnet.lemmatize(token) for token in word_tokenize(text))

In [8]:
def remove_invalid(text, noise_path='noise.txt'):
    """Strip punctuation and drop tokens that are unlikely to be real words.

    After punctuation removal, newlines become spaces and runs of spaces are
    collapsed. A token is then discarded when any of these holds:

    * it is shorter than 2 or longer than 16 characters;
    * it occurs inside the contents of the noise file;
    * it is purely numeric (``str.isnumeric``).

    Parameters
    ----------
    text : str
        Text to clean.
    noise_path : str, optional
        File whose contents drive the noise test. Defaults to 'noise.txt',
        the path that was previously hard-coded.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    # Context manager so the handle is closed; it used to leak on every call.
    with open(noise_path, 'r') as noise_file:
        noise = noise_file.read()

    text = remove_punctuation(text)
    text = re.sub('\n', ' ', text)
    text = re.sub(' +', ' ', text)

    # Build a new list instead of popping from the list being iterated:
    # popping shifts the remaining items left, so the token right after every
    # removed one used to escape all three checks.
    # NOTE(review): `word in noise` is a substring test against the whole
    # file contents (kept as-is), so a token that merely appears inside a
    # longer noise entry is dropped too -- confirm against noise.txt's format.
    kept = [
        word for word in text.split(' ')
        if 2 <= len(word) <= 16
        and word not in noise
        and not word.isnumeric()
    ]
    return ' '.join(kept)

In [9]:
# Clean every paper title and, for titles langid classifies as English, store
# three progressively reduced versions. Other rows keep None in the new
# columns, which is what the later `notnull()` filters rely on.
n_titles = data.shape[0]

# Iterate index labels together with their 1-based position: the label
# addresses cells through `data.at`, the position drives the progress figure.
# (Progress used to be derived from the label, which is only right for a
# 0..n-1 index and never reached 100%; `iterrows()` also built a row Series
# per iteration that was never used.)
for position, index in enumerate(data.index, start=1):
    title = data.at[index, 'paper']
    # Missing titles are read back as NaN (a float); skip them instead of
    # failing inside to_lower().
    if isinstance(title, str):
        text = remove_invalid(to_lower(title))
        lang = langid.classify(text)[0]  # classify() returns (language, score)
        if lang == 'en':
            data.at[index, 'pp_title'] = text
            text = remove_stopwords(text, sw)
            data.at[index, 'pp_title_rm_sw'] = text
            text = lemmatizer_text(text)
            # 'pp_tile_rm_sw_lem' (sic) matches the column created at load time.
            data.at[index, 'pp_tile_rm_sw_lem'] = text
    clear_output(wait=True)
    print('%.2f%%' % (position * 100 / n_titles))

100.00%


In [10]:
# Display the frame after preprocessing to inspect the three filled-in columns.
data

Unnamed: 0,author_id,author_name,author_ref,doctorate,paper_issn,paper_ano,paper,paper_home_page,pp_title,pp_title_rm_sw,pp_tile_rm_sw_lem
0,9179745776599946,Vanderlei Moraes Rodrigues,"RODRIGUES, V. M.",Computação,01034308,2000,Using the ACL2 Theorem Prover to Reason about ...,,using the acl2 theorem prover to reason about ...,acl2 theorem prover reason vhdl components,acl2 theorem prover reason vhdl component
1,9103101956717062,Rafael Santos Coelho,"COELHO, R. S.;COELHO, RAFAEL S.",Ciência da Computação,15710653,2015,The k-hop connected dominating set problem: ha...,[doi:10.1016/j.endm.2015.07.011],the khop connected dominating set problem hard...,khop connected dominating set problem hardness...,khop connected dominating set problem hardness...
2,9120445622048393,Luiz Henrique Gomes,"GOMES, L. H.;GOMES, LUIZ H.",Ciências da Computação,02195259,2009,QUANTIFYING SOCIAL AND OPPORTUNISTIC BEHAVIOR ...,[doi:10.1142/s0219525909002088],quantifying social and opportunistic behavior ...,quantifying social opportunistic behavior emai...,quantifying social opportunistic behavior emai...
3,9120445622048393,Luiz Henrique Gomes,"GOMES, L. H.;GOMES, LUIZ H.",Ciências da Computação,01665316,2007,Workload models of spam and legitimate e-mails,,workload models of spam and legitimate emails,workload models spam legitimate emails,workload model spam legitimate email
4,9175591364526313,Lisane Brisolara de Brisolara,"BRISOLARA, Lisane;Brisolara, Lisane;BRISOLARA,...",Computação,18071953,2004,Supporting Collaboration in Distributed Design...,,supporting collaboration distributed design en...,supporting collaboration distributed design en...,supporting collaboration distributed design en...
...,...,...,...,...,...,...,...,...,...,...,...
26235,2283022405554044,Marco Aurelio Cavalcanti Pacheco,"Pacheco, M. A. C.;PACHECO, Marco Aurélio C.;Pa...",Computer Science,09204105,2017,Uncertainty quantification in reservoir simula...,[doi:10.1016/j.petrol.2017.03.046],uncertainty quantification reservoir simulatio...,uncertainty quantification reservoir simulatio...,uncertainty quantification reservoir simulatio...
26236,2283022405554044,Marco Aurelio Cavalcanti Pacheco,"Pacheco, M. A. C.;PACHECO, Marco Aurélio C.;Pa...",Computer Science,09521976,2018,Solving stochastic differential equations thro...,,solving stochastic differential equations thro...,solving stochastic differential equations gene...,solving stochastic differential equation genet...
26237,2283022405554044,Marco Aurelio Cavalcanti Pacheco,"Pacheco, M. A. C.;PACHECO, Marco Aurélio C.;Pa...",Computer Science,17518725,2018,An Efficient Model Based on Genetic Programmin...,[doi:10.1049/iet-map.2017.0490],an efficient model based on genetic programmin...,efficient model based genetic programming spli...,efficient model based genetic programming spli...
26238,2203695060210682,Thiago José Machado,"Machado, T. J.;MACHADO, THIAGO J.",Modelagem Computacional,01704214,2016,A new one-shot pointwise source reconstruction...,[http://onlinelibrary.wiley.com/doi/10.1002/mm...,new oneshot pointwise source reconstruction me...,oneshot pointwise source reconstruction method,oneshot pointwise source reconstruction method


In [12]:
# Keep only rows with a preprocessed title; the loop fills 'pp_title' only
# for titles classified as English.
data.loc[data['pp_title'].notnull()]

Unnamed: 0,author_id,author_name,author_ref,doctorate,paper_issn,paper_ano,paper,paper_home_page,pp_title,pp_title_rm_sw,pp_tile_rm_sw_lem
0,9179745776599946,Vanderlei Moraes Rodrigues,"RODRIGUES, V. M.",Computação,01034308,2000,Using the ACL2 Theorem Prover to Reason about ...,,using the acl2 theorem prover to reason about ...,acl2 theorem prover reason vhdl components,acl2 theorem prover reason vhdl component
1,9103101956717062,Rafael Santos Coelho,"COELHO, R. S.;COELHO, RAFAEL S.",Ciência da Computação,15710653,2015,The k-hop connected dominating set problem: ha...,[doi:10.1016/j.endm.2015.07.011],the khop connected dominating set problem hard...,khop connected dominating set problem hardness...,khop connected dominating set problem hardness...
2,9120445622048393,Luiz Henrique Gomes,"GOMES, L. H.;GOMES, LUIZ H.",Ciências da Computação,02195259,2009,QUANTIFYING SOCIAL AND OPPORTUNISTIC BEHAVIOR ...,[doi:10.1142/s0219525909002088],quantifying social and opportunistic behavior ...,quantifying social opportunistic behavior emai...,quantifying social opportunistic behavior emai...
3,9120445622048393,Luiz Henrique Gomes,"GOMES, L. H.;GOMES, LUIZ H.",Ciências da Computação,01665316,2007,Workload models of spam and legitimate e-mails,,workload models of spam and legitimate emails,workload models spam legitimate emails,workload model spam legitimate email
4,9175591364526313,Lisane Brisolara de Brisolara,"BRISOLARA, Lisane;Brisolara, Lisane;BRISOLARA,...",Computação,18071953,2004,Supporting Collaboration in Distributed Design...,,supporting collaboration distributed design en...,supporting collaboration distributed design en...,supporting collaboration distributed design en...
...,...,...,...,...,...,...,...,...,...,...,...
26235,2283022405554044,Marco Aurelio Cavalcanti Pacheco,"Pacheco, M. A. C.;PACHECO, Marco Aurélio C.;Pa...",Computer Science,09204105,2017,Uncertainty quantification in reservoir simula...,[doi:10.1016/j.petrol.2017.03.046],uncertainty quantification reservoir simulatio...,uncertainty quantification reservoir simulatio...,uncertainty quantification reservoir simulatio...
26236,2283022405554044,Marco Aurelio Cavalcanti Pacheco,"Pacheco, M. A. C.;PACHECO, Marco Aurélio C.;Pa...",Computer Science,09521976,2018,Solving stochastic differential equations thro...,,solving stochastic differential equations thro...,solving stochastic differential equations gene...,solving stochastic differential equation genet...
26237,2283022405554044,Marco Aurelio Cavalcanti Pacheco,"Pacheco, M. A. C.;PACHECO, Marco Aurélio C.;Pa...",Computer Science,17518725,2018,An Efficient Model Based on Genetic Programmin...,[doi:10.1049/iet-map.2017.0490],an efficient model based on genetic programmin...,efficient model based genetic programming spli...,efficient model based genetic programming spli...
26238,2203695060210682,Thiago José Machado,"Machado, T. J.;MACHADO, THIAGO J.",Modelagem Computacional,01704214,2016,A new one-shot pointwise source reconstruction...,[http://onlinelibrary.wiley.com/doi/10.1002/mm...,new oneshot pointwise source reconstruction me...,oneshot pointwise source reconstruction method,oneshot pointwise source reconstruction method


In [18]:
# Write the rows that have a preprocessed title to a '|'-separated CSV.
# NOTE(review): the destination is hard-coded here while OUT_FILE
# ('data/data_pp.csv') is defined in the first cell and unused in the visible
# cells -- confirm which path is intended.
data.loc[data['pp_title'].notnull()].to_csv('data/all_data_clean.csv', sep='|')

In [19]:
# Same filter as the export: rows whose 'pp_title' is set.
# NOTE(review): the saved output under this cell lists columns (id, name,
# article, is_english, ...) that the cells above never create, and the
# execution counts jump from 12 to 18 -- this looks like stale output from an
# earlier run; re-run with Restart & Run All before sharing.
data.loc[data['pp_title'].notnull()]

Unnamed: 0,id,name,name_citation,doctorate,article,article_year,is_english,pp_title,pp_title_rm_sw,pp_tile_rm_sw_lem
0,3172487852109469,Abdelhakim Senhaji Hafid,"HAFID, A. S.;A. HAFID;HAFID, ABDELHAKIM;HAFID,...",Computer Science and Operational Research,Performance Management of IEEE 802.15.4 Wirele...,2015.0,,performance management of ieee wireless sensor...,performance management ieee wireless sensor ne...,performance management ieee wireless sensor ne...
1,3172487852109469,Abdelhakim Senhaji Hafid,"HAFID, A. S.;A. HAFID;HAFID, ABDELHAKIM;HAFID,...",Computer Science and Operational Research,An Integrated Predictive Mobile-Oriented Bandw...,2014.0,,an integrated predictive mobileoriented framew...,integrated predictive mobileoriented framework...,integrated predictive mobileoriented framework...
3,3172487852109469,Abdelhakim Senhaji Hafid,"HAFID, A. S.;A. HAFID;HAFID, ABDELHAKIM;HAFID,...",Computer Science and Operational Research,Cross-layer aware joint design of sensing and ...,2016.0,,crosslayer aware joint design of sensing and f...,crosslayer aware joint design sensing frame du...,crosslayer aware joint design sensing frame du...
4,3172487852109469,Abdelhakim Senhaji Hafid,"HAFID, A. S.;A. HAFID;HAFID, ABDELHAKIM;HAFID,...",Computer Science and Operational Research,An Enhanced Reservation Based Medium Access Co...,2012.0,,an enhanced reservation based medium access co...,enhanced reservation based medium access contr...,enhanced reservation based medium access contr...
5,3172487852109469,Abdelhakim Senhaji Hafid,"HAFID, A. S.;A. HAFID;HAFID, ABDELHAKIM;HAFID,...",Computer Science and Operational Research,Path-Based QoS Provisioning for Optical Burst ...,2011.0,,pathbased qos provisioning for optical burst s...,pathbased qos provisioning optical burst switc...,pathbased qos provisioning optical burst switc...
...,...,...,...,...,...,...,...,...,...,...
39313,1531713258988427,Zhao Liang,"L. ZHAO;Zhao, L.;ZHAO, LIANG;LIANG, ZHAO",Engenharia Eletrônica e Computação,Semi-Supervised Learning Guided by the Modular...,2012.0,,semisupervised learning guided by the modulari...,semisupervised learning guided modularity meas...,semisupervised learning guided modularity meas...
39314,1531713258988427,Zhao Liang,"L. ZHAO;Zhao, L.;ZHAO, LIANG;LIANG, ZHAO",Engenharia Eletrônica e Computação,Selecting salient objects in real scenes: An o...,2011.0,,selecting salient objects real scenes an oscil...,selecting salient objects real scenes oscillat...,selecting salient object real scene oscillator...
39315,1531713258988427,Zhao Liang,"L. ZHAO;Zhao, L.;ZHAO, LIANG;LIANG, ZHAO",Engenharia Eletrônica e Computação,Characterizing chaotic melodies in automatic m...,2010.0,,characterizing chaotic melodies automatic musi...,characterizing chaotic melodies automatic musi...,characterizing chaotic melody automatic music ...
39316,1531713258988427,Zhao Liang,"L. ZHAO;Zhao, L.;ZHAO, LIANG;LIANG, ZHAO",Engenharia Eletrônica e Computação,Phase-disorder-induced double resonance of neu...,2010.0,,double resonance of neuronal activity,double resonance neuronal activity,double resonance neuronal activity
