In [9]:
import pandas as pd
import string

from time import time
from pathlib import Path

from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
#from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Working directory: the input pickle is read from here and the generated
# corpus files are written below it. Created on first run if missing.
DATA_PATH = './data'
Path(DATA_PATH).mkdir(exist_ok=True)

# English defaults shared by remove_stopwords / stem_text.
DEFAULT_STOPWORDS = stopwords.words('english')
DEFAULT_STEMMER = SnowballStemmer('english')

In [10]:
# Data Read
# Load the xz-compressed pickled DataFrame stored under DATA_PATH, report its
# shape and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
data_file = '{}/df_lattes.pkl.xz'.format(DATA_PATH)
%time df = pd.read_pickle(data_file, compression='xz')
print('{} df shape'.format(df.shape))
df.head(10)

CPU times: user 447 ms, sys: 23 ms, total: 470 ms
Wall time: 480 ms
(172165, 3) df shape


Unnamed: 0,id,titulo,ano
0,101982954114164,3D reconstruction methods for digital preserva...,2014
1,101982954114164,3D Viewer Software Build Based on Scanned Synt...,2016
2,102488447573085,The Globalization Strategy of a High-Tech Mult...,1996
3,102488447573085,Information Systems as an Instrument of Qualit...,2000
4,102488447573085,The Experience of a Cardiology Unit in the Dev...,2002
5,102488447573085,Information Systems as an Instrument for Quali...,2002
6,102488447573085,Clustering and Categorization Applied to Crypt...,2006
7,102488447573085,Criptoanalisys Outwit using Context Sensitive ...,2016
8,102488447573085,Cryptographic Algorithm Identification Using M...,2016
9,103102694865890,Influence of baroclinic sistems in severe rain...,2011


In [3]:
# Discard the 'id' and 'ano' columns; only the title text is used from here on.
df.drop(columns=['id', 'ano'], inplace=True)

In [4]:
# Normalization: seed the working 'clean' column with a copy of the raw titles.
df.insert(loc=1, column='clean', value=df['titulo'])

def remove_separation(doc):
    """Return `doc` with every '-' and '/' replaced by a single space."""
    separators_to_space = str.maketrans({'-': ' ', '/': ' '})
    return doc.translate(separators_to_space)

def strip_ponct(doc):
    """Return `doc` with every character of `string.punctuation` deleted."""
    # Third argument of str.maketrans lists the characters to delete.
    drop_punct = str.maketrans('', '', string.punctuation)
    return doc.translate(drop_punct)

def normaliza(doc):
    """Normalize a title.

    Applies, in order: remove_separation, strip_ponct, then lower-casing,
    and returns the resulting string.
    """
    return strip_ponct(remove_separation(doc)).lower()

# Normalize every title in 'clean' and report the elapsed wall-clock time.
t = time()
df['clean'] = df['clean'].map(normaliza)
elapsed_mins = round((time() - t) / 60, 2)

print('Time: {} mins'.format(elapsed_mins))
df.head(10)

Time: 0.04 mins


Unnamed: 0,titulo,clean
0,3D reconstruction methods for digital preserva...,3d reconstruction methods for digital preserva...
1,3D Viewer Software Build Based on Scanned Synt...,3d viewer software build based on scanned synt...
2,The Globalization Strategy of a High-Tech Mult...,the globalization strategy of a high tech mult...
3,Information Systems as an Instrument of Qualit...,information systems as an instrument of qualit...
4,The Experience of a Cardiology Unit in the Dev...,the experience of a cardiology unit in the dev...
5,Information Systems as an Instrument for Quali...,information systems as an instrument for quali...
6,Clustering and Categorization Applied to Crypt...,clustering and categorization applied to crypt...
7,Criptoanalisys Outwit using Context Sensitive ...,criptoanalisys outwit using context sensitive ...
8,Cryptographic Algorithm Identification Using M...,cryptographic algorithm identification using m...
9,Influence of baroclinic sistems in severe rain...,influence of baroclinic sistems in severe rain...


In [5]:
# Stop-word removal / stemming
def remove_stopwords(doc, stop_words=DEFAULT_STOPWORDS):
    """Return the tokens of `doc` that are not in `stop_words`, in order.

    Args:
        doc: iterable of tokens.
        stop_words: collection of tokens to discard
            (defaults to DEFAULT_STOPWORDS).

    Returns:
        A list with the retained tokens.
    """
    # With a list/tuple every `in` test is a linear scan; hash it once per
    # call instead. Any other container is used as given so that its own
    # membership semantics are preserved.
    if isinstance(stop_words, (list, tuple)):
        stop_words = frozenset(stop_words)
    return [w for w in doc if w not in stop_words]

def stem_text(doc, stemmer=DEFAULT_STEMMER):
    """Return a list holding `stemmer.stem(token)` for each token of `doc`."""
    return list(map(stemmer.stem, doc))

def preprocess(doc):
    """Turn a normalized title into a space-joined string of stems.

    Steps: tokenize with word_tokenize, drop stop words, stem the remaining
    tokens, discard stems of length 1 or less, join with single spaces.
    """
    stems = stem_text(remove_stopwords(word_tokenize(doc)))
    return ' '.join(stem for stem in stems if len(stem) > 1)

# Run the token pipeline over every title and report the elapsed wall-clock time.
t = time()
df['clean'] = df['clean'].map(preprocess)
# Number of whitespace-separated tokens left in each cleaned title.
df['w_count'] = df['clean'].str.split().str.len()
elapsed_mins = round((time() - t) / 60, 2)

print('Time: {} mins'.format(elapsed_mins))
df.head(10)

Time: 1.12 mins


Unnamed: 0,titulo,clean,w_count
0,3D reconstruction methods for digital preserva...,3d reconstruct method digit preserv cultur her...,8
1,3D Viewer Software Build Based on Scanned Synt...,3d viewer softwar build base scan synthet fema...,11
2,The Globalization Strategy of a High-Tech Mult...,global strategi high tech multin corpor case s...,12
3,Information Systems as an Instrument of Qualit...,inform system instrument qualiti program healt...,7
4,The Experience of a Cardiology Unit in the Dev...,experi cardiolog unit develop qualiti program ...,10
5,Information Systems as an Instrument for Quali...,inform system instrument qualiti program,5
6,Clustering and Categorization Applied to Crypt...,cluster categor appli cryptanalysi,4
7,Criptoanalisys Outwit using Context Sensitive ...,criptoanalisi outwit use context sensit grammar,6
8,Cryptographic Algorithm Identification Using M...,cryptograph algorithm identif use machin learn...,8
9,Influence of baroclinic sistems in severe rain...,influenc baroclin sistem sever rainstorm buena...,8


In [6]:
# Rows whose cleaned title has at most two tokens are counted, reported and removed.
too_short = df['w_count'] <= 2
no_benefit = df.loc[too_short, 'w_count'].count()
print('Docs without benefit to train:', no_benefit)

df.drop(index=df[too_short].index, inplace=True)
# Only the 'clean' column is kept; then drop missing values and duplicate rows
# and renumber the index from zero.
df.drop(columns=['titulo', 'w_count'], inplace=True)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

Docs without benefit to train: 845


In [7]:
# Display the current (rows, columns) of df.
df.shape

(123114, 1)

In [8]:
# Persist the 'clean' column (no header, no index) both as a plain CSV and as
# an xz-compressed copy under DATA_PATH/models.
path = '{}/models'.format(DATA_PATH)
# parents=True so that a nested DATA_PATH does not make directory creation fail.
Path(path).mkdir(parents=True, exist_ok=True)

dev_file = '{}/corpus.csv'.format(path)
pre_file = '{}/corpus.csv.xz'.format(path)

# index=False is the documented way to omit the index column; the previous
# index=None only worked because None is falsy.
df.to_csv(dev_file, index=False, header=False, columns=['clean'])
df.to_csv(pre_file, index=False, header=False, compression='xz', columns=['clean'])