In [1]:
import logging
import pandas as pd
import string, html

from time import time
from pathlib import Path
from bs4 import BeautifulSoup
from collections import defaultdict

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Configure root logging at INFO level (originally set up to monitor gensim)
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

In [2]:
# Locations of the xz-compressed CSV extracts for papers and works
papers_file = './data/papers_22072019.csv.xz'
works_file  = './data/works_22072019.csv.xz'

# Shared NLP resources, used as default arguments by stem_text / remove_stopwords
default_stemmer = PorterStemmer()
default_stopwords = stopwords.words('english')

### Data Read

In [3]:
def _read_english_rows(csv_path, venue_column):
    """Load one extract and keep only complete rows marked as English.

    Rows whose 'idioma' is not 'Inglês' are discarded, the metadata columns
    (including ``venue_column``) are dropped, rows with missing values are
    removed and the index is renumbered from zero.
    """
    raw = pd.read_csv(csv_path, sep='|', compression='xz')
    english = raw[raw['idioma'] == 'Inglês']
    metadata = ['id', 'id_doc', 'autores_cnpq', 'autores',
                'idioma', venue_column, 'natureza', 'ano']
    return english.drop(columns=metadata).dropna().reset_index(drop=True)


# Extracted papers
df_papers = _read_english_rows(papers_file, 'revista')

# Extracted works
df_works = _read_english_rows(works_file, 'anais')

# Check shape
print('artigos shape:', df_papers.shape)
print('trabalhos shape:', df_works.shape)

artigos shape: (61154, 1)
trabalhos shape: (119965, 1)


### Concatena Dataframes

In [4]:
# Stack both sources, then discard rows with missing values and repeated rows
frames = [df_papers, df_works]
df = (
    pd.concat(frames)
      .dropna()
      .drop_duplicates()
      .reset_index(drop=True)
)

# Release the per-source frames; only the merged frame is used from here on
del frames, df_papers, df_works
print('docs shape:', df.shape)

docs shape: (145194, 1)


### Cleaning

In [5]:
def decode_html(doc):
    """Return ``doc`` with HTML character references (e.g. ``&amp;``) decoded."""
    unescaped = html.unescape(doc)
    return unescaped

def strip_html(doc):
    """Parse ``doc`` with BeautifulSoup's "html.parser" and return only its text."""
    return BeautifulSoup(doc, "html.parser").get_text()

def remove_separation(doc):
    """Replace every '-' and '/' in ``doc`` with a single space."""
    without_hyphens = doc.replace('-', ' ')
    return without_hyphens.replace('/', ' ')

def strip_ponct(doc):
    """Delete every character listed in ``string.punctuation`` from ``doc``."""
    # Third argument of str.maketrans lists the characters to delete
    deletion_table = str.maketrans('', '', string.punctuation)
    return doc.translate(deletion_table)

def one_space(doc):
    """Collapse whitespace runs to single spaces, trim the ends and lowercase."""
    words = doc.split()
    collapsed = ' '.join(words)
    return collapsed.lower()

def tokenize_text(doc):
    """Split ``doc`` into sentences and return all their word tokens as one flat list."""
    tokens = []
    for sentence in sent_tokenize(doc):
        tokens.extend(word_tokenize(sentence))
    return tokens

def stem_text(doc, stemmer=default_stemmer):
    """Tokenize ``doc``, stem each token with ``stemmer`` and re-join with spaces."""
    stems = (stemmer.stem(token) for token in tokenize_text(doc))
    return ' '.join(stems)

def remove_stopwords(doc, stop_words=default_stopwords):
    """Tokenize ``doc`` and return its tokens, minus stop words, joined by spaces.

    Parameters
    ----------
    doc : str
        Text to filter.
    stop_words : collection of str
        Tokens to discard. Matching is exact, so callers should pass text in
        the same case as the stop words.

    Returns
    -------
    str
        The remaining tokens separated by single spaces.
    """
    # A list/tuple forces a linear scan for every token; hashing it once per
    # call makes each membership test constant-time with identical results.
    if isinstance(stop_words, (list, tuple)):
        stop_words = frozenset(stop_words)
    return ' '.join(w for w in tokenize_text(doc) if w not in stop_words)

# Order matters: stop words are removed BEFORE stemming. The stop-word list
# holds unstemmed forms (e.g. "this", "was"), and the Porter stemmer rewrites
# several of them ("thi", "wa"); filtering after stemming lets those through.
cleaning_steps = [
    decode_html,        # HTML entities -> characters
    strip_html,         # drop markup, keep text
    remove_separation,  # '-' and '/' -> space
    strip_ponct,        # delete ASCII punctuation
    one_space,          # collapse whitespace, lowercase
    remove_stopwords,   # filter stop words while tokens are still unstemmed
    stem_text,          # stem what is left
]

t = time()
cleaned = df['titulo']
for step in cleaning_steps:
    cleaned = cleaned.apply(step)
df['clean'] = cleaned

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))
df.head(10)

Time to clean up everything: 2.09 mins


Unnamed: 0,titulo,clean
0,Isolating Complex Polynomial Roots: An algebra...,isol complex polynomi root algebra algorithm u...
1,An algebraic algorithm to Isolate Complex poly...,algebra algorithm isol complex polynomi zero u...
2,Using Computer Algebra and Chebyshev Polynomia...,use comput algebra chebyshev polynomi count po...
3,Reviewing the SacarWeb design based in discoun...,review sacarweb design base discount usabl engin
4,Extended Immersive Learning Environment: A Hyb...,extend immers learn environ hybrid remot virtu...
5,New Technologies for Information and Communica...,new technolog inform commun pwm remot experi 3...
6,School vs Industry A Relation of Competencies ...,school vs industri relat compet skill
7,3D virtual worlds using open source platform a...,3d virtual world use open sourc platform integ...
8,"Technology PLC - Power Line Communication, Use...",technolog plc power line commun use monitor sy...
9,Immersive Learning Environment Using 3D Virtua...,immers learn environ use 3d virtual world inte...


### Remove sentences <= 2 words

In [6]:
# Word count per cleaned title, computed as number of spaces + 1
word_counts = df['clean'].str.count(' ') + 1
too_short = word_counts <= 2
print('Docs without benefit to train:', too_short.sum())

# Remove wcount <= 2, drop the raw 'titulo' column, then tidy up
df = (
    df.loc[~too_short]
      .drop(columns=['titulo'])
      .dropna()
      .drop_duplicates()
      .reset_index(drop=True)
)


Docs without benefit to train: 746


### Salva Corpus

In [9]:
# Output directory; parents=True also creates missing parent directories
# instead of raising FileNotFoundError when ./data does not exist yet.
path = './data/models/'
Path(path).mkdir(parents=True, exist_ok=True)
dev_file = '%scorpus.csv' % path
pre_file = '%scorpus.csv.xz' % path

# Write the corpus without header or index: once plain, once xz-compressed
df.to_csv(dev_file, index=False, header=False)
df.to_csv(pre_file, index=False, header=False, compression='xz')