In [1]:
import multiprocessing
import os
import pandas as pd
import string
import re
import langid

import numpy as np
import gzip

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from nltk.corpus import stopwords

BASE_FILE = '../data/base_titulos_english.txt'
OUT_FILE = '../data/all_titles_sw.txt.gz'

## Read Data

In [2]:
file = open(BASE_FILE, 'r')
data = file.readlines()
file.close()
sw_path = os.path.abspath("../stopwords.txt")
sw = stopwords.words(sw_path)


## Remove punctuation, stopwords and invalid text

In [3]:
def to_lower(text):
    text = [word.lower() for word in text.split()]
    return " ".join(text)

In [4]:
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

In [5]:
def remove_stopwords(text, sw):
    text = [word for word in text.split() if word not in sw]
    return " ".join(text)

In [6]:
def remove_invalid(text):
    noise_file = open('../noise.txt', 'r')
    noise = noise_file.read()
    text = remove_punctuation(text)
    #text = re.sub('[^A-Z a-z _-]', '', text)
    #text = re.sub('0-9', '', text)
    text = re.sub('\n', ' ', text)
    text = re.sub(' +', ' ', text)
    text = text.split(' ')
    text2 = ''
    for index, word in enumerate(text):
        if(len(word) < 2 or len(word) > 16):
            text.pop(index)
        elif word in noise:
            text.pop(index)
        elif re.search('\d+', word):
            text.pop(index)
    for word in text:
        text2 += word + ' '
    text2 = text2[:-1]
    return text2

In [7]:
data_clean = []
for line in data:
    text = remove_punctuation(line)
    text = to_lower(text)
    #text = remove_stopwords(text, sw)
    text = remove_invalid(text)
    data_clean.append(text)
data_clean

['corneal endothelial deposits associated with rifabutin use',
 'extranodal bcell lymphoma of the uvea case report',
 'eyelid swelling as the only manifestation of ocular sarcoidosis',
 'cytomegalovirus glycoprotein genotypes and central nervous system disease aids patients',
 'and lymphocyte correlate to intensity of interface hepatitis chronic hepatitis',
 'clinical aspects of influenza h1n1 hivinfected individuals são paulo during the pandemic of',
 'tissue and serum immune response chronic hepatitis with mild histological lesions',
 'immunogenicity and reactogenicity of influenza h1n1 inactivated monovalent nonadjuvanted vaccine elderly and patients',
 'hepatitis virus and human tlymphotropic virus coinfection epidemiological clinical laboratory and features',
 'estimation of initial condition heat conduction by neural network',
 'estimation of boundary conditions heat transfer by neural networks',
 'neural network based models the inversion of temperature vertical profiles from ra

## Remove no english docs and save

In [8]:
data = ''
for document in data_clean:
    lang, log_prob = langid.classify(document)
    if lang == 'en':
        data += document + '\n'

In [9]:
with gzip.open(OUT_FILE, 'wb', compresslevel=9) as f:
    f.write(data.encode())