# Capítulo 8 - Limpando dados sujos

## Código para limpeza de dados

In [7]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

def getNgrams(content, n):
	content = content.split(' ')
	output = []
	for i in range(len(content)-n+1):
		output.append(content[i:i+n])
	return output

html = urlopen('https://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
ngrams = getNgrams(content, 2)
print(ngrams)
print(f'O número de 2-grams é {len(ngrams)}')

[['\n\nGeneral-purpose', 'programming'], ['programming', 'language\n\n\nPythonParadigmMulti-paradigm:'], ['language\n\n\nPythonParadigmMulti-paradigm:', 'object-oriented,[1]'], ['object-oriented,[1]', 'procedural'], ['procedural', '(imperative),'], ['(imperative),', 'functional,'], ['functional,', 'structured,'], ['structured,', 'reflectiveDesigned\xa0byGuido'], ['reflectiveDesigned\xa0byGuido', 'van'], ['van', 'RossumDeveloperPython'], ['RossumDeveloperPython', 'Software'], ['Software', 'FoundationFirst\xa0appeared20\xa0February'], ['FoundationFirst\xa0appeared20\xa0February', '1991;'], ['1991;', '31'], ['31', 'years'], ['years', 'ago\xa0(1991-02-20)[2]Stable'], ['ago\xa0(1991-02-20)[2]Stable', 'release3.10.7[3]\xa0\n'], ['release3.10.7[3]\xa0\n', ''], ['', ''], ['', '/'], ['/', '7'], ['7', 'September'], ['September', '2022;'], ['2022;', '12'], ['12', 'days'], ['days', 'ago\xa0(7'], ['ago\xa0(7', 'September'], ['September', '2022)Preview'], ['2022)Preview', 'release3.11.0rc2[4]\xa0\n'

In [11]:
# Usando expressões regulares para aperfeiçoar o código e limpar
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

def getNgrams(content, n):
	content = re.sub('\n|[[\d+\]]',' ', content)
	content = bytes(content, 'UTF-8')
	content = content.decode('ascii', 'ignore')
	content = content.split(' ')
	content = [word for word in content if word !='']
	output = []
	for i in range(len(content)-n+1):
		output.append(content[i:i+n])
	return output


html = urlopen('https://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
ngrams = getNgrams(content, 2)
print(ngrams)
print(f'O número de 2-grams é {len(ngrams)}')

[['General-purpose', 'programming'], ['programming', 'language'], ['language', 'PythonParadigmMulti-paradigm:'], ['PythonParadigmMulti-paradigm:', 'object-oriented,'], ['object-oriented,', 'procedural'], ['procedural', '(imperative),'], ['(imperative),', 'functional,'], ['functional,', 'structured,'], ['structured,', 'reflectiveDesignedbyGuido'], ['reflectiveDesignedbyGuido', 'van'], ['van', 'RossumDeveloperPython'], ['RossumDeveloperPython', 'Software'], ['Software', 'FoundationFirstappeared'], ['FoundationFirstappeared', 'February'], ['February', ';'], [';', 'years'], ['years', 'ago('], ['ago(', '-'], ['-', '-'], ['-', ')'], [')', 'Stable'], ['Stable', 'release'], ['release', '.'], ['.', '.'], ['.', '/'], ['/', 'September'], ['September', ';'], [';', 'days'], ['days', 'ago('], ['ago(', 'September'], ['September', ')Preview'], [')Preview', 'release'], ['release', '.'], ['.', '.'], ['.', 'rc'], ['rc', '/'], ['/', 'September'], ['September', ';'], [';', 'days'], ['days', 'ago('], ['ago(

In [13]:
# Separando o código em sentença
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string

def cleanSentence(sentence):
	sentence = sentence.split(' ')
	sentence = [word.strip(string.punctuation+string.whitespace) for word in sentence]
	sentence = [word for word in sentence if len(word)>1 or (word.lower() =='a' or word.lower()=='i')]
	return sentence

def cleanInput(content):
    content = re.sub('\n|[[\d+\]]',' ', content)
    content = bytes(content, 'UTF-8')
    content = content.decode('ascii', 'ignore')
    sentences = content.split('. ')
    return [cleanSentence(sentence) for sentence in sentences]

def getNgramsFromSentence(content, n):
	output = []
	for i in range(len(content)-n+1):
		output.append(content[i:i+n])
	return output

def getNgrams(content, n):
	content = cleanInput(content)
	ngrams=[]
	for sentence in content:
		ngrams.extend(getNgramsFromSentence(sentence,n))
	return ngrams

html = urlopen('https://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
ngrams = getNgrams(content, 2)
print(ngrams)
print(f'O número de 2-grams é {len(ngrams)}')

[['General-purpose', 'programming'], ['programming', 'language'], ['language', 'PythonParadigmMulti-paradigm'], ['PythonParadigmMulti-paradigm', 'object-oriented'], ['object-oriented', 'procedural'], ['procedural', 'imperative'], ['imperative', 'functional'], ['functional', 'structured'], ['structured', 'reflectiveDesignedbyGuido'], ['reflectiveDesignedbyGuido', 'van'], ['van', 'RossumDeveloperPython'], ['RossumDeveloperPython', 'Software'], ['Software', 'FoundationFirstappeared'], ['FoundationFirstappeared', 'February'], ['February', 'years'], ['years', 'ago'], ['ago', 'Stable'], ['Stable', 'release'], ['September', 'days'], ['days', 'ago'], ['ago', 'September'], ['September', 'Preview'], ['Preview', 'release'], ['rc', 'September'], ['September', 'days'], ['days', 'ago'], ['ago', 'September'], ['September', 'Typing'], ['Typing', 'disciplineDuck'], ['disciplineDuck', 'dynamic'], ['dynamic', 'strong'], ['strong', 'typing'], ['typing', 'gradual'], ['gradual', 'since'], ['but', 'ignored']

### Normalização de dados

In [18]:
from collections import Counter


def getNgrams(content, n):
	#content = content.upper()
	content = cleanInput(content)
	ngrams= Counter()
	for sentence in content:
		newNGrams = [' '.join(ngram) for ngram in getNgramsFromSentence(sentence,2)]
		ngrams.update(newNGrams)
	return(ngrams)

html = urlopen('https://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
ngrams = getNgrams(content, 2)
print(ngrams)
print(f'O número de 2-grams é {len(ngrams)}')

Counter({'from the': 219, 'the original': 208, 'original on': 206, 'Archived from': 199, 'on June': 61, 'Software Foundation': 38, 'of the': 37, 'Python Software': 35, 'of Python': 33, 'Retrieved February': 30, 'in Python': 28, 'in the': 26, 'Retrieved March': 26, 'such as': 22, 'Retrieved January': 22, 'van Rossum': 21, 'on May': 21, 'as a': 20, 'Retrieved May': 19, 'is a': 18, 'on October': 18, 'Retrieved June': 18, 'on December': 17, 'Retrieved September': 17, 'Retrieved November': 17, 'on April': 17, 'Retrieved April': 17, 'Retrieved July': 16, 'to the': 15, 'the Python': 14, 'to be': 14, 'can be': 14, 'be used': 14, 'for Python': 14, 'Python Enhancement': 14, 'on March': 14, 'Retrieved December': 14, 'on February': 14, 'programming language': 13, 'Enhancement Proposals': 13, 'Python is': 12, 'standard library': 12, 'to Python': 12, 'on August': 12, 'Rossum Guido': 12, 'on January': 12, 'with the': 11, 'of a': 11, 'statement which': 11, 'used to': 11, 'Python Insider': 11, 'program