## Pré-processamento dos dados

In [None]:
!pip install bs4
!pip install nltk


In [None]:
# Import here so this cell also works on a fresh kernel (it runs before the
# cell that imports nltk for the normalisation step).
import nltk

# Fetch only what this notebook uses instead of opening the interactive
# downloader, which would block a "Run All":
#   - the Punkt tokenizer models, used by word_tokenize
#   - the stopwords corpus, used by stopwords.words('english')
# NOTE(review): which Punkt package is needed ('punkt' vs 'punkt_tab') depends on
# the installed NLTK version; an id unknown to the index is presumably only
# reported as a failed download and does not raise -- confirm.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [1]:
import os

# Current working directory of the kernel process; the input and output CSV
# paths used later in this notebook are built relative to it.
cwd = os.getcwd()

In [14]:
import pandas as pd


# Load the raw posts from <cwd>/data/posts.csv. os.path.join builds the path
# with the platform's separator instead of a hard-coded '/'.
df = pd.read_csv(os.path.join(cwd, 'data', 'posts.csv'))

In [15]:
# Summarise the loaded frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105603 entries, 0 to 105602
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Post Link         105603 non-null  int64 
 1   Title             105603 non-null  object
 2   Body              105603 non-null  object
 3   Tags              105603 non-null  object
 4   CreationDate      105603 non-null  object
 5   Answer Date       105603 non-null  object
 6   AcceptedAnswerId  105603 non-null  int64 
 7   id                105603 non-null  int64 
 8   score             105603 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 7.3+ MB


In [4]:
# Preview the first rows of the raw posts before any cleaning is applied.
df.head()

Unnamed: 0,Post Link,Title,Body,Tags,CreationDate,Answer Date,AcceptedAnswerId,id,score
0,11227809,Why is processing a sorted array faster than p...,<p>Here is a piece of C++ code that shows some...,<java><c++><performance><cpu-architecture><bra...,2012-06-27 13:51:36,2012-06-27 13:56:42,11227902,11227902,26631
1,927358,How do I undo the most recent local commits in...,<p>I accidentally committed the wrong files to...,<git><version-control><git-commit><undo>,2009-05-29 18:09:14,2009-05-29 18:13:42,927386,927386,24839
2,2003505,How do I delete a Git branch locally and remot...,<h4>Failed Attempts to Delete a Remote Branch:...,<git><version-control><git-branch><git-push><g...,2010-01-05 01:12:15,2010-01-05 01:13:55,2003515,2003515,19584
3,292357,What is the difference between 'git pull' and ...,"<p>What are the differences between <a href=""h...",<git><version-control><git-pull><git-fetch>,2008-11-15 09:51:09,2008-11-15 09:52:40,292359,292359,13375
4,231767,"What does the ""yield"" keyword do?",<p>What is the use of the <code>yield</code> k...,<python><iterator><generator>,2008-10-23 22:21:11,2008-10-23 22:48:44,231855,231855,12274


In [16]:
from bs4 import BeautifulSoup
import unicodedata
import re


def remove_html_tags_func(text):
    '''
    Strips HTML markup from a string and keeps only its text content

    Args:
        text (str): Input string that may contain HTML tags

    Returns:
        Text extracted by BeautifulSoup (using the 'html.parser' parser), string
    '''
    soup = BeautifulSoup(text, 'html.parser')
    plain_text = soup.get_text()
    return plain_text


def remove_url_func(text):
    '''
    Deletes URL-like substrings from a string

    A substring is treated as a URL when it starts with "http://", "https://"
    or "www." and runs up to the next whitespace character.

    Args:
        text (str): Input string that may contain URLs

    Returns:
        The input with every matched URL replaced by the empty string, string
    '''
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)


def remove_accented_chars_func(text):
    '''
    Reduces a string to its ASCII characters, dropping accents

    The text is NFKD-normalised so accented letters split into a base letter
    plus combining marks; every non-ASCII code point is then discarded.

    Args:
        text (str): Input string that may contain accented characters

    Returns:
        ASCII-only version of the input, string
    '''
    decomposed = unicodedata.normalize('NFKD', text)
    # 'ignore' silently drops whatever cannot be represented in ASCII.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


def remove_punctuation_func(text):
    '''
    Replaces every character that is not an ASCII letter or digit with a space

    This covers punctuation, but also whitespace and any other symbol; each
    such character becomes exactly one space, so the length is preserved.

    Args:
        text (str): Input string to clean

    Returns:
        String containing only ASCII letters, digits and spaces
    '''
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')
    cleaned = non_alphanumeric.sub(' ', text)
    return cleaned


def remove_irr_char_func(text):
    '''
    Replaces every character that is not an ASCII letter with a space

    Digits, punctuation, whitespace and all other symbols are each turned into
    one space, so the length of the string is preserved.

    Args:
        text (str): Input string to clean

    Returns:
        String containing only ASCII letters and spaces
    '''
    return re.sub(pattern=r'[^a-zA-Z]', repl=' ', string=text)


def remove_extra_whitespaces_func(text):
    '''
    Removes extra whitespaces from a string, if present

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    collapsed into a single space, and leading/trailing whitespace is removed.

    Args:
        text (str): String to which the function is to be applied, string

    Returns:
        Clean string without extra whitespaces
    '''
    # A single `\s+` is all that is needed: the former `^\s*` alternative only
    # injected a leading space that strip() removed again right away.
    return re.sub(r'\s+', ' ', text).strip()


def word_count_func(text):
    '''
    Returns how many whitespace-separated tokens a string contains

    Args:
        text (str): Input string whose words are counted

    Returns:
        Number of tokens obtained by splitting on whitespace, integer
    '''
    tokens = text.split()
    return len(tokens)

In [17]:
import nltk

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def text_normaliser(text):
    '''
    Tokenises a string, drops English stop words and stems the remaining tokens

    Args:
        text (str): String to which the function is to be applied, string

    Returns:
        Porter-stemmed tokens joined by single spaces, string
    '''
    # Build the stop-word set and the stemmer only once and cache them on the
    # function object: this function is applied row by row to a DataFrame, and
    # reloading the corpus / re-creating the stemmer on every call is wasted work.
    resources = getattr(text_normaliser, '_resources', None)
    if resources is None:
        # A frozenset gives constant-time membership tests instead of a list scan.
        resources = (frozenset(stopwords.words('english')), PorterStemmer())
        text_normaliser._resources = resources
    stop_words, porter = resources

    # Stop-word filtering is an exact string comparison on each token.
    return ' '.join(
        porter.stem(word)
        for word in word_tokenize(text)
        if word not in stop_words
    )

In [18]:
def pre_process_text(text):
    '''
    Runs the full cleaning pipeline on a raw string

    The steps are applied in this order: HTML tags, URLs, accented characters,
    punctuation, non-letter characters, extra whitespace; the result is then
    lower-cased.

    Args:
        text (str): Raw input string

    Returns:
        Cleaned, lower-cased string
    '''
    cleaning_steps = (
        remove_html_tags_func,
        remove_url_func,
        remove_accented_chars_func,
        remove_punctuation_func,
        remove_irr_char_func,
        remove_extra_whitespaces_func,
    )

    cleaned = text
    for step in cleaning_steps:
        cleaned = step(cleaned)

    return cleaned.lower()

In [19]:
# Concatenate title and body (separated by one space) into a single text field.
df['complete_text'] = df['Title'] + " " + df['Body']

In [20]:
# Clean every combined title/body string with the pre-processing pipeline.
df['complete_text'] = df['complete_text'].apply(pre_process_text)



In [22]:
# Tokenise, remove stop words and stem the cleaned text into a new column.
df['normalized_text'] = df['complete_text'].apply(text_normaliser)

In [23]:
# Preview the first rows again, now with the complete_text and normalized_text columns.
df.head()

Unnamed: 0,Post Link,Title,Body,Tags,CreationDate,Answer Date,AcceptedAnswerId,id,score,complete_text,normalized_text
0,11227809,Why is processing a sorted array faster than p...,<p>Here is a piece of C++ code that shows some...,<java><c++><performance><cpu-architecture><bra...,2012-06-27 13:51:36,2012-06-27 13:56:42,11227902,11227902,26631,why is processing a sorted array faster than p...,process sort array faster process unsort array...
1,927358,How do I undo the most recent local commits in...,<p>I accidentally committed the wrong files to...,<git><version-control><git-commit><undo>,2009-05-29 18:09:14,2009-05-29 18:13:42,927386,927386,24839,how do i undo the most recent local commits in...,undo recent local commit git accident commit w...
2,2003505,How do I delete a Git branch locally and remot...,<h4>Failed Attempts to Delete a Remote Branch:...,<git><version-control><git-branch><git-push><g...,2010-01-05 01:12:15,2010-01-05 01:13:55,2003515,2003515,19584,how do i delete a git branch locally and remot...,delet git branch local remot fail attempt dele...
3,292357,What is the difference between 'git pull' and ...,"<p>What are the differences between <a href=""h...",<git><version-control><git-pull><git-fetch>,2008-11-15 09:51:09,2008-11-15 09:52:40,292359,292359,13375,what is the difference between git pull and gi...,differ git pull git fetch differ git pull git ...
4,231767,"What does the ""yield"" keyword do?",<p>What is the use of the <code>yield</code> k...,<python><iterator><generator>,2008-10-23 22:21:11,2008-10-23 22:48:44,231855,231855,12274,what does the yield keyword do what is the use...,yield keyword use yield keyword python exampl ...


In [27]:
# Destination of the processed dataset: <cwd>/data/processed/posts.csv,
# built with os.path.join rather than a hard-coded '/' separator.
output_filepath = os.path.join(cwd, 'data', 'processed', 'posts.csv')

In [29]:
# Make sure the target directory exists before writing: to_csv does not create
# missing parent directories and would fail on a fresh checkout otherwise.
output_dir = os.path.dirname(output_filepath)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the processed posts without the DataFrame index column.
df.to_csv(output_filepath, index=False)