# Preprocessing Text

In [1]:
import nltk
import pandas as pd
import numpy as np

# Raise the per-cell character limit so the long strings and token lists
# shown in the DataFrames below are rendered in full instead of truncated.
pd.set_option("display.max_colwidth", 500)

### Define a sentence to begin with

In [2]:
# Triple quotes (""" ... """) let a single string literal span multiple lines;
# the line break inside the literal is stored in `text` as a "\n" character.
text = """Overfitting means that a model we've trained was trained too well and it's fitting too closely to the training dataset. 
A model has been overfit when the model is too complex (i.e. too many features/variables compared to the number of observations)"""

# Preprocessing

### Expand contractions

In [6]:
# A contraction is a shortened form such as "we're" (for "we are");
# contractions.fix() expands these back to their full form.
import contractions

def fix_contractions(text):
    """Return `text` with its contractions expanded via `contractions.fix`.

    For example, "we've" becomes "we have" and "it's" becomes "it is".
    """
    return contractions.fix(text)

no_contractions = fix_contractions(text)

# Compare the raw text with the contraction-expanded text, one stage per row.
pd.DataFrame({"Before": [text], "After": [no_contractions]}).transpose()

Unnamed: 0,0
Before,Overfitting means that a model we've trained was trained too well and it's fitting too closely to the training dataset. \nA model has been overfit when the model is too complex (i.e. too many features/variables compared to the number of observations)
After,Overfitting means that a model we have trained was trained too well and it is fitting too closely to the training dataset. \nA model has been overfit when the model is too complex (i.e. too many features/variables compared to the number of observations)


### Tokenize text

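NLTK's `word_tokenize` function is more sophisticated than Python's built-in `str.split`. Rather than splitting only on whitespace, `word_tokenize` treats the text linguistically and handles compound terms, contractions, and punctuation much better than `split` — for example, it separates the sentence-ending period and the parentheses into tokens of their own.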

In [7]:
from nltk import word_tokenize

def tokenize_text(text):
    """Split `text` into a list of word and punctuation tokens.

    Delegates to nltk's `word_tokenize` and returns its result unchanged.
    """
    return word_tokenize(text)

tokens = tokenize_text(no_contractions)

# Compare the expanded text with its token list, one stage per row.
pd.DataFrame({"Before": [no_contractions], "After": [tokens]}).transpose()

Unnamed: 0,0
Before,Overfitting means that a model we have trained was trained too well and it is fitting too closely to the training dataset. \nA model has been overfit when the model is too complex (i.e. too many features/variables compared to the number of observations)
After,"[Overfitting, means, that, a, model, we, have, trained, was, trained, too, well, and, it, is, fitting, too, closely, to, the, training, dataset, ., A, model, has, been, overfit, when, the, model, is, too, complex, (, i.e, ., too, many, features/variables, compared, to, the, number, of, observations, )]"


### Lowercase words

In [8]:
def to_lowercase(tokens):
    """Return a new list holding the lowercase form of every token.

    The input sequence is not modified; token order is preserved.
    """
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

lower_tokens = to_lowercase(tokens)

# Show the stage variable itself: `lower_tokens` is already lowercased, so
# passing it through to_lowercase() a second time would be redundant work.
pd.DataFrame({"Before": [tokens], "After": [lower_tokens]}).T

Unnamed: 0,0
Before,"[Overfitting, means, that, a, model, we, have, trained, was, trained, too, well, and, it, is, fitting, too, closely, to, the, training, dataset, ., A, model, has, been, overfit, when, the, model, is, too, complex, (, i.e, ., too, many, features/variables, compared, to, the, number, of, observations, )]"
After,"[overfitting, means, that, a, model, we, have, trained, was, trained, too, well, and, it, is, fitting, too, closely, to, the, training, dataset, ., a, model, has, been, overfit, when, the, model, is, too, complex, (, i.e, ., too, many, features/variables, compared, to, the, number, of, observations, )]"


### Remove punctuation

The function below compiles a [translation table](https://www.programiz.com/python-programming/methods/string/maketrans) using `string.punctuation` as the set of characters to filter out, and then uses `translate()` within a list comprehension to pass each term through the translation table to remove unwanted characters.

In [9]:
import string

def remove_punctuation(tokens):
    """Strip every `string.punctuation` character from each token.

    Punctuation is deleted, not replaced, so characters on either side of it
    are joined: "i.e" becomes "ie" and "features/variables" becomes
    "featuresvariables". Tokens that consisted solely of punctuation end up
    empty and are dropped from the result.
    """
    # Mapping a character to None in a translation table deletes it.
    strip_table = str.maketrans(dict.fromkeys(string.punctuation))
    stripped = (token.translate(strip_table) for token in tokens)

    # Keep only the tokens that still have content after stripping.
    return [token for token in stripped if token]

no_punctuation = remove_punctuation(lower_tokens)

# Compare the lowercased tokens with the punctuation-free tokens, one stage per row.
pd.DataFrame({"Before": [lower_tokens], "After": [no_punctuation]}).transpose()

Unnamed: 0,0
Before,"[overfitting, means, that, a, model, we, have, trained, was, trained, too, well, and, it, is, fitting, too, closely, to, the, training, dataset, ., a, model, has, been, overfit, when, the, model, is, too, complex, (, i.e, ., too, many, features/variables, compared, to, the, number, of, observations, )]"
After,"[overfitting, means, that, a, model, we, have, trained, was, trained, too, well, and, it, is, fitting, too, closely, to, the, training, dataset, a, model, has, been, overfit, when, the, model, is, too, complex, ie, too, many, featuresvariables, compared, to, the, number, of, observations]"


### Remove Stopwords

In [10]:
from nltk.corpus import stopwords

# nltk's English stopword list plus a few extra words to filter out as well.
stop_words = stopwords.words('english') + [
    'said', 'would', 'subject', 'use', 'also', 'like',
]

def remove_stopwords(tokens, min_word_len=3, stopword_list=None):
    """Remove stopwords and short words from a list of tokenized words.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter. Matching against the stopwords is exact, so the
        tokens should already be normalized the same way as the stopwords.
    min_word_len : int, default 3
        Tokens with fewer characters than this are dropped.
    stopword_list : iterable of str, optional
        Words to remove. When omitted, the module-level `stop_words` list
        is used, which is the original behavior.

    Returns
    -------
    list of str
        The surviving tokens in their original order (duplicates are kept).
    """
    if stopword_list is None:
        stopword_list = stop_words

    # Build the lookup once per call: set membership is O(1) per token,
    # whereas testing against the list rescans it for every token.
    blocked = set(stopword_list)

    return [w for w in tokens if w not in blocked and len(w) >= min_word_len]

no_stopwords = remove_stopwords(no_punctuation)

# Compare the tokens before and after stopword/short-word removal, one stage per row.
pd.DataFrame({"Before": [no_punctuation], "After": [no_stopwords]}).transpose()

Unnamed: 0,0
Before,"[overfitting, means, that, a, model, we, have, trained, was, trained, too, well, and, it, is, fitting, too, closely, to, the, training, dataset, a, model, has, been, overfit, when, the, model, is, too, complex, ie, too, many, featuresvariables, compared, to, the, number, of, observations]"
After,"[overfitting, means, model, trained, trained, well, fitting, closely, training, dataset, model, overfit, model, complex, many, featuresvariables, compared, number, observations]"


### Perform Stemming

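Stemming is a method of reducing the inflected forms of related terms to a common root form — for example, "trained" and "training" both become "train". The resulting stem is not always a dictionary word: "many" becomes "mani".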

[More information on stemming and lemmatization.](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html)

In [11]:
from nltk.stem import PorterStemmer

def stem_words(tokens):
    """Return the Porter stem of every token, in the original order.

    A fresh `PorterStemmer` is created for each call and applied to the
    tokens one by one.
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

stems = stem_words(no_stopwords)

# Compare the filtered tokens with their stems, one stage per row.
pd.DataFrame({"Before": [no_stopwords], "After": [stems]}).transpose()

Unnamed: 0,0
Before,"[overfitting, means, model, trained, trained, well, fitting, closely, training, dataset, model, overfit, model, complex, many, featuresvariables, compared, number, observations]"
After,"[overfit, mean, model, train, train, well, fit, close, train, dataset, model, overfit, model, complex, mani, featuresvari, compar, number, observ]"


## Before and After

In [12]:
# End-to-end comparison: the original raw text against the final list of stems.
pd.DataFrame({"Before": [text], "After": [stems]}).transpose()

Unnamed: 0,0
Before,Overfitting means that a model we've trained was trained too well and it's fitting too closely to the training dataset. \nA model has been overfit when the model is too complex (i.e. too many features/variables compared to the number of observations)
After,"[overfit, mean, model, train, train, well, fit, close, train, dataset, model, overfit, model, complex, mani, featuresvari, compar, number, observ]"
