In [1]:
import numpy as np
import os
import datetime
import pandas as pd
# from tqdm import tqdm
from matplotlib import pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from sklearn import preprocessing
from sklearn import datasets

In [23]:
import re
from bs4 import BeautifulSoup
import sys
import unicodedata

In [27]:
from nltk import tokenize
from nltk import corpus
import nltk

In [36]:
from nltk.stem import porter
from nltk import pos_tag
from nltk import word_tokenize
from nltk import tag

In [48]:
from sklearn import preprocessing

### Cleaning Text
- Stip whitespace
- remove punctuation
- same case

In [3]:
text_one = ["  I like chocolate. ", " Please, no more! ", "Check the fridge. . . . "]
html_text = """
    <div class='sample'> <span style='font-weight:bold'> This is it </span> BLAH!! </div> """

In [18]:
# Stip Whitespace & lowercase
cleaned_one = [sent.lower().strip() for sent in text_one]
print(cleaned_one)

['i like chocolate.', 'please, no more!', 'check the fridge. . . .']


In [20]:
# Remove punctuation: could use: string.replace()
def remove_punc(my_text):
    return re.sub(r"[^\w\s]", "", my_text)
cleaned_one = [remove_punc(sent).strip() for sent in cleaned_one]
print(cleaned_one)

['i like chocolate', 'please no more', 'check the fridge']


### Parses HTML

In [22]:
soup = BeautifulSoup(html_text)
soup.find("div").text

'  This is it  BLAH!! '

### Remove Punctuation

In [25]:
# dictionary of punctuation characters with None as values
punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))

In [26]:
# Remove punctuation
[string.translate(punctuation) for string in text_one]

['  I like chocolate ', ' Please no more ', 'Check the fridge    ']

### Tokenize Text

In [34]:
my_strings = [tokenize.word_tokenize(x) for x in cleaned_one]
my_strings = my_strings[0] + my_strings[1] + my_strings[2]
my_strings

['i', 'like', 'chocolate', 'please', 'no', 'more', 'check', 'the', 'fridge']

### Remove stop words

In [35]:
# Need to download stop words first time
nltk.download('stopwords')
# load stopwords
stop_words = corpus.stopwords.words('english')
# Remove
[word for word in my_strings if word not in stop_words]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jenn\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


['like', 'chocolate', 'please', 'check', 'fridge']

### Stemming Words

In [37]:
# Create Stemmer
my_porter = porter.PorterStemmer()
# Apply Stemmer
[my_porter.stem(word) for word in my_strings]

['i', 'like', 'chocol', 'pleas', 'no', 'more', 'check', 'the', 'fridg']

### Tag Parts of Speech
NLTK uses the Penn Treebank parts for speech tags

In [39]:
my_sentence = " ".join(my_strings)
my_sentence

'i like chocolate please no more check the fridge'

In [40]:
text_tagged = pos_tag(word_tokenize(my_sentence))
text_tagged

[('i', 'NNS'),
 ('like', 'VBP'),
 ('chocolate', 'NN'),
 ('please', 'NN'),
 ('no', 'DT'),
 ('more', 'JJR'),
 ('check', 'VB'),
 ('the', 'DT'),
 ('fridge', 'NN')]

In [42]:
# Find Nouns:
[word for word, tag in text_tagged if tag in ('NN', 'NNS', 'NNP', 'NNPS')]

['i', 'chocolate', 'please', 'fridge']

In [43]:
# Sample of getting the parts of speech of tweets and one_hot encoding what is present
tweets = ["Chocolate cake is here to stay", "The buck stops here pal", "This is where I am going"]

In [45]:
tagged_tweets = []
for tweet in tweets:
    tweet_tag = pos_tag(word_tokenize(tweet))
    tagged_tweets.append([tag for word, tag in tweet_tag])
tagged_tweets

[['NNP', 'NN', 'VBZ', 'RB', 'TO', 'VB'],
 ['DT', 'NN', 'VBZ', 'RB', 'JJ'],
 ['DT', 'VBZ', 'WRB', 'PRP', 'VBP', 'VBG']]

In [50]:
# One Hot encoding
tweet_one_hot = preprocessing.MultiLabelBinarizer()
tweet_one_hot.fit_transform(tagged_tweets)

array([[0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0],
       [1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1]])

In [51]:
pd.DataFrame(tweet_one_hot.fit_transform(tagged_tweets), columns=tweet_one_hot.classes_)

Unnamed: 0,DT,JJ,NN,NNP,PRP,RB,TO,VB,VBG,VBP,VBZ,WRB
0,0,0,1,1,0,1,1,1,0,0,1,0
1,1,1,1,0,0,1,0,0,0,0,1,0
2,1,0,0,0,1,0,0,0,1,1,1,1


### Creating own Tagger

In [52]:
# Get sentences
sentences = corpus.brown.tagged_sents(categories='news')
sentences[45]

[('Robert', 'NP'),
 ('Snodgrass', 'NP'),
 (',', ','),
 ('state', 'NN'),
 ('GOP', 'NN'),
 ('chairman', 'NN'),
 (',', ','),
 ('said', 'VBD'),
 ('a', 'AT'),
 ('meeting', 'NN'),
 ('held', 'VBN'),
 ('Tuesday', 'NR'),
 ('night', 'NN'),
 ('in', 'IN'),
 ('Blue', 'JJ-TL'),
 ('Ridge', 'NN-TL'),
 ('brought', 'VBD'),
 ('enthusiastic', 'JJ'),
 ('responses', 'NNS'),
 ('from', 'IN'),
 ('the', 'AT'),
 ('audience', 'NN'),
 ('.', '.')]

In [56]:
train = sentences[:4000]
test = sentences[4000:]

In [57]:
# create backoff taggerer
unigram = tag.UnigramTagger(train)
bigram = tag.BigramTagger(train, backoff=unigram)
trigram = tag.TrigramTagger(train, backoff=bigram)

In [58]:
trigram.evaluate(test)

0.8174734002697437

### Bag of Words

In [69]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [61]:
text_data = np.array(tweets)
text_data

array(['Chocolate cake is here to stay', 'The buck stops here pal',
       'This is where I am going'], dtype='<U30')

In [65]:
my_counts = CountVectorizer()
word_bag = my_counts.fit_transform(text_data)
word_bag.toarray()

array([[0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0],
       [1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [67]:
# See the features
pd.DataFrame(word_bag.toarray(), columns=my_counts.get_feature_names())

Unnamed: 0,am,buck,cake,chocolate,going,here,is,pal,stay,stops,the,this,to,where
0,0,0,1,1,0,1,1,0,1,0,0,0,1,0
1,0,1,0,0,0,1,0,1,0,1,1,0,0,0
2,1,0,0,0,1,0,1,0,0,0,0,1,0,1


Count Vectorizer:
- Stores as a sparse matrix by default
- ngram_range parameter to determine if single words or two words or more
- stop_words parameter to remove unimportant
- vocabulary parameter to restrict to only certain words


### Weighting Word Importance
term  frequency-inverse document frequency (tf-idf)
- Term Frequency: more a word appears in a document the more likely it is important
- Document Frequency: if a word appears in every document it is probably not that important
- mulitple tf by the inverse of document frequency
- sklearn normalizes the tf-idf vectors using L2

In [70]:
text_data

array(['Chocolate cake is here to stay', 'The buck stops here pal',
       'This is where I am going'], dtype='<U30')

In [72]:
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
feature_matrix.toarray()

array([[0.        , 0.        , 0.44036207, 0.44036207, 0.        ,
        0.3349067 , 0.3349067 , 0.        , 0.44036207, 0.        ,
        0.        , 0.        , 0.44036207, 0.        ],
       [0.        , 0.46735098, 0.        , 0.        , 0.        ,
        0.35543247, 0.        , 0.46735098, 0.        , 0.46735098,
        0.46735098, 0.        , 0.        , 0.        ],
       [0.46735098, 0.        , 0.        , 0.        , 0.46735098,
        0.        , 0.35543247, 0.        , 0.        , 0.        ,
        0.        , 0.46735098, 0.        , 0.46735098]])

In [73]:
pd.DataFrame(feature_matrix.toarray(), columns=tfidf.vocabulary_)

Unnamed: 0,chocolate,cake,is,here,to,stay,the,buck,stops,pal,this,where,am,going
0,0.0,0.0,0.440362,0.440362,0.0,0.334907,0.334907,0.0,0.440362,0.0,0.0,0.0,0.440362,0.0
1,0.0,0.467351,0.0,0.0,0.0,0.355432,0.0,0.467351,0.0,0.467351,0.467351,0.0,0.0,0.0
2,0.467351,0.0,0.0,0.0,0.467351,0.0,0.355432,0.0,0.0,0.0,0.0,0.467351,0.0,0.467351
