## Handling Text

### Part 1: Cleaning Strings

#### Removing periods and surrounding whitespace

In [1]:
# Sample titles padded with stray leading/trailing whitespace
text_data = [
    " Interrobang. By Aishwarya Henriette         ",
    "Parking And Going. By Karl Gautier             ",
    " Today Is The night. By Jarek Prakash     ",
]

# Trim the whitespace at both ends of every title
text_data = list(map(str.strip, text_data))

# Drop every period
text_data = [title.replace(".", "") for title in text_data]


# Going further: any custom function can be applied the same way
def uppercase(string: str) -> str:
    """Return ``string`` converted to upper case."""
    return string.upper()


list(map(uppercase, text_data))

['INTERROBANG BY AISHWARYA HENRIETTE',
 'PARKING AND GOING BY KARL GAUTIER',
 'TODAY IS THE NIGHT BY JAREK PRAKASH']

#### Removing punctuation

In [2]:
import unicodedata
import sys

# Sample strings loaded with punctuation
text_data = [
    'Hi!!!! I. Love. This. Song....',
    '10000% Agree!!!! #LoveIT',
    'Right?!?!',
]

# Translation table: every code point whose Unicode category starts
# with 'P' is mapped to None, which str.translate treats as "delete"
punctuations = {
    code_point: None
    for code_point in range(sys.maxunicode)
    if unicodedata.category(chr(code_point)).startswith('P')
}

# Strip the punctuation from each string
[text.translate(punctuations) for text in text_data]

# Note: removing punctuation can be useful at times, but keep in mind
# that punctuation also carries information about emotion

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

#### Tokenizing Text: split a string into words

In [3]:
from nltk.tokenize import word_tokenize

# Sentence to split into word tokens
string = "The science of today is the technology of tomorrow"

# Tokenize the sentence into words
words = word_tokenize(string)

# End the cell on the result so the tokens are actually displayed
# (a cell that ends on an assignment shows nothing)
words

#### Tokenizing Text: split a string into sentences

In [4]:
import nltk
from nltk.tokenize import sent_tokenize

# Sample text made of two sentences
string = "The science of today is the technology of tomorrow. Tomorrow is today."

# Break the text into its individual sentences; the list is the cell output
sent_tokenize(
    string
)

['The science of today is the technology of tomorrow.', 'Tomorrow is today.']

#### Removing Stop Words 

In [5]:
from nltk.corpus import stopwords

# Word tokens to filter
tokenized_words = ['i',
 'am',
 'going',
 'to',
 'go',
 'to',
 'the',
 'store',
 'and',
 'park']

# Load the English stop-word list
stop_words = stopwords.words('english')

# Build a set once so each membership test is a hash lookup rather than
# a scan through the whole stop-word list
stop_word_set = set(stop_words)

# Remove stop words. The result is bound to a name because only the last
# expression of a cell is displayed; a bare expression here would be
# computed and then thrown away.
filtered_words = [word for word in tokenized_words if word not in stop_word_set]

# Show the first five stop words
stop_words[:5]

# Note: the membership test is case-sensitive, so lowercase all words first

['i', 'me', 'my', 'myself', 'we']

#### Stemming Words: find the root of each word by removing affixes

In [6]:
from nltk.stem.porter import PorterStemmer

# Tokens to reduce to their stems
tokenized_words = [
    'i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting',
]

# Instantiate the Porter stemmer
porter = PorterStemmer()

# Run the stemmer over every token
list(map(porter.stem, tokenized_words))

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

#### Tagging Parts of Speech

In [7]:
from nltk import pos_tag
from nltk import word_tokenize

# Sentence to tag
text_data = "Chris loved outdoor running"

# Tokenize the sentence, then label each token with the pretrained
# part-of-speech tagger; the result is a list of (word, tag) pairs
text_tagged = pos_tag(word_tokenize(text_data))

# Keep only the words whose tag is one of the four NN* tags
[
    word
    for word, tag in text_tagged
    if tag in ('NN', 'NNS', 'NNP', 'NNPS')
]


['Chris']

#### Example with Tweets

In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

# Example tweets
tweets = [
    "I am eating a burrito for breakfast",
    "Political science is an amazing field",
    "San Francisco is an awesome city",
]

# For every tweet: tokenize it, tag the tokens, and keep only the tags
tagged_tweets = [
    [tag for _, tag in nltk.pos_tag(word_tokenize(tweet))]
    for tweet in tweets
]

# Encode each tweet's tags as a row of binary indicator features
one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(tagged_tweets)

# Tag names, one per feature column
one_hot_multi.classes_

array(['DT', 'IN', 'JJ', 'NN', 'NNP', 'PRP', 'VBG', 'VBP', 'VBZ'],
      dtype=object)

#### Create Custom Tagger

In [None]:
from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# Tagged sentences from the 'news' category of the Brown Corpus
sentences = brown.tagged_sents(categories='news')

# The first 4000 sentences train the taggers; the remainder is held out
# for testing
train = sentences[:4000]
test = sentences[4000:]

# Backoff chain: the trigram tagger falls back to the bigram tagger,
# which in turn falls back to the unigram tagger
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)

# Show accuracy on the held-out sentences.
# Recent NLTK releases deprecate `evaluate` in favour of `accuracy`;
# prefer the new name and fall back to the old one on older installs.
score_tagger = getattr(trigram, "accuracy", None) or trigram.evaluate
score_tagger(test)


#### Encoding Text as Bag of Words

In [12]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Three short documents
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

# Fit the vectorizer and build the sparse document-term count matrix
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

# Dense view of the counts (one row per document, one column per term)
mat_words = bag_of_words.toarray()

# Feature (term) names, in column order.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
# 1.2; use `get_feature_names_out()` when available and fall back to the
# old method on older installs. `list(...)` keeps `features_names` a list
# as before.
_get_feature_names = (getattr(count, "get_feature_names_out", None)
                      or count.get_feature_names)
features_names = list(_get_feature_names())

# Assemble a data frame with one named column per term
df = pd.DataFrame(mat_words, columns=features_names)
df

Unnamed: 0,beats,best,both,brazil,germany,is,love,sweden
0,0,0,0,2,0,0,1,0
1,0,1,0,0,0,1,0,1
2,1,0,1,0,1,0,0,0


#### Encoding Text as Bag of Words for a huge dataset: word combinations (n-grams)

In [13]:
# Vectorizer producing unigrams and bigrams, with English stop words
# removed and the vocabulary restricted to the single term 'brazil'
count_2gram = CountVectorizer(
    vocabulary=['brazil'],
    stop_words='english',
    ngram_range=(1, 2),
)

# Turn the documents into a bag-of-words matrix
bag = count_2gram.fit_transform(text_data)

# Dense view of the bag of words
bag.toarray()

# Mapping from each term to its column index
count_2gram.vocabulary_

{'brazil': 0}

#### Weighting Word Importance

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three short documents
text_data = np.array([
    'I love Brazil. Brazil!',
    'Sweden is best',
    'Germany beats both',
])

# Fit a TF-IDF vectorizer and build the weighted document-term matrix
vectorizer = TfidfVectorizer()
feature_matrix = vectorizer.fit_transform(text_data)

# Dense view of the TF-IDF weights
feature_matrix.toarray()

# Mapping from each term to its column index
vectorizer.vocabulary_

# Note: the more often a word appears in a document, the more weight it
# gets in that document; TF-IDF also lowers the weight of words that
# appear in many documents

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}