In [84]:
import numpy as np
import pandas as pd
import torch
import nltk
import re
import sklearn
from collections import Counter

In [85]:
# Toy corpus used throughout the notebook: four short sentences, each ending
# with a different punctuation mark, with "basketball" appearing in both
# capitalised and lowercase form.
docs = [
    "Tom plays soccer!",
    "Tom loves basketball.",
    "Basketball is his hobby?",
    "Sarah loves basketball;"
]

Tokenization

In [86]:
from nltk.tokenize import word_tokenize

In [87]:
# Fetch the 'punkt' NLTK resource before the word_tokenize calls below.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [88]:
# Tokenize every sentence of the corpus; the result keeps one token list per sentence.
tokenized_docs = [word_tokenize(doc) for doc in docs]

In [89]:
# Inspect the result: punctuation marks come out as separate tokens.
tokenized_docs

[['Tom', 'plays', 'soccer', '!'],
 ['Tom', 'loves', 'basketball', '.'],
 ['Basketball', 'is', 'his', 'hobby', '?'],
 ['Sarah', 'loves', 'basketball', ';']]

Converting to lowercase

In [90]:
# Lowercase each token while keeping the per-sentence nesting intact.
lowercased_docs = [list(map(str.lower, sentence_tokens)) for sentence_tokens in tokenized_docs]
lowercased_docs

[['tom', 'plays', 'soccer', '!'],
 ['tom', 'loves', 'basketball', '.'],
 ['basketball', 'is', 'his', 'hobby', '?'],
 ['sarah', 'loves', 'basketball', ';']]

Tokenization and lowercasing in one step

In [91]:
# Lowercase the raw sentence first, then tokenize it, all in one pass.
lowercased_tokenized_docs = [word_tokenize(doc.lower()) for doc in docs]
lowercased_tokenized_docs

[['tom', 'plays', 'soccer', '!'],
 ['tom', 'loves', 'basketball', '.'],
 ['basketball', 'is', 'his', 'hobby', '?'],
 ['sarah', 'loves', 'basketball', ';']]

Removing punctuation and stop words

In [92]:
import string

In [93]:
# Fetch the 'stopwords' NLTK resource read via nltk.corpus.stopwords below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [94]:
# Load the English stop-word list and preview its first ten entries.
stop_words = nltk.corpus.stopwords.words("english")
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [95]:
# Drop punctuation tokens and English stop words from every sentence.
#
# `token in string.punctuation` is a *substring* test against one long string,
# so multi-character punctuation tokens (e.g. "...", "--") would slip through.
# Instead, a token counts as punctuation when every one of its characters is a
# punctuation character. Sets make both membership checks constant-time.
punctuation_chars = set(string.punctuation)
stop_word_set = set(stop_words)

filtered_docs = [
    [
        token
        for token in tokens
        if not all(char in punctuation_chars for char in token)
        and token not in stop_word_set
    ]
    for tokens in lowercased_docs
]
filtered_docs

[['tom', 'plays', 'soccer'],
 ['tom', 'loves', 'basketball'],
 ['basketball', 'hobby'],
 ['sarah', 'loves', 'basketball']]

Exploring strip() and split() methods

In [96]:
# Note: this overwrites docs[0] in place with its stripped version.
docs[0] = docs[0].strip() # remove leading and trailing whitespace

In [97]:
# Collapse extra whitespace: split on whitespace, then re-join with single spaces.
words = docs[0].split()
sent = " ".join(words)

In [98]:
# split() only cuts on whitespace, so punctuation stays attached to the word.
docs[0].split()

['Tom', 'plays', 'soccer!']

In [99]:
# Show the re-joined sentence.
sent

'Tom plays soccer!'

Example of removing HTML tags

In [100]:
# Sample string with HTML-style tags embedded in plain text.
text_w_html = "<head> just random text </head>. cool text"

In [101]:
# "<", then one or more characters that are not ">", then ">" - i.e. one tag.
pattern = r"<[^>]+>"

In [102]:
# Delete every match of the tag pattern; the text around the tags
# (including its spaces) is left as-is.
tag_regex = re.compile(pattern)
cleaned_sent = tag_regex.sub("", text_w_html)
cleaned_sent

' just random text . cool text'

Finding rare words with a frequency distribution

In [103]:
# Flatten the per-sentence token lists into one corpus-wide list,
# preserving sentence order and token order.
all_tokens = []
for doc_tokens in filtered_docs:
    all_tokens.extend(doc_tokens)
all_tokens

['tom',
 'plays',
 'soccer',
 'tom',
 'loves',
 'basketball',
 'basketball',
 'hobby',
 'sarah',
 'loves',
 'basketball']

In [104]:
# Count how often each token occurs across the whole corpus.
fdist = nltk.FreqDist(all_tokens)

In [105]:
# Show the token -> count mapping.
fdist

FreqDist({'basketball': 3, 'tom': 2, 'loves': 2, 'plays': 1, 'soccer': 1, 'hobby': 1, 'sarah': 1})

In [106]:
# Look up the count of a single token.
fdist['tom']

2

In [107]:
# N() is the total number of counted tokens (11 here, the length of all_tokens).
fdist.N()

11

In [108]:
# A token is "rare" when its count is below this fraction of all counted tokens.
RARE_TOKEN_FRACTION = 0.1

# Compute the cut-off once instead of once per token.
rare_threshold = fdist.N() * RARE_TOKEN_FRACTION

# Iterate over the distinct tokens (dict.fromkeys keeps first-occurrence order)
# so that a rare token occurring more than once is listed only once.
rare_tokens = [token for token in dict.fromkeys(all_tokens) if fdist[token] < rare_threshold]
rare_tokens

['plays', 'soccer', 'hobby', 'sarah']

Stemming

In [109]:
# Porter stemmer instance reused for every token below.
stemmer = nltk.stem.PorterStemmer()
stemmer

<PorterStemmer>

In [110]:
# Re-display the input tokens for comparison with the stemmed output below.
all_tokens

['tom',
 'plays',
 'soccer',
 'tom',
 'loves',
 'basketball',
 'basketball',
 'hobby',
 'sarah',
 'loves',
 'basketball']

In [111]:
# Stem every token. Stems need not be dictionary words
# (e.g. 'basketball' -> 'basketbal', 'hobby' -> 'hobbi').
stemmed_tokens = list(map(stemmer.stem, all_tokens))
stemmed_tokens

['tom',
 'play',
 'soccer',
 'tom',
 'love',
 'basketbal',
 'basketbal',
 'hobbi',
 'sarah',
 'love',
 'basketbal']

Lemmatization

In [112]:
# Fetch the 'wordnet' NLTK resource before using the WordNetLemmatizer below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [113]:
# WordNet-based lemmatizer instance reused for every token below.
lemmatizer = nltk.stem.WordNetLemmatizer()

In [114]:
# Re-display the input tokens for comparison with the lemmatized output below.
all_tokens

['tom',
 'plays',
 'soccer',
 'tom',
 'loves',
 'basketball',
 'basketball',
 'hobby',
 'sarah',
 'loves',
 'basketball']

In [115]:
# Lemmatize every token using the lemmatizer's default arguments.
# Unlike the stems above, the results stay real words
# ('basketball', 'hobby' are unchanged).
lemmatized_tokens = list(map(lemmatizer.lemmatize, all_tokens))
lemmatized_tokens

['tom',
 'play',
 'soccer',
 'tom',
 'love',
 'basketball',
 'basketball',
 'hobby',
 'sarah',
 'love',
 'basketball']

Part of speech tagging

In [116]:
# Fetch the 'averaged_perceptron_tagger' NLTK resource before the pos_tag call below.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [117]:
# Tag each token with a part-of-speech label; the result is a list of
# (token, tag) pairs.
# NOTE(review): all_tokens is the lowercased, stop-word/punctuation-filtered
# stream flattened across all sentences, so the tagger never sees the original
# sentences (the names 'tom' and 'sarah' come out as plain 'NN'). Tagging each
# list in tokenized_docs separately would presumably give better tags - confirm.
tagged_tokens = nltk.pos_tag(all_tokens)
tagged_tokens

[('tom', 'NN'),
 ('plays', 'VBZ'),
 ('soccer', 'NN'),
 ('tom', 'NN'),
 ('loves', 'VBZ'),
 ('basketball', 'NN'),
 ('basketball', 'NN'),
 ('hobby', 'NN'),
 ('sarah', 'NN'),
 ('loves', 'VBZ'),
 ('basketball', 'NN')]