In [1]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import pandas as pd

[nltk_data] Downloading package stopwords to /home/yurii/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from IPython.display import HTML

###### basic usage of regex


In [3]:
words = ['very','nice','lecture','day','moon']
expression = '|'.join(words)
expression

'very|nice|lecture|day|moon'

In [4]:
re.findall(expression, 'i attended a very nice lecture last year', re.M)

['very', 'nice', 'lecture']

###### Preprocessing the text 

In [5]:
sentence = 'John has been selected for the trial phase this time. Congrats!!'
sentence=sentence.lower()
sentence=sentence.replace('!','').replace('.', '')
words= sentence.split(' ')

# Define positive & negative words
positive_words=['awesome','good', 'nice', 'super', 'fun', 'delightful','congrats']
negative_words=['awful','lame','horrible','bad']

set(words)-set(positive_words)

{'been',
 'for',
 'has',
 'john',
 'phase',
 'selected',
 'the',
 'this',
 'time',
 'trial'}

###### Remove stopwords

In [6]:
tokens = nltk.word_tokenize(sentence)

stop_words = set(stopwords.words('english'))
new_tokens = [w for w in tokens if not w in stop_words]

print(f'Sentence with stop-words: {tokens}')
print(f'Sentence without stop-words: {new_tokens}')

Sentence with stop-words: ['john', 'has', 'been', 'selected', 'for', 'the', 'trial', 'phase', 'this', 'time', 'congrats']
Sentence without stop-words: ['john', 'selected', 'trial', 'phase', 'time', 'congrats']


In [7]:
tokens

['john',
 'has',
 'been',
 'selected',
 'for',
 'the',
 'trial',
 'phase',
 'this',
 'time',
 'congrats']

In [8]:
len(stop_words), type(stop_words)

(179, set)

###### Vectorization

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

Count how many times a specific word occurs in a text

In [10]:
vectorizer = CountVectorizer()
vectorizer_fit = vectorizer.fit_transform(sentence.split()).toarray()
feature_names = vectorizer.get_feature_names()

# print()
# print(vectorizer_fit.toarray())
print(sentence)
l = [print(row, feature_names[i]) for i, row in enumerate(vectorizer_fit)]

print('\n')

sentence2 = 'She should have been provided with a safe environment to recall her traumatic experiences. Instead she was filmed by the officer in charge on his phone as she described the ordeal.'
vectorizer_fit = vectorizer.fit_transform(sentence2.split()).toarray()
feature_names = vectorizer.get_feature_names()

l = [print(vectorizer_fit[i], word) for i, word in enumerate(feature_names)]



john has been selected for the trial phase this time congrats
[0 0 0 0 1 0 0 0 0 0 0] been
[0 0 0 1 0 0 0 0 0 0 0] congrats
[1 0 0 0 0 0 0 0 0 0 0] for
[0 0 0 0 0 0 1 0 0 0 0] has
[0 0 1 0 0 0 0 0 0 0 0] john
[0 0 0 0 0 0 0 1 0 0 0] phase
[0 0 0 0 0 0 0 0 0 0 1] selected
[0 0 0 0 0 1 0 0 0 0 0] the
[0 0 0 0 0 0 0 0 1 0 0] this
[0 0 0 0 0 0 0 0 0 1 0] time
[0 1 0 0 0 0 0 0 0 0 0] trial


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0] as
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0] been
[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] by
[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] charge
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0] described
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1] environment
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] experiences
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0] filmed
[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] have
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

###### TF-IDF  == terms frequency & inverse document frequency

$$
TF = \frac{\sum{specWords}}{\sum{Words}};
$$
$$
IDF = \log{\frac{\sum{docs}}{\sum{docsWithWord}}}
$$
Regarding to this:
$$
TF-IDF = TF * IDF == \frac{\sum{specWords}}{\sum{Words}} \cdot \log{\frac{\sum{docs}}{\sum{docsWithWord}}}
$$

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
text = 'A thorough list of publicly available NLP data sets has already been created by Nicolas Iderhoff. Beyond these, here are some projects I can recommend to any NLP novice wanting to get their hands dirty:'
text = text.split('.')
print(text)
vector = TfidfVectorizer()
X = vector.fit_transform(text)
X.todense()

['A thorough list of publicly available NLP data sets has already been created by Nicolas Iderhoff', ' Beyond these, here are some projects I can recommend to any NLP novice wanting to get their hands dirty:']


matrix([[0.26255634, 0.        , 0.        , 0.26255634, 0.26255634,
         0.        , 0.26255634, 0.        , 0.26255634, 0.26255634,
         0.        , 0.        , 0.        , 0.26255634, 0.        ,
         0.26255634, 0.26255634, 0.26255634, 0.186811  , 0.        ,
         0.26255634, 0.        , 0.26255634, 0.        , 0.26255634,
         0.        , 0.        , 0.        , 0.26255634, 0.        ,
         0.        ],
        [0.        , 0.22641916, 0.22641916, 0.        , 0.        ,
         0.22641916, 0.        , 0.22641916, 0.        , 0.        ,
         0.22641916, 0.22641916, 0.22641916, 0.        , 0.22641916,
         0.        , 0.        , 0.        , 0.1610991 , 0.22641916,
         0.        , 0.22641916, 0.        , 0.22641916, 0.        ,
         0.22641916, 0.22641916, 0.22641916, 0.        , 0.45283832,
         0.22641916]])

In [13]:
from textblob.classifiers import NaiveBayesClassifier
data = [
 ('I love my country.', 'positive'),
 ('This is an amazing place!', 'positive'),
 ('I do not like the smell of this place.', 'negative'),
 ('I do not like this restaurant', 'negative'),
 ('I am tired of hearing your nonsense.', 'negative'),
 ("I always aspire to be like him", 'positive'),
 ("It's a horrible performance.", "negative")
 ]
model = NaiveBayesClassifier(data)

In [14]:
model.classify('Dont watch it')

'positive'