In [None]:
!pip3 install --user --upgrade pip 
!pip3 install --user nltk

In [None]:
import nltk
# Corpora and models used by the cells below.
# NLTK 3.9+ loads the tokenizer and tagger from 'punkt_tab' and
# 'averaged_perceptron_tagger_eng'; older releases use 'punkt' and
# 'averaged_perceptron_tagger'. Some releases also need 'omw-1.4' for the
# WordNet lemmatizer. Fetching both generations keeps the notebook runnable
# whichever NLTK version the unpinned install above resolved to.
NLTK_RESOURCES = (
    'wordnet',
    'omw-1.4',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'gutenberg',
    'stopwords',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

In [None]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize 

# Sample paragraph: first split into sentences, then into word-level tokens.
corpus = (
    'Tokenization is the process of tokenizing or splitting a string, '
    'text into a list of tokens. '
    'One can think of token as parts like a word is a token in a sentence, '
    'and a sentence is a token in a paragraph.'
)

for tokenize in (sent_tokenize, word_tokenize):
    print(tokenize(corpus))

In [None]:
from nltk.stem import WordNetLemmatizer

# Lemmatize isolated words with the default part of speech.
lemmatizer = WordNetLemmatizer()
corpus = ['rocks', 'gone', 'better']

print(list(map(lemmatizer.lemmatize, corpus)))

In [None]:
from nltk import pos_tag

def lemmatize_sent(text):
    """Lemmatize every token of ``text`` using its part-of-speech tag.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
    The first two characters of each tag select the POS argument passed to
    the module-level ``lemmatizer``; tags without a mapping fall back to 'n'.

    Returns the lemmas as a list, in token order.
    """
    tag_to_pos = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}
    tagged_tokens = pos_tag(word_tokenize(text))
    return [
        lemmatizer.lemmatize(token, pos=tag_to_pos.get(tag[:2], 'n'))
        for token, tag in tagged_tokens
    ]

# Compare lemmatizing each token on its own against POS-aware lemmatization.
sentence = 'He is walking to school'
word_by_word = [lemmatizer.lemmatize(token) for token in word_tokenize(sentence)]
print('lemmatize word by word: ', word_by_word)
with_context = lemmatize_sent(sentence)
print('lemmatize with context: ', with_context)

In [None]:
from nltk.stem import PorterStemmer

# Stem a few sample words with the Porter stemmer.
stemmer = PorterStemmer()
corpus = ['rocks', 'going', 'history']
print(list(map(stemmer.stem, corpus)))

In [None]:
from nltk.corpus import gutenberg
import time
 
def timing(func):
    """Decorator that prints how long each call to ``func`` takes.

    After every call the wrapper prints the wrapped function's name and the
    elapsed time in milliseconds, then returns whatever ``func`` returned.
    If ``func`` raises, the exception propagates and nothing is printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same call signature and metadata as ``func``.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)  # preserve __name__/__doc__ of the timed function
    def decorate(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - start) * 1000
        print("%-30s: %-7.2f ms" % (func.__name__, elapsed_ms))
        return result
    return decorate

@timing
def stemming(text):
    """Stem every token of ``text`` with the module-level ``stemmer``.

    The stems are discarded; the function exists only to be timed by
    ``@timing``.
    """
    # Tokenize the argument itself. The previous version read the global
    # `sentence`, so the benchmark never processed the text passed in.
    for word in word_tokenize(text):
        stemmer.stem(word)

@timing
def lemmatize(text):
    """Run ``lemmatize_sent`` on ``text`` and discard the result.

    Wrapped by ``@timing``, so each call prints this function's name and
    the elapsed time in milliseconds.
    """
    lemmatize_sent(text)

@timing
def lemmatize_without_context(text):
    """Lemmatize every token of ``text`` without part-of-speech information.

    Each token is passed to the module-level ``lemmatizer`` with its default
    POS. The lemmas are discarded; the function exists only to be timed by
    ``@timing``.
    """
    # Tokenize the argument itself. The previous version read the global
    # `sentence`, so the benchmark never processed the text passed in.
    for word in word_tokenize(text):
        lemmatizer.lemmatize(word)

# Time the three normalization strategies on the same Gutenberg text.
book = gutenberg.raw("austen-sense.txt")

for benchmark in (stemming, lemmatize, lemmatize_without_context):
    benchmark(book)

In [None]:
from nltk.corpus import stopwords

corpus = ['I', 'am', 'a', 'boy']
# Build the stop-word set once instead of re-creating it for every token.
english_stopwords = set(stopwords.words('english'))
# NLTK's English stop words are lowercase, so compare case-insensitively;
# otherwise capitalized stop words such as 'I' are not filtered out.
print([w for w in corpus if w.lower() not in english_stopwords])

In [None]:
!pip3 install --user scikit-learn

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: learn the vocabulary and count term occurrences per document.
vectorizer = CountVectorizer()
corpus = [
    'He is a teacher',
    'I am student',
    'She is also a student',
]
X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() keeps the printed output a plain Python list.
print(vectorizer.get_feature_names_out().tolist())
print(list(vectorizer.stop_words_))
print(X.toarray())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: same vocabulary as the count model, weighted by inverse document
# frequency.
vectorizer = TfidfVectorizer()
corpus = [
    'He is a teacher',
    'I am student',
    'She is also a student',
]
X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() keeps the printed output a plain Python list.
print(vectorizer.get_feature_names_out().tolist())
print(X.toarray())

In [None]:
!pip3 install --user numpy

In [None]:
import numpy as np

# Hand-computed TF-IDF vector for the sentence "He is a teacher".
# Each weight is tf * (ln((N + 1) / (df + 1)) + 1) with N = 3 documents,
# followed by L2 normalization of the resulting vector.
tfidf_he = 1/3 * (np.log((3+1)/(1+1))+1)
tfidf_is = 1/3 * (np.log((3+1)/(2+1))+1)
tfidf_teacher = 1/3 * (np.log((3+1)/(1+1))+1)

tfidf_vector = [tfidf_he, tfidf_is, tfidf_teacher]
l2_norm = np.sqrt(tfidf_he*tfidf_he + tfidf_is*tfidf_is + tfidf_teacher* tfidf_teacher)
print(np.divide(tfidf_vector, l2_norm))

In [None]:
!pip3 install --user gensim

In [None]:
import gensim.downloader
from gensim.models import Word2Vec

# Load the embeddings through gensim's downloader API.
WORD2VEC_MODEL_NAME = 'word2vec-google-news-300'
word2vec = gensim.downloader.load(WORD2VEC_MODEL_NAME)

In [None]:
# Nearest neighbours of 'car' in the embedding space, then its raw vector.
print(word2vec.most_similar('car'))
# word_vec() is deprecated in Gensim 4; get_vector() is the supported call.
print(word2vec.get_vector('car'))


In [None]:
!pip3 install tensorflow