In [13]:
import nltk 
text = "Coffee and Coding is the best event at Queen's University today!"

# Tokenization: WhitespaceTokenizer splits on whitespace only, so punctuation
# stays attached to its word (see "Queen's" and 'today!' in the output).
# `tokens` is reused by the stemming and lemmatization cells further down.
tokenizer = nltk.tokenize.WhitespaceTokenizer()
tokens = tokenizer.tokenize(text)
tokens

['Coffee',
 'and',
 'Coding',
 'is',
 'the',
 'best',
 'event',
 'at',
 "Queen's",
 'University',
 'today!']

In [11]:
# Same sentence with TreebankWordTokenizer: the possessive "'s" and the trailing
# "!" come out as separate tokens (compare with the whitespace output).
# NOTE: the result is only displayed, not stored, so `tokens` still holds the
# whitespace tokens; only the name `tokenizer` is rebound here.
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokenizer.tokenize(text)

['Coffee',
 'and',
 'Coding',
 'is',
 'the',
 'best',
 'event',
 'at',
 'Queen',
 "'s",
 'University',
 'today',
 '!']

In [14]:
# Stemming: map every whitespace token to its Porter stem and glue the stems
# back together with single spaces. Stems are not necessarily dictionary
# words (e.g. 'coffe', 'univers' in the output).
stemmer = nltk.stem.PorterStemmer()
" ".join(map(stemmer.stem, tokens))

"coffe and code is the best event at queen' univers today!"

In [15]:
# Lemmatization: look up each whitespace token with the WordNet lemmatizer and
# re-join the results with single spaces.
# The lemmatizer gets its own name instead of reusing `stemmer`: rebinding
# `stemmer` to a WordNetLemmatizer (which has no .stem method) would break the
# stemming cell's join expression if it were re-run after this cell.
# lemmatize() is called without a part-of-speech argument; for this sentence
# the output is identical to the input.
lemmatizer = nltk.stem.WordNetLemmatizer()
" ".join(lemmatizer.lemmatize(token) for token in tokens)

"Coffee and Coding is the best event at Queen's University today!"

In [16]:
# Named-entity recognition (NER) with spaCy, plus a displaCy visualization.
# Prerequisite (run once in a shell, not in this cell):
#   python -m spacy download en_core_web_sm

import spacy
from spacy import displacy
import en_core_web_sm

# Load the small English pipeline and run it over the sample sentence.
nlp = en_core_web_sm.load()
doc = nlp(text)

# (entity text, entity label) pairs for every entity the pipeline found.
label = [(entity.text, entity.label_) for entity in doc.ents]
print(label)

# style='dep' draws the dependency parse of the sentence; use style='ent'
# instead to highlight the named entities printed above.
displacy.render(doc, style='dep', jupyter=True)

[('Coffee and Coding', 'ORG'), ("Queen's University", 'ORG'), ('today', 'DATE')]


In [17]:
import pandas as pd

# Bag of words: one row per document, one column per vocabulary term,
# each cell holding the raw count of that term in that document.
from sklearn.feature_extraction.text import CountVectorizer 

# NOTE: this rebinds `text` from the single sentence used in the cells above
# to a list of documents; the TF-IDF cell below reads this list.
text = ["Coffee and Coding is the best event at Queen's University today!", 
        "The IKEA catalog is the most widely printed book in history", 
        "It is my favourite book"]

# stop_words='english' removes common English words (e.g. "is", "the") from the vocabulary.
data_vectorizer = CountVectorizer(stop_words='english')
data_feature = data_vectorizer.fit_transform(text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. toarray() yields a plain ndarray
# rather than the np.matrix that todense() returns.
data_frame = pd.DataFrame(
    data=data_feature.toarray(),
    columns=data_vectorizer.get_feature_names_out(),
)
data_frame

Unnamed: 0,best,book,catalog,coding,coffee,event,favourite,history,ikea,printed,queen,today,university,widely
0,1,0,0,1,1,1,0,0,0,0,1,1,1,0
1,0,1,1,0,0,0,0,1,1,1,0,0,0,1
2,0,1,0,0,0,0,1,0,0,0,0,0,0,0


In [18]:
# TF-IDF: same document/term layout as the bag-of-words table, but each cell
# holds a TF-IDF weight instead of a raw count.
# Relies on `text` (the list of documents) and `pd` from the bag-of-words cell.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_feature = tfidf_vectorizer.fit_transform(text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. toarray() yields a plain ndarray
# rather than the np.matrix that todense() returns.
data_frame = pd.DataFrame(
    data=tfidf_feature.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
data_frame

Unnamed: 0,best,book,catalog,coding,coffee,event,favourite,history,ikea,printed,queen,today,university,widely
0,0.377964,0.0,0.0,0.377964,0.377964,0.377964,0.0,0.0,0.0,0.0,0.377964,0.377964,0.377964,0.0
1,0.0,0.322002,0.423394,0.0,0.0,0.0,0.0,0.423394,0.423394,0.423394,0.0,0.0,0.0,0.423394
2,0.0,0.605349,0.0,0.0,0.0,0.0,0.795961,0.0,0.0,0.0,0.0,0.0,0.0,0.0
