In [285]:
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer

In [402]:
# Sample input for the cleaning steps below: it contains HTML tags, a URL,
# an HTML entity (&amp;) and punctuation.
mytext = "<b>Hey there!</b> log in to https://www.google.com/ .&amp;  Here it is, are you seeing this? Test corpora"

In [403]:
# Display the raw sample text before any cleaning.
mytext

'<b>Hey there!</b> log in to https://www.google.com/ .&amp;  Here it is, are you seeing this? Test corpora'

# Remove URL

In [404]:
def removeURL(inputText):
    """Return ``inputText`` with URL-like runs deleted.

    Every occurrence of the literal ``http`` together with all following
    non-whitespace characters is replaced by the empty string. Whitespace
    around the removed run is left as-is, so a double space may remain
    where the URL used to be.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", inputText)

In [405]:
# Strip the URL from the sample text (rebinding mytext) and display the result.
mytext = removeURL(mytext)
mytext

'<b>Hey there!</b> log in to  .&amp;  Here it is, are you seeing this? Test corpora'

# Remove HTML 

In [406]:
from bs4 import BeautifulSoup

In [407]:
def removeHTML(inputText):
    """Return the text content of ``inputText`` with HTML markup removed.

    The string is parsed by BeautifulSoup using the ``lxml`` parser and the
    text of the parsed document is returned.
    """
    soup = BeautifulSoup(inputText, "lxml")
    return soup.get_text()

In [408]:
# Drop the HTML tags from the URL-free text (rebinding mytext) and display the result.
mytext = removeHTML(mytext)
mytext

'Hey there! log in to  .&  Here it is, are you seeing this? Test corpora'

# Remove punctuation

In [409]:
# Print the set of characters that removePunctuation deletes.
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [410]:
def removePunctuation(inputText):
    """Return ``inputText`` with every ``string.punctuation`` character deleted.

    Characters are removed, not replaced by a space, so text on either side
    of a punctuation mark is joined together (``"don't"`` -> ``"dont"``).
    """
    # Map each punctuation code point to None, which str.translate treats as "delete".
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return inputText.translate(deletion_table)

In [411]:
# Delete punctuation from the tag-free text (rebinding mytext) and display the result.
mytext=removePunctuation(mytext)
mytext

'Hey there log in to    Here it is are you seeing this Test corpora'

# Tokenise

In [412]:
# Split the cleaned text into word tokens.
# NOTE(review): word_tokenize presumably needs NLTK tokenizer data (e.g. 'punkt')
# to be installed; no nltk.download call for it is visible in this notebook - confirm.
from nltk.tokenize import word_tokenize
my_tokens = word_tokenize(mytext)

In [413]:
# Display the token list produced by word_tokenize.
my_tokens

['Hey',
 'there',
 'log',
 'in',
 'to',
 'Here',
 'it',
 'is',
 'are',
 'you',
 'seeing',
 'this',
 'Test',
 'corpora']

In [414]:
# Re-join the tokens with single spaces; the run of spaces left behind by
# URL/punctuation removal does not survive tokenisation.
' '.join(my_tokens)

'Hey there log in to Here it is are you seeing this Test corpora'

# Remove stop words

In [415]:
from nltk.corpus import stopwords
# Make sure the stop-word corpus is available locally before it is read.
nltk.download('stopwords')
# Show the English stop-word list; note that every entry is lowercase.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utkar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [416]:
def removeStopwords(inputTokens):
    """Filter English stop words out of a sequence of tokens.

    Parameters
    ----------
    inputTokens : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list, in their
        original order.

    Notes
    -----
    Matching is exact and therefore case-sensitive. The stop-word list is
    lowercase, so capitalised tokens such as ``'Here'`` are kept; lowercase
    the tokens before calling this function if that is not wanted.
    """
    # Fetch the stop-word list once instead of once per token, and keep it in
    # a set so each membership test is a hash lookup rather than a list scan.
    english_stopwords = set(stopwords.words('english'))
    return [token for token in inputTokens if token not in english_stopwords]

In [417]:
# Filter stop words out of the token list (rebinding my_tokens) and display it.
# 'Here' is kept because the stop-word list is lowercase and the tokens have
# not been lowercased at this point.
my_tokens=removeStopwords(my_tokens)
my_tokens

['Hey', 'log', 'Here', 'seeing', 'Test', 'corpora']

# Stemming

In [418]:
from nltk.stem import PorterStemmer

In [419]:
def porterStemming(inputTokens):
    """Return a list holding the Porter stem of each token in ``inputTokens``.

    A fresh ``PorterStemmer`` is created per call; input order is preserved.
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in inputTokens]

In [420]:
# Stem the remaining tokens (rebinding my_tokens) and display the result.
my_tokens=porterStemming(my_tokens)
my_tokens

['hey', 'log', 'here', 'see', 'test', 'corpora']

# Lemmatisation

In [421]:
from nltk import WordNetLemmatizer

In [422]:
def lemmatisation(inputTokens):
    """Return a list holding the WordNet lemma of each token in ``inputTokens``.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments; input order is preserved.
    """
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, inputTokens))

In [423]:
# Lemmatise the stemmed tokens (rebinding my_tokens) and display the result.
my_tokens=lemmatisation(my_tokens)
my_tokens

['hey', 'log', 'here', 'see', 'test', 'corpus']

# Preprocessing

In [427]:
def preprocess(text):
    """Run the full cleaning pipeline on ``text`` and return one string.

    Steps, in order: remove URLs, remove HTML markup, remove punctuation,
    tokenise, drop stop words, apply Porter stemming, apply WordNet
    lemmatisation. The surviving tokens are joined with single spaces.
    """
    # String-level cleaning happens before tokenisation.
    cleaned = removePunctuation(removeHTML(removeURL(text)))
    words = word_tokenize(cleaned)
    # Token-level stages are applied one after another, in this fixed order.
    for stage in (removeStopwords, porterStemming, lemmatisation):
        words = stage(words)
    return " ".join(words)

In [428]:
# NOTE: mytext was already rebound to the URL-, HTML- and punctuation-free
# text by the cells above, so this call runs the pipeline on pre-cleaned
# input rather than on the original raw sample.
preprocess(mytext)

'hey log here see test corpus'

# Create Vocabulary

In [15]:
# Four short example documents used to build the bag-of-words vocabulary.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]


In [16]:
# Learn the vocabulary from the corpus and build the document-term count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# convert to a plain list to keep the printed form of a Python list of strings.
print(vectorizer.get_feature_names_out().tolist())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [17]:
# Convert the count matrix to a dense array for display: one row per document
# in corpus, one column per vocabulary term.
print(X.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]
