In [72]:
# Count vectorize and tfidf vectorizer breakdown

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from pandas import DataFrame


In [61]:
# Count Vectorize
# Takes in multiple documents in form of a python list = message_list
# Vectorizer converts from text to numbers
def create_document_term_matrix(message_list, vectorizer):
  """Fit ``vectorizer`` on the documents and return the document-term matrix.

  Args:
    message_list: Documents to vectorize, one entry per document
      (passed straight to ``vectorizer.fit_transform``).
    vectorizer: Object providing ``fit_transform`` and either
      ``get_feature_names_out`` or the legacy ``get_feature_names``.
      It is (re)fitted by this call.

  Returns:
    DataFrame built from the dense matrix, with one column per feature name
    reported by the vectorizer.
  """
  # Learn the vocabulary and turn every message into a row of numbers.
  doc_term_matrix = vectorizer.fit_transform(message_list)
  # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
  # prefer the replacement and only fall back for older vectorizers.
  if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
  else:
    feature_names = vectorizer.get_feature_names()
  # The transform result is sparse: densify it before building the frame.
  return DataFrame(doc_term_matrix.toarray(), columns=feature_names)

In [62]:
# Two documents
# The fourth token of the first document is "Vivienne"
msg_1 = ["My name is Vivienne",
         "Please subscribe to my Youtube channel"]

In [63]:
# Bag-of-words approach: each document is split into word tokens and the tokens are counted
count_vect = CountVectorizer()

In [64]:
# As you can see, "vivienne" appears once in document 0 (value 1) and does not occur in document 1 (value 0)
# For a classification task this matrix can serve as the training features (X); the labels (y) must come from elsewhere
# Some words, such as "is" and "to", are not significant for a classification task
# Remove these unwanted words from the corpus, or give them a lower value compared to the important words
# We do this with TF-IDF: TF = Term Frequency, IDF = Inverse Document Frequency
create_document_term_matrix(msg_1, count_vect)

Unnamed: 0,channel,is,my,name,please,subscribe,to,vivienne,youtube
0,0,1,1,1,0,0,0,1,0
1,1,0,1,0,1,1,1,0,1


In [65]:
# TF (term frequency) is how often a word occurs in a document;
# IDF (inverse document frequency) lowers the weight of words that occur in many documents.
# If a word occurs multiple times in a document, we expect the value of this word to rise.
# Contains 2 documents
msg_2 = ["Vivienne is my name",
         "Vivienne likes Python programming language"]

In [66]:
# Create an instance of the TF-IDF vectorizer with its default settings
tfidf_vect = TfidfVectorizer()


In [67]:
# "vivienne" has the lowest non-zero value in the first row (the first document),
# and similarly in the second document.
# Within each document all the other non-zero values are equal; only "vivienne" differs.
# Because it occurs in both documents, its IDF goes down.
# Note: you might wonder why "is" in the first document has a larger value than "language"
# in the second document. This is because of the length of the documents:
# the first document has four words and the second has five.
# TfidfVectorizer uses raw counts for TF and, by default, scales each row to unit (L2) length,
# so the weight of a single occurrence is smaller in the document with more words.
create_document_term_matrix(msg_2, tfidf_vect)

Unnamed: 0,is,language,likes,my,name,programming,python,vivienne
0,0.534046,0.0,0.0,0.534046,0.534046,0.0,0.0,0.379978
1,0.0,0.471078,0.471078,0.0,0.0,0.471078,0.471078,0.335176


In [68]:
# "Vivienne" now occurs three times in the first document, so we expect its value in that row to increase
msg_3 = ["Vivienne Vivienne Vivienne is my name",
         "Vivienne likes Python programming language"]

In [69]:
# In the output below, "vivienne" in document 0 is 0.776515 (it was 0.379978 for msg_2); document 1 is unchanged
create_document_term_matrix(msg_3, tfidf_vect)

Unnamed: 0,is,language,likes,my,name,programming,python,vivienne
0,0.363788,0.0,0.0,0.363788,0.363788,0.0,0.0,0.776515
1,0.0,0.471078,0.471078,0.0,0.0,0.471078,0.471078,0.335176


In [70]:
# NOTE(review): msg_4 contains exactly the same documents as msg_3 —
# change a document here if a different comparison was intended
msg_4 = ["Vivienne Vivienne Vivienne is my name",
         "Vivienne likes Python programming language"]

In [71]:
# The output below has the same values as the matrix produced for msg_3
create_document_term_matrix(msg_4, tfidf_vect)

Unnamed: 0,is,language,likes,my,name,programming,python,vivienne
0,0.363788,0.0,0.0,0.363788,0.363788,0.0,0.0,0.776515
1,0.0,0.471078,0.471078,0.0,0.0,0.471078,0.471078,0.335176
