# Tokenizer

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# render matplotlib figures inline in the notebook output
%matplotlib inline
# plot styling: the seaborn "whitegrid" style is set first, then the
# matplotlib "fivethirtyeight" style sheet is applied afterwards
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

In [5]:
# toy training corpus: five short SMS-style messages, one string per document
simple_train = [
    'i will call you back later',
    'cant talk, msg me',
    'i will call you right back',
    'Call me a cab',
    'Please call me... PLEASE!',
]
simple_train

['i will call you back later',
 'cant talk, msg me',
 'i will call you right back',
 'Call me a cab',
 'Please call me... PLEASE!']

In [6]:
# CountVectorizer is used here with its default parameters
from sklearn.feature_extraction.text import CountVectorizer

# build the vectorizer and learn the training vocabulary in one step
# (fit() returns the fitted vectorizer itself, so the call can be chained)
vect = CountVectorizer().fit(simple_train)

# show the vocabulary terms learned from the training messages
vect.get_feature_names_out()

array(['back', 'cab', 'call', 'cant', 'later', 'me', 'msg', 'please',
       'right', 'talk', 'will', 'you'], dtype=object)

In [7]:
# transform training data into a 'document-term matrix' using the vocabulary
# already learned by `vect`; the bare name on the last line displays the
# matrix summary as the cell output
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm

<5x12 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [8]:
# convert the sparse matrix to a dense array so every count, including the
# zeros, is shown in the cell output
simple_train_dtm.toarray()

array([[1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1],
       [0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0]], dtype=int64)

In [9]:
# examine the vocabulary and document-term matrix together
# (get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and is what the other cells in this notebook already use)
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names_out())



Unnamed: 0,back,cab,call,cant,later,me,msg,please,right,talk,will,you
0,1,0,1,0,1,0,0,0,0,0,1,1
1,0,0,0,1,0,1,1,0,0,1,0,0
2,1,0,1,0,0,0,0,0,1,0,1,1
3,0,1,1,0,0,1,0,0,0,0,0,0
4,0,0,1,0,0,1,0,2,0,0,0,0


In [10]:
# print the matrix's concrete class, then its contents in sparse form,
# each on its own line
print(type(simple_train_dtm), simple_train_dtm, sep="\n")

<class 'scipy.sparse._csr.csr_matrix'>
  (0, 0)	1
  (0, 2)	1
  (0, 4)	1
  (0, 10)	1
  (0, 11)	1
  (1, 3)	1
  (1, 5)	1
  (1, 6)	1
  (1, 9)	1
  (2, 0)	1
  (2, 2)	1
  (2, 8)	1
  (2, 10)	1
  (2, 11)	1
  (3, 1)	1
  (3, 2)	1
  (3, 5)	1
  (4, 2)	1
  (4, 5)	1
  (4, 7)	2


In [11]:
# single example message for model testing
simple_test = [
    "please don't call me",
]

In [12]:
# transform testing data into a document-term matrix (using existing vocabulary);
# only transform() is called here, so `vect` is not re-fitted on the test text
simple_test_dtm = vect.transform(simple_test)
# dense view of the single test row for display
simple_test_dtm.toarray()

array([[0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0]], dtype=int64)

In [13]:
# label the test document's counts with the fitted vocabulary terms
pd.DataFrame(
    data=simple_test_dtm.toarray(),
    columns=vect.get_feature_names_out(),
)

Unnamed: 0,back,cab,call,cant,later,me,msg,please,right,talk,will,you
0,0,0,1,0,0,1,0,1,0,0,0,0


## THANK YOU