In [None]:
# Toy corpus: three short sentences, each treated as one document.
corpus = [
    'Jeep is an American car',
    'Cars is an animated movie',
    'Motorsports is a dangerous sport',
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer created with its default settings (no arguments passed).
vect = CountVectorizer()
# Bare name as the last expression so the notebook displays the estimator's repr.
vect

In [None]:
# Learn the vocabulary from the corpus and encode each sentence as a row of
# word counts. The result is a sparse document-term matrix
# (one row per sentence, one column per vocabulary word).
x = vect.fit_transform(corpus)
x

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [None]:
# Densify the sparse matrix to inspect it: entry [i, j] is how many times
# vocabulary word j occurs in sentence i.
x.toarray()

array([[1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0],
       [0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1]])

In [None]:
# Vocabulary words in column order: position j here names column j of the count matrix.
vect.get_feature_names_out()

array(['american', 'an', 'animated', 'car', 'cars', 'dangerous', 'is',
       'jeep', 'motorsports', 'movie', 'sport'], dtype=object)

In [None]:
# Learned vocabulary: a dict mapping each word to its column index in the count matrix.
vect.vocabulary_

{'jeep': 7,
 'is': 6,
 'an': 1,
 'american': 0,
 'car': 3,
 'cars': 4,
 'animated': 2,
 'movie': 9,
 'motorsports': 8,
 'dangerous': 5,
 'sport': 10}

In [None]:
# Look up the column index of a specific word
# (`.get` returns None if the word is not in the vocabulary).
vect.vocabulary_.get('movie')

9

In [None]:
# Transform a sentence whose words never occurred in the corpus: none of them
# are in the fitted vocabulary, so every count in the resulting row is zero.
unseen_sentence = ['keep your distance']
vect.transform(unseen_sentence).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# The analyzer is the callable the vectorizer uses to turn raw text into tokens.
# Check it on a sample sentence: tokens come back lower-cased, without
# punctuation, and the single-character word "a" is not among them.
analyze = vect.build_analyzer()
expected_tokens = ['this', 'is', 'text', 'document', 'to', 'analyze']
analyze("This is a text document to analyze.") == expected_tokens

True

In [None]:
# Vectorizer that extracts both unigrams and bigrams (ngram_range=(1, 2)).
# The custom token_pattern also accepts single-character words such as "a",
# which the first vectorizer's vocabulary does not contain.
bigram_vect = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=r'\b\w+\b',
    min_df=1,
)

In [None]:
# Sanity-check the bigram analyzer: it yields the single words first, then
# every pair of adjacent words. Note that this rebinds the name `analyze`.
analyze = bigram_vect.build_analyzer()
expected_ngrams = [
    'bi', 'grams', 'are', 'cool',
    'bi grams', 'grams are', 'are cool',
]
analyze('Bi-grams are cool!') == expected_ngrams

True

In [None]:
# Fit the unigram+bigram vectorizer on the corpus, then densify the counts:
# one row per sentence, one column per 1-gram or 2-gram.
bigram_counts = bigram_vect.fit_transform(corpus)
y = bigram_counts.toarray()
y

array([[0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0,
        0],
       [0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
        0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,
        1]])

In [None]:
# Mapping from each learned 1-gram / 2-gram to its column index in `y`.
bigram_vect.vocabulary_

{'jeep': 17,
 'is': 14,
 'an': 4,
 'american': 2,
 'car': 9,
 'jeep is': 18,
 'is an': 16,
 'an american': 5,
 'american car': 3,
 'cars': 10,
 'animated': 7,
 'movie': 21,
 'cars is': 11,
 'an animated': 6,
 'animated movie': 8,
 'motorsports': 19,
 'a': 0,
 'dangerous': 12,
 'sport': 22,
 'motorsports is': 20,
 'is a': 15,
 'a dangerous': 1,
 'dangerous sport': 13}

In [None]:
# Column index of the bigram 'is a' in `y`.
# Index the dict directly instead of using `.get`: for a missing n-gram `.get`
# would return None, and `y[:, None]` does not fail -- NumPy treats None as
# `np.newaxis` and silently returns a reshaped view of the whole matrix.
# With `[...]`, a missing n-gram raises KeyError right here.
feature_index = bigram_vect.vocabulary_['is a']

In [None]:
# Column of `y` for the selected n-gram: its count in each corpus sentence.
y[:, feature_index]

array([0, 0, 1])

In [None]:
# Class label for each corpus sentence, in corpus order. Despite the `test_`
# prefix, this list is the training target passed to `fit` further down.
test_x = [
    'car',    # 'Jeep is an American car'
    'movie',  # 'Cars is an animated movie'
    'car',    # 'Motorsports is a dangerous sport'
]

from sklearn import svm

In [None]:
# Linear-kernel support vector classifier trained on the unigram count matrix `x`;
# `test_x` holds the class labels (one per corpus sentence).
clf_mdl = svm.SVC(kernel = 'linear')
clf_mdl.fit(x, test_x)

In [None]:
# Encode a new sentence with the already-fitted vectorizer (transform, not
# fit_transform, so the columns stay the ones the classifier was trained on).
test = vect.transform(['cars 2 is a movie too'])
# Predict its class label with the trained classifier.
p = clf_mdl.predict(test)
p

array(['movie'], dtype='<U5')