## Count vectorizer

 Converts a collection of text documents into a matrix of token counts. Each row in the resulting matrix represents a document, and each column represents a word from the corpus, also known as a token. The values in the matrix represent the frequency of occurrence of the corresponding token in the corresponding document.



In [2]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import bucket

corpus_slav = bucket.f_slavic_gpt

vectorizer = CountVectorizer()
corpus_norwa = bucket.f_norwa_list
X = vectorizer.fit_transform(corpus_norwa)
Y = vectorizer.fit_transform(corpus_slav)
print(X[::-10])
X

  (0, 24)	1
  (1, 20)	1
  (2, 1)	1
  (3, 12)	1
  (4, 22)	1


<46x45 sparse matrix of type '<class 'numpy.int64'>'
	with 46 stored elements in Compressed Sparse Row format>

In [6]:
Y

<20x19 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [153]:
v = vectorizer.get_feature_names_out()
type(v)

numpy.ndarray

In [4]:
X.toarray()
Y.toarray()

type(X)

scipy.sparse._csr.csr_matrix

In [155]:
v

array(['ada', 'alma', 'amalie', 'anna', 'astrid', 'aurora', 'eline',
       'ella', 'ellinor', 'emilie', 'emma', 'eva', 'frida', 'hanna',
       'hedda', 'iben', 'ida', 'ilde', 'ingrid', 'leah', 'linnea', 'live',
       'maia', 'maja', 'maria', 'marie', 'mathilde', 'maya', 'mia',
       'nora', 'norah', 'oline', 'olivia', 'saga', 'sara', 'sarah',
       'selma', 'sofia', 'sofie', 'solveig', 'sophia', 'sophie', 'thea',
       'tiril', 'zara'], dtype=object)

In [156]:
print(vectorizer.vocabulary_.get('alma'))
a = np.array(X.toarray())
a

1


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Memory optimization

In [184]:
import sys

# np array
print(f'{a.shape} {type(a)} with values {type(a[0][0])} (Boolean) \n Memory (bytes):  {sys.getsizeof(a)}')

(46, 45) <class 'numpy.ndarray'> with values <class 'numpy.int64'> (Boolean) 
 Memory (bytes):  16688


In [180]:
#scipy sparse csr
print(X.shape, type(X), "\n Memory bytes =", sys.getsizeof(X))

(46, 45) <class 'scipy.sparse._csr.csr_matrix'> 
 Memory bytes = 48


In [188]:
# array with strings
print(v.shape, type(v), type(v[0][0]), "\n Memory (bytes)=", sys.getsizeof(v))

(45,) <class 'numpy.ndarray'> <class 'str'> 
 Memory (bytes)= 472


In [7]:
# Train

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
labels = np.zeros(46)

clf = MultinomialNB()
clf.fit(X, labels)

In [17]:
predicted_label = clf.predict(Y)

ValueError: X has 19 features, but MultinomialNB is expecting 45 features as input.

## Dict vectorizer

In [161]:
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
