-
Notifications
You must be signed in to change notification settings - Fork 0
/
_vectorizer.py
22 lines (15 loc) · 889 Bytes
/
_vectorizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
from typing import Any, Dict, List, Tuple
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
class Vectorizer:
    """Thin wrapper around a scikit-learn text vectorizer.

    Holds either a ``TfidfVectorizer`` or a ``CountVectorizer`` in
    ``self.vectorizer`` and exposes fitting on a corpus and transforming
    further documents with the fitted vocabulary.
    """

    def __init__(self, is_tfidf: bool = True):
        """Create the underlying vectorizer.

        Args:
            is_tfidf: When truthy (the default) a ``TfidfVectorizer`` is
                used; otherwise a ``CountVectorizer``.
        """
        self.vectorizer = TfidfVectorizer() if is_tfidf else CountVectorizer()

    def create_vector_matrix(self, texts: Dict[str, str]) -> Tuple[Any, List[str]]:
        """Fit the vectorizer on ``texts`` and return the document-term matrix.

        Args:
            texts: Mapping whose values are the documents. Keys are not
                used; matrix rows follow the mapping's iteration order.

        Returns:
            A tuple ``(vector_matrix, feature_names)`` where
            ``vector_matrix`` is the result of ``fit_transform`` (a sparse
            matrix of shape ``[n_samples, n_features]``) and
            ``feature_names`` is the list of tokens, one per matrix column.
        """
        corpus: List[str] = list(texts.values())  # each element is one document
        vector_matrix = self.vectorizer.fit_transform(corpus)

        # ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed
        # in 1.2. Prefer its replacement, but keep working on old installs.
        get_names = getattr(self.vectorizer, "get_feature_names_out", None)
        if get_names is None:
            get_names = self.vectorizer.get_feature_names

        # The newer API returns an ndarray; normalise to the declared list.
        feature_names: List[str] = list(get_names())
        return vector_matrix, feature_names

    def transform_documents_to_vectormatrix(self, documents: List[str]) -> Any:
        """Vectorize ``documents`` with the already-fitted vectorizer.

        Args:
            documents: Raw text documents to transform.

        Returns:
            Whatever ``self.vectorizer.transform`` returns for ``documents``.
        """
        return self.vectorizer.transform(documents)