In [1]:
class CountVectorizer :
    """Count overlapping n-grams of sequence elements (characters for strings).

    ``fit`` collects every distinct n-gram found in a corpus and maps each one
    to its position in sorted order (stored in ``self.vocab``). ``transform``
    turns each corpus element into a list of counts with one entry per
    vocabulary n-gram; n-grams that are not in the vocabulary are ignored.
    """

    def __init__(self, ngram_size) :
        """Create a vectorizer for n-grams of length ``ngram_size``.

        Raises:
            ValueError: if ``ngram_size`` is smaller than 1. A non-positive
                size has no meaningful n-grams and previously made ``fit`` and
                ``transform`` loop forever.
        """
        if ngram_size < 1 :
            raise ValueError("ngram_size must be a positive integer")
        self.ngram_size = ngram_size
        self.vocab = {}

    def _ngrams(self, text) :
        """Return all overlapping n-grams of ``text`` in order of appearance."""
        size = self.ngram_size
        # An input shorter than ``size`` gives an empty range, hence no n-grams.
        return [text[start : start + size] for start in range(len(text) - size + 1)]

    def fit(self, corpus) :
        """Rebuild ``self.vocab`` from ``corpus`` (an iterable of strings).

        The vocabulary maps each distinct n-gram to its rank in sorted order.
        Any previously fitted vocabulary is replaced.
        """
        unique_tokens = set()
        for text in corpus :
            unique_tokens.update(self._ngrams(text))
        self.vocab = {token : idx for idx, token in enumerate(sorted(unique_tokens))}

    def transform(self, corpus) :
        """Return one list of n-gram counts per element of ``corpus``.

        Columns follow the sorted order of the vocabulary keys. ``corpus`` is
        traversed exactly once, so one-shot iterables such as generators work.
        """
        # Sort once, outside the per-string loop, and look columns up by token.
        column_of = {token : pos for pos, token in enumerate(sorted(self.vocab))}
        transformed_corpus = []
        for text in corpus :
            counts = [0] * len(column_of)
            for token in self._ngrams(text) :
                pos = column_of.get(token)
                if pos is not None :  # n-grams outside the vocabulary are skipped
                    counts[pos] += 1
            transformed_corpus.append(counts)
        return transformed_corpus

    def fit_transform(self, corpus) :
        """Fit the vocabulary on ``corpus`` and return its count matrix."""
        # Materialise first: the corpus is read twice (fit, then transform),
        # which would otherwise exhaust a generator after the first pass.
        corpus = list(corpus)
        self.fit(corpus)
        return self.transform(corpus)


In [2]:
# Sample corpus of three 7-character strings; the trailing comment on each
# line lists that string's overlapping bigrams in order of appearance.
corpus = [
    'AATACAT',  # 'AA', 'AT', 'TA', 'AC', 'CA', 'AT'
    'CTACCCT',  # 'CT', 'TA', 'AC', 'CC', 'CC', 'CT'
    'TACCTAC',  # 'TA', 'AC', 'CC', 'CT', 'TA', 'AC'
]

In [3]:
# Hand-computed bigram counts for `corpus`, one row per string. Columns follow
# the sorted bigram vocabulary: 'AA', 'AC', 'AT', 'CA', 'CC', 'CT', 'TA'.
correct_transformation = [
    [1, 1, 2, 1, 0, 0, 1],
    [0, 1, 0, 0, 2, 2, 1],
    [0, 2, 0, 0, 1, 1, 2],
]

In [4]:
# Vectorizer over character bigrams (ngram_size = 2).
vectorizer = CountVectorizer(2)

In [5]:
# Build the bigram vocabulary from the sample corpus.
vectorizer.fit(corpus)

{'AA': 0, 'AC': 1, 'AT': 2, 'CA': 3, 'CC': 4, 'CT': 5, 'TA': 6}


In [6]:
# Compare the computed count matrix with the hand-computed expectation.
vectorizer.transform(corpus) == correct_transformation

{'AATACAT': ['AA', 'AT', 'TA', 'AC', 'CA', 'AT'], 'CTACCCT': ['CT', 'TA', 'AC', 'CC', 'CC', 'CT'], 'TACCTAC': ['TA', 'AC', 'CC', 'CT', 'TA', 'AC']}


True

In [7]:
# Fresh, unfitted instance so that fit_transform is exercised on its own.
vectorizer = CountVectorizer(2)

In [8]:
# fit_transform should yield the same matrix as fit followed by transform.
vectorizer.fit_transform(corpus) == correct_transformation

{'AA': 0, 'AC': 1, 'AT': 2, 'CA': 3, 'CC': 4, 'CT': 5, 'TA': 6}
{'AATACAT': ['AA', 'AT', 'TA', 'AC', 'CA', 'AT'], 'CTACCCT': ['CT', 'TA', 'AC', 'CC', 'CC', 'CT'], 'TACCTAC': ['TA', 'AC', 'CC', 'CT', 'TA', 'AC']}


True

In [9]:
# Second corpus; it contains bigrams such as 'TC' and 'GG' that never occur in `corpus`.
corpus_2 = ['TCAATCAC', 'GGGGGGGGGGG', 'AAAA']

In [10]:
# Fresh bigram vectorizer for the unseen-bigram check.
vectorizer = CountVectorizer(2)

In [11]:
# Fit on the original corpus only; corpus_2 is transformed against this vocabulary.
vectorizer.fit(corpus)

{'AA': 0, 'AC': 1, 'AT': 2, 'CA': 3, 'CC': 4, 'CT': 5, 'TA': 6}


In [12]:
# Bigrams missing from the fitted vocabulary are not counted, so the all-'G'
# string maps to an all-zero row.
vectorizer.transform(corpus_2) == [
    [1, 1, 1, 2, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0],
    [3, 0, 0, 0, 0, 0, 0]
] 

{'TCAATCAC': ['TC', 'CA', 'AA', 'AT', 'TC', 'CA', 'AC'], 'GGGGGGGGGGG': ['GG', 'GG', 'GG', 'GG', 'GG', 'GG', 'GG', 'GG', 'GG', 'GG'], 'AAAA': ['AA', 'AA', 'AA']}


True

In [13]:
# Fresh bigram vectorizer for the refitting check.
vectorizer = CountVectorizer(2)

In [14]:
# Single-string corpus: the only bigram is 'AA', which occurs three times.
print(vectorizer.fit_transform(['AAAA']))

{'AA': 0}
{'AAAA': ['AA', 'AA', 'AA']}
[[3]]


In [15]:
# Refit the same instance on different data: fit rebuilds the vocabulary, so
# the result has a single 'BB' column rather than columns for both 'AA' and 'BB'.
print(vectorizer.fit_transform(['BBBB']))

{'BB': 0}
{'BBBB': ['BB', 'BB', 'BB']}
[[3]]


In [16]:
# Fresh bigram vectorizer for the partially-known / unknown input check.
vectorizer = CountVectorizer(2)

In [17]:
# The fitted vocabulary contains only the bigram 'AA'.
vectorizer.fit(['AAAA'])

{'AA': 0}


In [18]:
# 'AACC' contains one known bigram ('AA'); 'AC' and 'CC' are not in the vocabulary.
answer_1 = vectorizer.transform(['AACC'])

{'AACC': ['AA', 'AC', 'CC']}


In [19]:
# 'BBCC' shares no bigram with the fitted vocabulary.
answer_2 = vectorizer.transform(['BBCC'])

{'BBCC': ['BB', 'BC', 'CC']}


In [20]:
# Show both results: a count of 1 for the known 'AA', and 0 when nothing matches.
print(answer_1)
print(answer_2)

[[1]]
[[0]]
