## Inicializando tokenizer e stop words

In [1]:
import os
import re
# import spacy
import string
import numpy as np
import pandas as pd
from typing import Union, List
from collections import Counter
# from spacy.lang.en.stop_words import STOP_WORDS
import matplotlib.pyplot as plt

%matplotlib inline

### Criando modelo e carregando stop-words

In [2]:
stop_words = np.array(['because', '’m', 'name', 'therein', '’ll', 'already', 'that',
       'hundred', 'her', 'cannot', 'before', 'ever', 'regarding', 'get',
       'these', 'as', 'if', 'when', 'onto', 'ours', 'everything', '‘re',
       'from', 'whereby', 'side', 'and', 'do', 'must', 'three',
       'throughout', 'rather', 'its', 'was', 'amount', 'whose', 'how',
       'hereby', 'top', 'see', 'quite', 'thus', 'further', 'last',
       'myself', 'enough', 'himself', 'formerly', 'herself', 'more',
       'whereafter', 'per', 'yourselves', 'us', 'various', 'everywhere',
       'five', 'next', 'below', 'she', 'through', 'once', 'eight',
       'which', 'most', 'be', 'above', 'whither', 'wherein', 'up',
       'fifty', 'back', 'in', 'seeming', '’ve', 'after', 'full', 'mine',
       'yours', 'here', 'out', 'those', 'n‘t', 'eleven', 'all', 'same',
       'is', 'however', 'became', 'not', 'either', 'within', 'a', 'part',
       'nobody', 'did', 'without', 'many', 'but', 'might', 'nine', 'nor',
       'twenty', 'whatever', '’s', 'go', 'former', 'no', 'so', "'ll",
       'beside', 'therefore', 'about', 'hers', '‘s', 'third', 'much',
       "n't", 'everyone', 'own', 'over', '‘ve', "'ve", 'any', 'other',
       'during', 'else', 'still', 'towards', 'bottom', 'his', 'together',
       'perhaps', 'though', 'whole', 'besides', 'yourself', 'who',
       'using', 'noone', 'made', 'been', 'alone', 'whom', 'around',
       'please', 'along', 'are', 'thereupon', 'such', 'latterly', 'very',
       'sixty', 'anywhere', 'an', 'am', 'mostly', 'since', 'were',
       'become', 'first', 'less', 'moreover', '‘d', 'even', 'does', '’d',
       'each', 'now', 'while', 'indeed', 'our', 'becoming', 'empty',
       'some', 'unless', 'their', 'both', 'give', 'your', 'anything',
       'whereupon', 'nothing', 'of', 'neither', 'upon', 'beyond', 'least',
       'say', 'would', '‘ll', 'just', 'every', 'hereupon', 'via', 'down',
       'me', 'although', 'into', 'almost', 'seems', 'my', 'becomes',
       'whereas', 'latter', 'seem', 'then', 'he', 'serious', 'for',
       'front', 'the', 'can', 'few', 're', 'you', 'by', 'could', '’re',
       'to', 'six', 'elsewhere', 'than', 'well', "'d", 'namely', 'under',
       'i', 'someone', 'until', 'anyhow', 'move', 'itself', 'whether',
       'put', 'hence', 'toward', 'never', 'often', 'thru', 'or', 'with',
       'meanwhile', 'on', 'off', 'at', 'twelve', 'seemed', 'four', 'used',
       'done', 'two', 'otherwise', 'beforehand', 'hereafter', 'amongst',
       'across', 'between', 'due', 'they', 'call', 'may', 'afterwards',
       '‘m', "'s", 'one', 'wherever', 'we', 'always', 'has', 'against',
       'doing', 'being', 'n’t', 'another', 'should', 'ca', 'except',
       'thereby', 'what', 'him', 'forty', 'keep', 'show', 'themselves',
       'sometimes', 'whence', 'anyone', 'fifteen', 'it', 'somewhere',
       'also', 'take', 'nowhere', 'this', 'nevertheless', 'anyway',
       'ourselves', 'will', 'something', 'have', 'there', 'thence', 'why',
       "'re", 'ten', 'too', 'thereafter', 'none', 'make', 'somehow',
       'only', 'others', "'m", 'whoever', 'several', 'sometime', 'among',
       'had', 'behind', 'whenever', 'yet', 'them', 'really', 'again',
       'where', 'herein'], dtype='<U12')

### Carregando o dataset

In [3]:
imdb_dataset_filepath = os.path.abspath("imdb.csv")
imdb_dataset = pd.read_csv(imdb_dataset_filepath, names=["Text", "Prediction"], sep="\t")
imdb_dataset.head()

Unnamed: 0,Text,Prediction
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


### Criação do BoW

In [26]:
class SIA:
    bow: np.ndarray
    dbow: np.ndarray
    dbow_0: np.ndarray
    dbow_1: np.ndarray
    word_count: Counter
    sentences: List[str]
    processed_sentences: List[str]
    classification: List[int]
    detectors: List[np.ndarray]
    vectorized_format_text: object

    def __init__(self, sentences: Union[List[str], np.ndarray], classification: Union[List[int], np.ndarray]) -> None:
        self.sentences = sentences
        self.classification = classification
        self.vectorized_format_text = np.vectorize(self.format_text)

    def format_text(self, text: str) -> str:
        # remove caracteres que não sejam letras e numeros
        fixed_text = re.sub(r"[^A-Za-z\s]", "", text)
        fixed_text = re.sub(r"\s{2,}", r" ", fixed_text).casefold()
        fixed_text = re.sub(r"^\s+|\s+$", "", fixed_text)
        tokens = fixed_text.split()
        words = [token for token in tokens if token not in stop_words]
        return " ".join(words)
    
    def pre_process(self) -> None:
        self.processed_sentences = self.vectorized_format_text(self.sentences)
        self.set_bow()
        self.set_dbow()

    def set_bow(self) -> None:
        all_tokens = np.array(("".join(self.processed_sentences).split()))
        self.word_count = Counter(all_tokens)
        self.bow = np.array([*self.word_count.keys()])
        print(f"bow with {self.bow.shape[0]} words")
    
    def set_dbow(self) -> None:
        self.dbow = self.generate_dbow(self.sentences.shape[0], self.processed_sentences)
        self.dbow_0 = self.dbow[np.where(self.classification == 0)]
        self.dbow_1 = self.dbow[np.where(self.classification == 1)]
    
    def generate_dbow(self, X_size: int, sentences: Union[List[str], np.ndarray]) -> np.ndarray:
        dbow = np.zeros((X_size, self.bow.shape[0]))
        for i in range(X_size):
            sentence_counter = Counter(sentences[i].split())
            for j in range(dbow.shape[1]):
                dbow[i][j] = 1 if sentence_counter[self.bow[j]] > 0 else 0
        return dbow

    def generate_detectors(self, number_of_detectors: int) -> None:
        self.detectors = []
        while len(self.detectors) < number_of_detectors:
            candidate_detector = self.generate_candidate_detector()
            for detector in range(self.dbow_0):
                if self.match(detector, candidate_detector):
                    break
            else:
                self.detectors.append(candidate_detector)

    def generate_candidate_detector(self, word_probability = 0.01) -> np.ndarray:
        detector = np.zeros(self.bow.shape[0])
        for i in range(detector.shape[0]):
            if np.random.rand() < word_probability:
                detector[i] = 1
        return detector

    def match(self, match_set: np.ndarray, detector: np.ndarray, threshold = 5) -> bool:
        return ((match_set == detector).astype(int).sum() >= threshold)

    def detect(self, sentences: np.ndarray) -> np.ndarray:
        classification_results = np.zeros(sentences.shape[0], np.int8)
        pre_processed_sentences = self.vectorized_format_text(sentences)
        detect_dbow = self.generate_dbow(pre_processed_sentences.shape[0], pre_processed_sentences)

        for i in range(detect_dbow.shape[0]):
            for detector in self.detectors:
                if self.match(detect_dbow[i], detector):
                    classification_results[i] = 1
                    break
        return classification_results

    def export(self, file_path = os.getcwd()) -> None:
        with open(os.path.abspath(os.path.join(file_path, "bow.txt")), "w+") as f:
            f.write(str(list(self.bow)))
        with open(os.path.abspath(os.path.join(file_path, "dbow.txt")), "w+") as f:
            f.write(str(list(self.dbow)))

In [27]:
s = SIA(imdb_dataset["Text"].to_numpy(), imdb_dataset["Prediction"].to_numpy())

In [28]:
s.pre_process()

bow with 3229 words


In [29]:
print(s.sentences[0])
print(s.processed_sentences[0])

A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  
slowmoving aimless movie distressed drifting young man


In [30]:
s.word_count.most_common(10)

[('movie', 112),
 ('film', 111),
 ('bad', 44),
 ('good', 41),
 ('like', 40),
 ('great', 32),
 ('characters', 29),
 ('acting', 29),
 ('movies', 28),
 ('time', 27)]

In [31]:
s.generate_detectors(1)

KeyboardInterrupt: 

In [10]:
s.detectors

[array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 1., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 1., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 1., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 1.])]

In [11]:
s.detect(imdb_dataset.Text)

dbow [1. 1. 1. ... 0. 0. 0.] detector [0. 0. 0. ... 0. 0. 0.]
dbow [1. 1. 1. ... 0. 0. 0.] detector [0. 0. 0. ... 0. 0. 0.]
dbow [1. 1. 1. ... 0. 0. 0.] detector [0. 0. 0. ... 1. 0. 0.]
dbow [1. 1. 1. ... 0. 0. 0.] detector [0. 0. 0. ... 0. 0. 0.]
dbow [1. 1. 1. ... 0. 0. 0.] detector [0. 0. 0. ... 1. 0. 0.]
dbow [1. 1. 1. ... 0. 0. 0.] detector [0. 0. 0. ... 0. 0. 0.]
dbow [1. 1. 1. ... 0. 0. 0.] detector [0. 0. 0. ... 0. 0. 0.]
dbow [1. 1. 1. ... 0. 0. 0.] detector [0. 0. 0. ... 1. 0. 0.]
dbow [1. 1. 1. ... 0. 0. 0.] detector [0. 0. 0. ... 0. 0. 0.]
dbow [1. 1. 1. ... 0. 0. 0.] detector [0. 0. 0. ... 0. 0. 1.]
dbow [0. 0. 0. ... 0. 0. 0.] detector [0. 0. 0. ... 0. 0. 0.]
dbow [0. 0. 0. ... 0. 0. 0.] detector [0. 0. 0. ... 0. 0. 0.]
dbow [0. 0. 0. ... 0. 0. 0.] detector [0. 0. 0. ... 1. 0. 0.]
dbow [0. 0. 0. ... 0. 0. 0.] detector [0. 0. 0. ... 0. 0. 0.]
dbow [0. 0. 0. ... 0. 0. 0.] detector [0. 0. 0. ... 1. 0. 0.]
dbow [0. 0. 0. ... 0. 0. 0.] detector [0. 0. 0. ... 0. 0. 0.]
dbow [0.

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [21]:
pre_processed_sentences = s.vectorized_format_text(imdb_dataset["Text"])
new_dbow = s.generate_dbow(pre_processed_sentences.shape[0], pre_processed_sentences)

In [22]:
pre_processed_sentences[0]

'slowmoving aimless movie distressed drifting young man'

In [101]:
s.match(new_dbow[0], )

TypeError: match() missing 1 required positional argument: 'detector'

In [90]:
s.bow[np.where(new_dbow[0]>0)]

array(['slowmoving', 'aimless', 'movie', 'distressed', 'drifting',
       'young', 'man'], dtype='<U34')

'slowmoving'

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=993f98d4-e474-42c5-8e20-241471545034' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>