### Домашняя работа

Группа: Т12О-101М-20

Студент: Гриньков Владислав Леонидович

Тема: Реализация наивного байесовского классификатора

In [1]:
from collections import Counter
from typing import Generic, TypeVar, List, Dict, Any


InputType = TypeVar('InputType', bound=list)
OutputType = TypeVar('OutputType')


class NaiveBayesClassifier(Generic[InputType, OutputType]):
    """
    P(class|data) = (P(data|class) * P(class)) / P(data)
    """
    def __init__(self) -> None:
        self.count_of_unique_targets: Dict[OutputType, int] = {}
        self.aprior_probability_of_target: Dict[OutputType, float] = {}

        self.unique_input_column_counts: Dict[Any, int] = {}
        self.input_column_counts_by_target: Dict[OutputType, Dict[Any, int]] = {}
        self.input_column_probabilities_by_target: Dict[OutputType, Dict[Any, float]] = {}

    def fit(self, inputs: List[InputType], targets: List[OutputType]):
        if len(inputs) != len(targets):
            raise ValueError('length of inputs and targets mismatched')

        # Count probabilities of targets
        count_of_unique_targets = Counter(targets)
        self.count_of_unique_targets.update(count_of_unique_targets)

        all_items_count = sum(self.count_of_unique_targets.values())
        self.aprior_probability_of_target = {
            target: target_items_count / all_items_count
            for target, target_items_count in self.count_of_unique_targets.items()
        }

        # Count frequencies of inputs
        for target, input in zip(targets, inputs):
            self.input_column_counts_by_target.setdefault(target, {})

            for item, count in Counter(input).items():
                self.unique_input_column_counts.setdefault(item, 0)
                self.input_column_counts_by_target[target].setdefault(item, 0)

                self.unique_input_column_counts[item] += 1
                self.input_column_counts_by_target[target][item] += count

        all_column_values_count = sum(self.unique_input_column_counts.values())
        self.input_column_probabilities_by_target = {
            target: {
                input_item: count / all_column_values_count
                for input_item, count in input_counts.items()
            }
            for target, input_counts in self.input_column_counts_by_target.items()
        }

    def predict(self, input: InputType) -> Dict[OutputType, float]:
        aposterior_target_probas: Dict[OutputType, float] = dict.fromkeys(self.count_of_unique_targets.keys(), 1.0)

        for item in input:
            for target in aposterior_target_probas:
                aposterior_target_probas[target] *= self.input_column_probabilities_by_target[target].get(item, 0)

        return aposterior_target_probas


### Тестируем классфикатор

Будем использовать датасет с spam смс'ками

In [2]:
import pandas as pd

filename = 'data/sms_spam_collection.tar.gz'

df = pd.read_csv(
    filename,
    compression='gzip',
    header=1,
    sep='\t',
    encoding='utf8',
    names=['class', 'sms_text'],
    error_bad_lines=False
)


df.head(3)

Unnamed: 0,class,sms_text
0,spam,Free entry in 2 a wkly comp to win FA Cup fina...
1,ham,U dun say so early hor... U c already then say...
2,ham,"Nah I don't think he goes to usf, he lives aro..."


Посмотрим на его размерность (2 колонки и 5571 строчка)

In [3]:
df.shape

(5571, 2)

Выкинем все NaN значения чтобы не путать классификатор

In [4]:
df = df.dropna()

Сделаем функции для преобразования текста в вектор слов

In [5]:
import string

def text_preprocess(sms_text: str) -> str:
    """Преобразование текста для анализа"""
    text_no_punctuation = ''.join([
        char
        for char in sms_text
        if char not in string.punctuation
    ])
    text_lowercase = ' '.join([
        word.lower()
        for word in text_no_punctuation.split(sep=' ')
    ])
    
    return text_lowercase

def tokenize_text(text):
    tokens = []
    try:
        processed_text = text_preprocess(text)
        tokens = processed_text.split(' ')
    except TypeError:
        print(f'Ошибка при обработке текста sms: {text}')
    return tokens

Подготовим данные на которых будет обучаться классификато

In [6]:
inputs = [tokenize_text(s) for s in df['sms_text'].tolist()]
targets = df['class'].tolist()

In [7]:
clf = NaiveBayesClassifier()

Обучаем классфикатор

In [8]:
clf.fit(inputs, targets)

А вот и тест на рандомной строчке из датасета

In [16]:
import random

idx = random.randint(0, df.shape[0])
random_input = df['sms_text'].values[idx]
random_target = df['class'].values[idx]

print(f'At index {idx}: class={random_target} text={random_input!r}\n')

probas = clf.predict(tokenize_text(random_input))
print('Predicted    :', probas)
print('Most probably:', max(probas.items(), key=lambda x: x[1]))

At index 4929: class=spam text='Hi, the SEXYCHAT girls are waiting for you to text them. Text now for a great night chatting. send STOP to stop this service'

Predicted    : {'spam': 3.691409012291068e-77, 'ham': 0.0}
Most probably: ('spam', 3.691409012291068e-77)
