Лабораторна робота №1. Фільтрація спаму на основі наївного байєсівського класифікатора.

In [11]:
import pandas as pd
import re

Зчитування датасету

In [12]:
df = pd.read_csv('spam_ham_dataset.csv')
df.drop(['Unnamed: 0', 'label'], axis='columns', inplace=True)
df = df.rename(columns={'text': '_text'})
df = df[:10000]
df.head()

Unnamed: 0,_text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0


Очистка даних

In [13]:
def prepair_text(text: str):
  res = text.replace('Subject', '')
  return ''.join(e if e.isalpha() else ' ' for e in res ).lower()

df['_text'] = df.apply(lambda row: prepair_text(row['_text']), axis=1)
df.head()

Unnamed: 0,_text,label_num
0,enron methanol meter this is a...,0
1,hpl nom for january see attached...,0
2,neon retreat ho ho ho we re around to t...,0
3,photoshop windows office cheap main ...,1
4,re indian springs this deal is to book th...,0


Бачимо що відношення неспаму до спаму 71% і 29%, тобто вибірку можемо вважати репрезентативною

In [14]:
df['label_num'].value_counts(normalize=True)

0    0.710114
1    0.289886
Name: label_num, dtype: float64

Підготовка тренувального і тестового сетів

In [15]:
data_randomized = df.sample(frac=1, random_state=1)

training_test_index = round(len(data_randomized) * 0.8)

training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)

print(training_set.shape)
print(test_set.shape)

(4137, 2)
(1034, 2)


In [16]:
training_set['label_num'].value_counts(normalize=True)

0    0.707276
1    0.292724
Name: label_num, dtype: float64

In [17]:
test_set['label_num'].value_counts(normalize=True)

0    0.72147
1    0.27853
Name: label_num, dtype: float64

Формування словнику із текстів повідомлень

In [18]:
vocabulary = []
training_set['_text'] = training_set['_text'].str.split()
for text in training_set['_text']:
   for word in text:
      vocabulary.append(word)

vocabulary = list(set(vocabulary))
len(vocabulary)

40697

Таблиця входжень слів в тексти повідомлень

In [19]:
word_counts_per_text = {unique_word: [0] * len(training_set['_text']) for unique_word in vocabulary}

for index, text in enumerate(training_set['_text']):
   for word in text:
      word_counts_per_text[word][index] += 1

word_counts = pd.DataFrame(word_counts_per_text)
word_counts.head()

Unnamed: 0,thruogh,armistice,vlm,lcuv,phharma,lstaring,bzjdl,tenuate,factua,htmlheadtitlelt,...,rebelled,adopts,gbryan,zzn,ewklfc,philips,agglomerate,supplying,looryking,fathersloveletter
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,_text,label_num,thruogh,armistice,vlm,lcuv,phharma,lstaring,bzjdl,tenuate,...,rebelled,adopts,gbryan,zzn,ewklfc,philips,agglomerate,supplying,looryking,fathersloveletter
0,"[what, the, heck, daren, now, what, i, see, ke...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[hilcorp, old, ocean, volume, according, to, g...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"[jurirne, get, latest, softwares, savings, cqu...",1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[sitara, patch, a, patch, is, being, released,...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[archived, great, shots, of, california, livin...",1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Підрахунок констант

In [21]:
spam_messages = training_set_clean[training_set_clean['label_num'] == 1]
ham_messages = training_set_clean[training_set_clean['label_num'] == 0]

p_spam = len(spam_messages) / len(training_set_clean)
p_ham = len(ham_messages) / len(training_set_clean)

n_words_per_spam_message = spam_messages['_text'].apply(len)
n_spam = n_words_per_spam_message.sum()

n_words_per_ham_message = ham_messages['_text'].apply(len)
n_ham = n_words_per_ham_message.sum()

n_vocabulary = len(vocabulary)

alpha = 1

Підрахунок параметрів

In [22]:
parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}

for word in vocabulary:
  n_word_given_spam = spam_messages[word].sum()
  p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha * n_vocabulary)
  parameters_spam[word] = p_word_given_spam

  n_word_given_ham = ham_messages[word].sum()
  p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha * n_vocabulary)
  parameters_ham[word] = p_word_given_ham

Перевірка точності моделі

In [23]:
def classify_test_set(message):
   message = prepair_text(message).split()

   p_spam_given_message = p_spam
   p_ham_given_message = p_ham

   for word in message:
      if word in parameters_spam:
         p_spam_given_message *= parameters_spam[word]

      if word in parameters_ham:
         p_ham_given_message *= parameters_ham[word]

   if p_ham_given_message > p_spam_given_message:
      return 0
   elif p_spam_given_message > p_ham_given_message:
      return 1
   else:
      return 3 # неможливо класифікувати, потрібна перевірка людини

In [24]:
test_set['predicted'] = test_set['_text'].apply(classify_test_set)
test_set.head()

Unnamed: 0,_text,label_num,predicted
0,enron hpl actuals for june teco...,0,0
1,anouncing a new player in the market qbbcpry...,1,3
2,re republic royalty done daren ...,0,3
3,instructions to remove spyware adware infe...,1,3
4,discounts on microsoft adobe autode...,1,1


In [25]:
correct = 0
total = test_set.shape[0]

for row in test_set.iterrows():
   row = row[1]
   if row['label_num'] == row['predicted']:
      correct += 1

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 632
Incorrect: 402
Accuracy: 0.6112185686653772
