In [35]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wnm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
df = pd.read_csv('./spam.csv')
df.sample(5)

Unnamed: 0,spam,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
3866,ham,"Alright we're hooked up, where you guys at",,,
427,ham,Any pain on urination any thing else?,,,
2406,ham,One of best dialogue in cute reltnship..!! \We...,Dont Come Near My Body..!! Bcoz My Hands May ...,,
5291,ham,Xy trying smth now. U eat already? We havent...,,,
5009,ham,"By the way, 'rencontre' is to meet again. Moun...",,,


In [37]:
df['spam'] = np.where(df['spam'] == 'spam', 1, 0)

In [38]:
df.sample(3)

Unnamed: 0,spam,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
5345,0,Wat М_ doing now?,,,
1558,0,Message from . I am at Truro Hospital on ext. ...,,,
4179,0,"swhrt how u dey,hope ur ok, tot about u 2day.l...",,,


In [39]:
df = df[['spam', 'message']]

In [40]:
df

Unnamed: 0,spam,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will М_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [41]:
def clean_text(text: str):
    text = text.lower()
    # Удаление спец символов
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Удаление множественных пробелов
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('\n', '')
    # Фильтрация стоп слов
    stop_words = set(stopwords.words('english'))
    filtered_sentence = [word for word in text.split(" ") if not word in stop_words]
    return ' '.join(filtered_sentence)
    #return text

In [42]:
df['message'][1]

'Ok lar... Joking wif u oni...'

In [43]:
clean_text(df['message'][1])

'ok lar joking wif u oni'

In [44]:
df['message'] = df['message'].apply(clean_text)

In [45]:
df.sample(3)

Unnamed: 0,spam,message
5134,0,sday joinedso training started today
4334,0,u sound like manky scouse boy stevelike travel...
932,0,samus shoulders yet


In [46]:
counts = df['spam'].value_counts()
spam_perc = round((counts[1] / df.shape[0]) * 100, 2)
print(f'Spam letters: {spam_perc}%, Normal letters {abs(100 - spam_perc)}%')

Spam letters: 13.41%, Normal letters 86.59%


In [47]:
counts

spam
0    4825
1     747
Name: count, dtype: int64

#### Делаю кол-во спам писем == не спаму (747 спам писем)

In [48]:
non_spam_sampled = df[df['spam'] == 0].sample(n=747, replace=False)
balanced_df = pd.concat([non_spam_sampled, df[df['spam'] == 1]])
balanced_df = balanced_df.sample(frac=1).reset_index(drop=True)
print(balanced_df['spam'].value_counts())

spam
1    747
0    747
Name: count, dtype: int64


In [49]:
#df = df.sample(frac=1) # Несбалансированная база
df = balanced_df.sample(frac=1) # Сбалансированная база спам == не спам
valid_size = 1000 # Размер валидационной выборки
x_train = df[0:-valid_size]
x_valid = df[-valid_size:df.shape[0]]

In [50]:
spam_prob = {}
non_spam_prob = {}
total_word_count = 0

def calc_word_probs(dtfr):
    global spam_prob, non_spam_prob, total_word_count
    
    spam: pd.DataFrame = dtfr[dtfr['spam'] == 1]
    non_spam: pd.DataFrame = dtfr[dtfr['spam'] == 0]
    
    spam_word_count = 0
    non_spam_word_count = 0

    # Count occurrences in spam
    for _, row in spam.iterrows():
        for word in row['message'].split():
            if word == '': continue
            spam_prob[word] = spam_prob.get(word, {'cnt': 0, 'val': 0})
            spam_prob[word]['cnt'] += 1
            spam_word_count += 1

    # Count occurrences in non-spam
    for _, row in non_spam.iterrows():
        for word in row['message'].split():
            if word == '': continue
            non_spam_prob[word] = non_spam_prob.get(word, {'cnt': 0, 'val': 0})
            non_spam_prob[word]['cnt'] += 1
            non_spam_word_count += 1

    total_word_count = spam_word_count + non_spam_word_count

    # Apply smoothing and calculate probabilities
    all_words = set(spam_prob.keys()).union(set(non_spam_prob.keys()))
    for word in all_words:
        if word not in spam_prob:
            spam_prob[word] = {'cnt': 0, 'val': 0}
        if word not in non_spam_prob:
            non_spam_prob[word] = {'cnt': 0, 'val': 0}

        # Apply Laplace smoothing
        spam_prob[word]['val'] = (spam_prob[word]['cnt'] + 1) / (spam_word_count + len(all_words))
        non_spam_prob[word]['val'] = (non_spam_prob[word]['cnt'] + 1) / (non_spam_word_count + len(all_words))
calc_word_probs(x_train)

In [51]:
def print_top_10(table: dict):
    item_list = [(word, data) for word, data in table.items()]
    sorted_spam_prob = sorted(item_list, key=lambda x: x[1]['val'], reverse=True)
    top_10_spam_words = sorted_spam_prob[:10]

    for word, data in top_10_spam_words:
        print(f"Word: '{word}' Count: {data['cnt']}, Value: {data['val']}")

In [52]:
print_top_10(spam_prob)

Word: 'call' Count: 276, Value: 0.01969287643964169
Word: 'free' Count: 177, Value: 0.0126546281814304
Word: '2' Count: 141, Value: 0.010095265178444475
Word: 'u' Count: 124, Value: 0.008886677093701124
Word: 'txt' Count: 118, Value: 0.00846011659320347
Word: 'ur' Count: 115, Value: 0.008246836342954643
Word: 'mobile' Count: 103, Value: 0.0073937153419593345
Word: '4' Count: 98, Value: 0.00703824825821129
Word: 'text' Count: 96, Value: 0.006896061424712071
Word: 'claim' Count: 90, Value: 0.006469500924214418


In [53]:
print_top_10(non_spam_prob)

Word: 'u' Count: 115, Value: 0.012824765063571034
Word: 'im' Count: 65, Value: 0.007296849087893864
Word: 'got' Count: 38, Value: 0.004311774461028192
Word: '2' Count: 36, Value: 0.004090657822001105
Word: 'call' Count: 34, Value: 0.003869541182974019
Word: 'ill' Count: 34, Value: 0.003869541182974019
Word: 'get' Count: 33, Value: 0.0037589828634604756
Word: 'dont' Count: 32, Value: 0.003648424543946932
Word: 'know' Count: 31, Value: 0.0035378662244333887
Word: 'ok' Count: 30, Value: 0.0034273079049198453


#### Вычисляю вероятность спама для 1 слова

In [54]:
test_word = "free"
P_SPAM = spam_prob[test_word]['val']
P_NOT_SPAM = (1 - spam_prob[test_word]['val'])

P_S = P_SPAM/(P_NOT_SPAM + P_SPAM)
P_S

0.0126546281814304

#### Вычисление вероятности спамовости предложения по формуле из лекции

In [56]:
def bayes_calc(sentence: str):
    clean_sent = clean_text(sentence)
    p_spam = 1 # произведение вероятностей спама слова F_i
    p_non_spam = 1 # произведение вероятностей не спама слова F_i
    p_word_given_spam = 0.5 # prior probability of spam
    p_word_given_non_spam = 0.5 # prior probability of non-spam
    
    for word in clean_sent.split():
        if word == '': continue
        
        if word in spam_prob:
            p_word_given_spam = spam_prob[word]['val']
        else:
            # Apply smoothing for unseen words
            p_word_given_spam = 1 / (sum(spam_prob[word]['cnt'] for word in spam_prob) + len(spam_prob))
        
        if word in non_spam_prob:
            p_word_given_non_spam = non_spam_prob[word]['val']
        else:
            # Apply smoothing for unseen words
            p_word_given_non_spam = 1 / (sum(non_spam_prob[word]['cnt'] for word in non_spam_prob) + len(non_spam_prob))

        p_spam *= p_word_given_spam
        p_non_spam *= p_word_given_non_spam

    # Calculate the posterior probability P(Spam|Sentence)
    p_spam_sentence = p_spam
    p_non_spam_sentence = p_non_spam
    
    return p_spam_sentence / (p_spam_sentence + p_non_spam_sentence)


0.98014403037333


In [57]:
bayes_calc("Good day everyone, the meeting is going to be today at 5 am") 

0.0002483200815954522

In [58]:
bayes_calc("get free money get credit from out company")

0.98014403037333

In [59]:
threshold = 0.5
def is_spam(sent):
    return 1 if bayes_calc(sent) > threshold else 0

In [60]:
is_spam("get free money get credit from out company")

1

In [61]:
is_spam("Good day everyone, the meeting is going to be today at 5 am")

0

In [62]:
is_spam("call free to ur txt") # Топ спам слова

1

In [63]:
right = 0 
error = 0
valid_size = len(x_valid)
for _, row in x_valid.iterrows():
    y_pred = is_spam(row['message'])
    y_true = row['spam'] 
    right += y_true == y_pred
    error += y_true != y_pred

In [64]:
right

273

In [65]:
error

27

In [66]:
valid_size

300

#### Наверно надо выбрать ещё нормальную метрику отклонений для бинарный классификации

In [67]:
print(f"Correct: {round(right/valid_size*100, 2)}% Errors: {round(error/valid_size*100, 2)}%")

Correct: 91.0% Errors: 9.0%


In [68]:
bayes_calc("free call get credit")

0.998570043454115

In [69]:
bayes_calc("hey bro how are you? how is your day? Meeting today")

0.009450825829812123