In [54]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wnm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [55]:
df = pd.read_csv('./spam.csv')
df.sample(5)

Unnamed: 0,spam,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
2957,spam,U have a secret admirer. REVEAL who thinks U R...,,,
4346,spam,U 447801259231 have a secret admirer who is lo...,,,
3163,ham,TODAY is Sorry day.! If ever i was angry with ...,,,
81,ham,K. Did you call me just now ah?,,,
4388,ham,"K I'm ready, &lt;#&gt; ?",,,


In [56]:
df['spam'] = np.where(df['spam'] == 'spam', 1, 0)

In [57]:
df.sample(3)

Unnamed: 0,spam,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
3486,0,Change windows logoff sound..,,,
1442,0,"Its ok., i just askd did u knw tht no?",,,
1612,1,RT-KIng Pro Video Club>> Need help? info@ringt...,,,


In [58]:
df = df[['spam', 'message']]

In [59]:
df

Unnamed: 0,spam,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will М_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [60]:
def clean_text(text: str):
    text = text.lower()
    # Удаление спец символов
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Удаление множественных пробелов
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('\n', '')
    # Фильтрация стоп слов
    stop_words = set(stopwords.words('english'))
    filtered_sentence = [word for word in text.split(" ") if not word in stop_words]
    return ' '.join(filtered_sentence)
    #return text

In [61]:
df['message'][1]

'Ok lar... Joking wif u oni...'

In [62]:
clean_text(df['message'][1])

'ok lar joking wif u oni'

In [63]:
df['message'] = df['message'].apply(clean_text)

In [64]:
df.sample(3)

Unnamed: 0,spam,message
3499,1,dorothykiefercom bank granite issues strongbuy...
3905,0,went fast asleep deartake care
5356,0,tell something thats okay


In [65]:
counts = df['spam'].value_counts()
spam_perc = round((counts[1] / df.shape[0]) * 100, 2)
print(f'Spam letters: {spam_perc}%, Normal letters {abs(100 - spam_perc)}%')

Spam letters: 13.41%, Normal letters 86.59%


In [66]:
counts

spam
0    4825
1     747
Name: count, dtype: int64

#### Делаю кол-во спам писем == не спаму (747 спам писем)

In [67]:
non_spam_sampled = df[df['spam'] == 0].sample(n=747, replace=False)
balanced_df = pd.concat([non_spam_sampled, df[df['spam'] == 1]])
balanced_df = balanced_df.sample(frac=1).reset_index(drop=True)
print(balanced_df['spam'].value_counts())

spam
0    747
1    747
Name: count, dtype: int64


In [68]:
#df = df.sample(frac=1) # Несбалансированная база
df = balanced_df.sample(frac=1) # Сбалансированная база спам == не спам
valid_size = 300 # Размер валидационной выборки
x_train = df[0:-valid_size]
x_valid = df[-valid_size:df.shape[0]]

In [88]:
spam_prob: dict = {} 
non_spam_prob: dict = {}
# Словари [слово: 
#    { 
#      cnt: кол-во встречаемочти, в своём класса,  
#      val: (кол-во встречаемости)/(кол-во встречаемости во *всех типах*)
#     }]
total_word_count = 0

def calc_word_probs(dtfr):
    spam: pd.DataFrame = dtfr[dtfr['spam'] == 1]
    non_spam: pd.DataFrame = dtfr[dtfr['spam'] == 0]
    spam_word_cnt = 0
    # Насчитываю встречаемость в спаме
    for _, row in spam.iterrows():
        for word in row['message'].split(' '):
            if(word == ''): continue
            spam_word_cnt += 1
            if word in spam_prob:
                spam_prob[word]['cnt'] += 1
            else:
                spam_prob[word] = {'cnt': 1, 'val': 0}
    # Насчитываю встречаемость не в спаме + общее кол-во слов
    non_spam_word_cnt = 0
    for _, row in non_spam.iterrows():
        for word in row['message'].split(' '):
            if(word == ''): continue
            non_spam_word_cnt += 1
            if word in non_spam_prob:
                non_spam_prob[word]['cnt'] += 1
            else:
                non_spam_prob[word] = {'cnt': 1, 'val': 0}

    total_word_count = non_spam_word_cnt + spam_word_cnt
    # Вероятность слова = (кол-во встречаемости)/(кол-во встречаемости в спаме + не в спаме)
    # Пробовал делить только на спам, ничего сильно не менялось
    for key in spam_prob.keys():
        spam_prob[key]["val"] = spam_prob[key]['cnt'] / total_word_count
    for key in non_spam_prob.keys():
        non_spam_prob[key]["val"] = non_spam_prob[key]['cnt'] / total_word_count


calc_word_probs(x_train)

In [89]:
def print_top_10(table: dict):
    item_list = [(word, data) for word, data in table.items()]
    sorted_spam_prob = sorted(item_list, key=lambda x: x[1]['val'], reverse=True)
    top_10_spam_words = sorted_spam_prob[:10]

    for word, data in top_10_spam_words:
        print(f"Word: '{word}' Count: {data['cnt']}, Value: {data['val']}")

In [90]:
print_top_10(spam_prob)

Word: 'call' Count: 279, Value: 0.027947510768306122
Word: 'free' Count: 169, Value: 0.016928778924171092
Word: '2' Count: 136, Value: 0.013623159370930582
Word: 'u' Count: 123, Value: 0.012320945607532805
Word: 'txt' Count: 120, Value: 0.012020434739056396
Word: 'ur' Count: 108, Value: 0.010818391265150756
Word: 'stop' Count: 100, Value: 0.010017028949213663
Word: 'text' Count: 96, Value: 0.009616347791245117
Word: '4' Count: 95, Value: 0.00951617750175298
Word: 'mobile' Count: 89, Value: 0.00891515576480016


In [91]:
print_top_10(non_spam_prob)

Word: 'u' Count: 112, Value: 0.011219072423119304
Word: 'im' Count: 57, Value: 0.005709706501051788
Word: 'dont' Count: 39, Value: 0.003906641290193328
Word: '2' Count: 37, Value: 0.0037063007112090552
Word: 'ltgt' Count: 36, Value: 0.0036061304217169187
Word: 'get' Count: 36, Value: 0.0036061304217169187
Word: 'good' Count: 34, Value: 0.0034057898427326456
Word: 'ur' Count: 34, Value: 0.0034057898427326456
Word: 'go' Count: 34, Value: 0.0034057898427326456
Word: 'like' Count: 32, Value: 0.003205449263748372


#### Вычисляю вероятность спама для 1 слова

In [92]:
test_word = "free"
P_SPAM = spam_prob[test_word]['val']
P_NOT_SPAM = (1 - spam_prob[test_word]['val'])

P_S = P_SPAM/(P_NOT_SPAM + P_SPAM)
P_S

0.016928778924171092

#### Вычисление вероятности спамовости предложения по формуле из лекции

In [93]:
def bayes_calc(sentence: str):
    clean_sent = clean_text(sentence)
    num = 1 # произведение вероятностей спама слова F_i
    inv_prob = 1 # произведение обратных вероятностей слова F_i
    for word in clean_sent.split(' '):
        if word not in spam_prob: continue # Если слова нет в словаре - пропускаю
        spam_probability = spam_prob[word]['val']

        num *= spam_probability
        inv_prob *= (1 - spam_probability)
    
    return num / (num + inv_prob) 

In [94]:
bayes_calc("Good day everyone, the meeting is going to be today at 5 am") 

7.503108434743253e-20

In [95]:
bayes_calc("get free money get credit from out company")

2.795463454764443e-17

In [96]:
threshold = 0.6
def is_spam(sent):
    return 1 if bayes_calc(sent) > threshold else 0

In [97]:
is_spam("get free money get credit from out company")

0

In [98]:
is_spam("Good day everyone, the meeting is going to be today at 5 am")

0

In [99]:
is_spam("call free to ur txt") # Топ спам слова

0

In [100]:
right = 0 
error = 0
valid_size = len(x_valid)
for _, row in x_valid.iterrows():
    y_pred = 0#is_spam(row['message'])
    y_true = row['spam'] 
    right += y_true == y_pred
    error += y_true != y_pred

In [101]:
right

152

In [102]:
error

148

In [103]:
valid_size

300

#### Наверно надо выбрать ещё нормальную метрику отклонений для бинарный классификации

In [104]:
print(f"Correct: {round(right/valid_size*100, 2)}% Errors: {round(error/valid_size*100, 2)}%")

Correct: 50.67% Errors: 49.33%


In [105]:
bayes_calc("free call get credit")

1.5756126093450454e-09

In [106]:
bayes_calc("hey bro how are you? how is your day? Meeting today")

7.05630143684069e-10