# Bayes Classific Code

In [81]:
import re
import numpy as np
import pandas as pd
from collections import Counter

In [82]:
#data initializing
data = pd.read_csv('data.csv', sep = ',', header = None)
data.columns = ['Text', 'Class']
data.head()

Unnamed: 0,Text,Class
0,One of a kind Money maker Try it for free Fro...,0
1,link to my webcam you wanted Wanna see sexuall...,0
2,Re How to manage multiple Internet connection...,1
3,[SPAM] Give her hour rodeoEnhance your desi...,0
4,Best Price on the netf f m suddenlysusan Sto...,0


## Spliting Data

In [83]:
def split(data, numtest = 0.2):
    #choice test semples 
    ltest = int(data.shape[0] * numtest) # 0 - Class of Spam semples
    half = int(ltest / 2) # half - half of test dataset len
    class1 = data[data.Class == 1].shape[0] #all samples 1st class
    class0 = data[data.Class == 0].shape[0] #all samples 0 class
    #choice test and train sets for 1 and 0 classes
    test_data_1 = data[data.Class == 1][:half]
    train_data_1 = data[data.Class == 1][half:]
    test_data_0 = data[data.Class == 0][:half]
    train_data_0 = data[data.Class == 0][half:]
    #choice test and train sets for all dataset
    test=pd.concat((test_data_1,test_data_0))
    train=pd.concat((train_data_1,train_data_0))
    return train, test

In [84]:
train_data, test_data = split(data, numtest = 0.2)
print(f'train shape {train_data.shape}, test_shape {test_data.shape}')

train shape (896, 2), test_shape (222, 2)


In [85]:
def filtres(string):
    string = string.lower() #lowering data text
    patt = r'[A-Z,a-z]{3,}' #Sort strings 
    words = re.findall(patt, string)
    return words

## Bayes Funcs

In [86]:
def fit(x, Y):
   
    w0 = []
    w1 = []
    w01 = []
    x = np.array(x)
    Y = np.array(Y)
    for i in range (x.shape[0]):
        w01 += filtres(x[i].lower())
        if Y[i] == 0:
            w0 += filtres(x[i].lower())
        if Y[i] == 1:
            w1 += filtres(x[i].lower())  
            
    wfreq = Counter(w01)
    freq0 = Counter(w0)
    freq1 = Counter(w1)
    cl0_n = Counter(Y)[0]
    cl1_n = Counter(Y)[1]
    pcl_0 = cl0_n / Y.shape[0]
    pcl_1 = cl1_n / Y.shape[0]
   
    V = len(wfreq.keys())
    N0 = len(w0)
    N1 = len(w1)
    
    print(f'prob_class = [{pcl_0},{pcl_1}], N_classes = [{N0},{N1}], Vocabulary = {V}')
    
    return [pcl_0, pcl_1], [N0,N1], V, [freq0, freq1]

In [87]:
def predict(x, prob_classes, n_classes, V, freq):
    x = np.array(x)
    pcl_0 = prob_classes[0]
    pcl_1 = prob_classes[1]
    N0 = n_classes[0]
    N1 = n_classes[1]
    
    freq0 = freq[0]
    freq1 = freq[1]
    predict = []
 
    for j in range(x.shape[0]):
        words = filtres(x[j].lower())
        P0 = 1
        P1 = 1
        for i in words: 
            pwc0 = (freq0.get(i,0) +1) / (N0 + V)
            pwc1 = (freq1.get(i,0) + 1) / (N1 + V)
            P0 *= pwc0
            P1 *= pwc1
        P0 = P0 * pcl_0
        P1 = P1 * pcl_1    
        if P0>P1:
            predict.append(0)
        else:
            predict.append(1)
            
    return predict

In [88]:
def accuracy(Y, predict):
    Y = np.array(Y)
    predict = np.array(predict)
    acc = np.sum(Y == predict) / Y.shape[0]
    return acc

In [89]:
prod_classes, n_classes, V, freq = fit(data.Text, data.Class)
predictions = predict(data.Text, prob_classes, n_classes,  V, freq)
print(f'accuracy = {accuracy(data.Class, predictions)*100}%')

prob_class = [0.33989266547406083,0.6601073345259392], N_classes = [107762,168259], Vocabulary = 32449
accuracy = 79.60644007155635%


## Test 

In [90]:
train=['Captain Morgan', 'Captain Jack', 'Captain John', 'Pirate Jack']
Class=[0,0,0,1]
test=['Captain Jack Captain John Pirate Jack']

In [91]:
prod_classes, n_classes, V, freq = fit(train, data.Class)
predictions = predict(test, prob_classes, n_classes,  V, freq)
print(f'accuracy = {accuracy(data.Class, predictions)*100}%')

prob_class = [0.33989266547406083,0.6601073345259392], N_classes = [6,2], Vocabulary = 5
accuracy = 66.01073345259391%


## TF/IDF

In [92]:
def fit_tf(Z):
    Z['words'] = Z.Text.apply(filtres)
    w01 = Z.words.sum()
    w0 = Z[Z.Class == 0].words.sum()
    w1 = Z[Z.Class == 1].words.sum()
    wfreq = Counter(w01)
    freq0 = Counter(w0)
    freq1 = Counter(w1)
    #TF-IDF, frequencies calculation            
    idf_dic0 = {}
    idf_dic1 = {}
    ncl_0 = Z[Z.Class == 0].shape[0]
    ncl_1 = Z[Z.Class == 1].shape[0]
    for w in freq0.keys():
        idf_dic0[w] = np.sum([w in Z[Z.Class == 0].words.iloc[i] for i in range(ncl_0)])
    for w in freq1.keys():
        idf_dic1[w] = np.sum([w in Z[Z.Class == 1].words.iloc[i] for i in range(ncl_1)])
    
    cl0_n = Counter(Z.Class)[0]
    cl1_n = Counter(Z.Class)[1]
    
    pcl_0 = cl0_n / Z.shape[0]
    pcl_1 = cl1_n / Z.shape[0]
    V = len(wfreq.keys())
    N0 = len(w0)
    N1 = len(w1)
    
    return [pcl_0, pcl_1], [N0,N1], V, [freq0, freq1 ], [idf_dic0,idf_dic1]

In [93]:
def predict_tf(x, prob_classes, n_classes, V, freq, idf_dic):  

    x = np.array(x)
    pcl_0 = prob_classes[0]
    pcl_1 = prob_classes[1]
    N0 = n_classes[0]
    N1 = n_classes[1]
    
    freq0 = freq[0]
    freq1 = freq[1]
    
    idf_dic0 = idf_dic[0]
    idf_dic1 = idf_dic[1]
    
    predict = []
    
    for j in docs:
        words = filtres(j)
        words_freq = Counter(words)
        P0=1
        P1=1
        for i in words:
            pwc_0=(words_freq[i] / len(words)) * np.log(N0/idf_dic0.get(i,(N0-0.00001)))
            pwc_1=(words_freq[i] / len(words)) * np.log(N1/idf_dic1.get(i,(N1-0.00001)))
            P0 *= pwc_0
            P1 *= pwc_1
        P0 *= pcl_0
        P1 *= pcl_1
        
        if P0>P1:
            predict.append(0)
        else:
            predict.append(1)
    
    return predict

In [None]:
prob_classes, n_classes, V, freq, idf_dic = fit_tf(train_data)

x0 = train_data[train_data.Class == 0].shape[0]
x1 = train_data[train_data.Class == 1].shape[0]

predictions = predict_tf(test_data.Text, prob_classes, [x0,x1], V, freq, idf_dic)
print(f'Accuracy with TF-IDF = {accuracy(test_data.Class,predictions)*100}%')