In [276]:
import numpy as np
import pandas as pd
import re
from collections import Counter
import pickle

In [74]:
# Load the labelled corpus from data.csv (no header row is read) and name the two columns.
data = pd.read_csv('data.csv', sep=',', header=None)
data.columns = ['message', 'labels']
print(f'dataset shape {data.shape}')
data.head()

dataset shape (1118, 2)


Unnamed: 0,message,labels
0,One of a kind Money maker Try it for free Fro...,0
1,link to my webcam you wanted Wanna see sexuall...,0
2,Re How to manage multiple Internet connection...,1
3,[SPAM] Give her hour rodeoEnhance your desi...,0
4,Best Price on the netf f m suddenlysusan Sto...,0


In [76]:
# Spam samples: rows whose label is 0
print(f'shape of spam class {data.loc[data.labels == 0].shape}')
data.loc[data.labels == 0].head()

shape of spam class (380, 2)


Unnamed: 0,message,labels
0,One of a kind Money maker Try it for free Fro...,0
1,link to my webcam you wanted Wanna see sexuall...,0
3,[SPAM] Give her hour rodeoEnhance your desi...,0
4,Best Price on the netf f m suddenlysusan Sto...,0
11,Enter now hibody off having N...,0


In [77]:
# Ham samples: rows whose label is 1
print(f'shape of ham class {data.loc[data.labels == 1].shape}')
data.loc[data.labels == 1].head()

shape of ham class (738, 2)


Unnamed: 0,message,labels
2,Re How to manage multiple Internet connection...,1
5,linux ie mailing list memberships reminderThis...,1
6,Re Apple Sauced againAt AM on ...,1
7,Re results for giant mass check phew I never...,1
8,Re RPM s post postun etcHave you tried reb...,1


In [61]:
def split_dataset(data,n_test=0.2):
    """
    Split dataset into train and test parts, taking the same number of test rows from each class.

    The first rows of each class (in their current order) go to the test part and the
    remaining rows of that class go to the train part; no shuffling is performed.
    Rows whose label is neither 0 nor 1 end up in neither part.

    Arguments:
    data -- dataset (format pd.DataFrame, column 'message' contains documents, column 'labels' - labels of documents)
    n_test -- split parameter, fraction (0..1) of the whole dataset used for the test part
    Returns:
    train -- train dataset
    test -- test dataset
    Raises:
    ValueError -- if n_test is not between 0 and 1
    """
    if not 0 <= n_test <= 1:
        raise ValueError(f'n_test must be a fraction between 0 and 1, got {n_test}')

    # Fix: honour n_test (the fraction used to be hard-coded to 0.2).
    len_test = int(data.shape[0] * n_test)
    # Half of the test rows come from each class.
    part = len_test // 2

    class_1 = data[data.labels == 1]
    class_0 = data[data.labels == 0]

    test = pd.concat((class_1[:part], class_0[:part]))
    train = pd.concat((class_1[part:], class_0[part:]))
    return train, test

In [72]:
# Hold out 20% of the rows for testing; the rest is used for fitting.
train_data, test_data = split_dataset(data, n_test=0.2)
print(f'train shape {train_data.shape}, test_shape {test_data.shape}')

train shape (896, 2), test_shape (222, 2)


In [232]:
def split_string(s):
    """
    Split a string into lowercase words made of at least three ASCII letters.

    The string is lowercased first, then every maximal run of three or more
    letters a-z is returned as a word; everything else acts as a separator.

    Argument:
    s -- string
    Return:
    words -- list of words
    """
    # Fix: inside a character class a comma is a literal character, so the old
    # pattern '[A-Z,a-z]{3,}' treated ',' as part of a word ("hello,world" was
    # one token). 'A-Z' is unnecessary because the input is lowercased.
    return re.findall(r'[a-z]{3,}', s.lower())

In [68]:
# Realization of NB model
def NB_fit(docs,labels):
    """
    Naive Bayes fit method.

    Tokenizes every document with split_string and collects the word counts
    needed by NB_predict. Prints a short summary of the collected statistics.

    Arguments:
    docs -- collection of documents (strings)
    labels -- labels of documents (0 or 1), same length as docs
    Returns:
    [p_cl_0, p_cl_1] -- prior probability of classes (share of documents per class)
    [N_0, N_1] -- total number of words in each class
    V -- size of vocabulary (unique words over all documents)
    [freq_0, freq_1] -- word frequencies (Counter) in each class
    """
    docs=np.array(docs)
    labels=np.array(labels)

    freq_all=Counter()
    freq_0=Counter()
    freq_1=Counter()

    for i in range(docs.shape[0]):
        # Tokenize each document once (split_string already lowercases) and
        # feed the counters directly instead of building corpus-sized lists.
        words=split_string(docs[i])
        freq_all.update(words)
        if labels[i]==0:
            freq_0.update(words)
        if labels[i]==1:
            freq_1.update(words)

    n_words=sum(freq_all.values())
    V=len(freq_all)
    print(f'The number of words = {n_words}')
    print(f'The number unique words = {V}')

    # Count the labels once and reuse the result for both priors.
    label_counts=Counter(labels)
    p_cl_0=label_counts[0]/labels.shape[0]
    p_cl_1=label_counts[1]/labels.shape[0]

    N_0=sum(freq_0.values())
    N_1=sum(freq_1.values())

    print(f'prob_class=[{p_cl_0},{p_cl_1}], N_classes=[{N_0},{N_1}], Vocab={V}')

    return [p_cl_0, p_cl_1], [N_0,N_1], V, [freq_0, freq_1 ]
    
def NB_predict(docs,prob_classes, n_classes, V, freq, log=True, test_mode=False):
    """
    Naive Bayes predict method: assign class 0 or 1 to every document.

    Each word of a document contributes the add-one smoothed estimate
    (count of the word in the class + 1) / (words in the class + V); the
    per-class score is the product of these estimates and the class prior
    (or the sum of their logarithms when log is set).

    Arguments:
    docs -- collection of documents
    prob_classes -- prior probability of classes
    n_classes -- number of words in each class
    V -- size of vocabulary (unique words from all classes)
    freq -- word frequencies in each class
    log -- when true, accumulate natural logarithms instead of multiplying probabilities
    test_mode -- when true, print per-word probabilities and final scores

    Return:
    predict -- list of predicted classes (0 or 1), one per document
    """
    documents = np.array(docs)
    prior_0, prior_1 = prob_classes[0], prob_classes[1]
    total_0, total_1 = n_classes[0], n_classes[1]
    counts_0, counts_1 = freq[0], freq[1]

    predict = []
    n_docs = documents.shape[0]
    for k in range(n_docs):
        tokens = split_string(documents[k].lower())

        # Neutral starting value: 0 for a sum of logs, 1 for a product.
        score_0 = score_1 = 0 if log else 1

        for token in tokens:
            cond_0 = (counts_0.get(token, 0) + 1) / (total_0 + V)
            cond_1 = (counts_1.get(token, 0) + 1) / (total_1 + V)
            if test_mode:
                print(f'p_wc_0={cond_0}, p_wc_1={cond_1}')
            if log:
                score_0 += np.log(cond_0)
                score_1 += np.log(cond_1)
            else:
                score_0 *= cond_0
                score_1 *= cond_1

        # Fold in the class priors.
        if log:
            score_0 = score_0 + np.log(prior_0)
            score_1 = score_1 + np.log(prior_1)
        else:
            score_0 = score_0 * prior_0
            score_1 = score_1 * prior_1

        if test_mode:
            print(f'P_0={score_0}, P_1={score_1}')

        # Ties go to class 1.
        predict.append(0 if score_0 > score_1 else 1)

    return predict

In [4]:
def accuracy(labels,predict):
    """
    Share of predictions that match the real labels, in percent.

    Arguments:
    labels -- real labels of data
    predict -- predicted labels of data
    Return:
    accuracy in percent (0..100)
    """
    expected = np.array(labels)
    predicted = np.array(predict)
    n_correct = np.sum(expected == predicted)
    return n_correct / expected.shape[0] * 100

In [32]:
# Sanity check: fit on the whole dataset and predict the same documents.
prob_classes, n_classes, V, freq = NB_fit(data.message, data.labels)
predictions = NB_predict(data.message, prob_classes, n_classes, V, freq, log=True)
accuracy(data.labels, predictions)

The number of words = 276021
The number unic words = 32449
prob_class=[0.33989266547406083,0.6601073345259392], N_classes=[107762,168259], Vocab=32449


99.19499105545617

In [69]:
# Fit on the train part only, then measure accuracy on the held-out test part.
prob_classes, n_classes, V, freq = NB_fit(train_data.message, train_data.labels)
predictions = NB_predict(test_data.message, prob_classes, n_classes, V, freq, log=True)
accuracy(test_data.labels, predictions)

The number of words = 226343
The number unic words = 28958
prob_class=[0.3002232142857143,0.6997767857142857], N_classes=[80126,146217], Vocab=28958


88.28828828828829

I think this is a good enough result for Naive Bayes.

## Example

In [16]:
# Toy data from the presentation; the first three documents are class 0 (c), the last one is class 1 (j).
train = [
    'Chinese Beijing Chinese',
    'Chinese Chinese Shanghai',
    'Chinese Macao',
    'Tokyo Japan Chinese',
]
labels = [0, 0, 0, 1]
test = ['Chinese Chinese Chinese Tokyo Japan']

In [20]:
# Fit on the toy corpus and classify the toy test document with plain products (no logs),
# printing the intermediate probabilities.
prob_classes, n_classes, V, freq = NB_fit(train, labels)
predictions = NB_predict(test, prob_classes, n_classes, V, freq, log=False, test_mode=True)
print(f'predic class: {predictions}')

The number of words = 11
The number unic words = 6
prob_class=[0.75,0.25], N_classes=[8,3], Vocab=6
p_wc_0=0.42857142857142855, p_wc_1=0.2222222222222222
p_wc_0=0.42857142857142855, p_wc_1=0.2222222222222222
p_wc_0=0.42857142857142855, p_wc_1=0.2222222222222222
p_wc_0=0.07142857142857142, p_wc_1=0.2222222222222222
p_wc_0=0.07142857142857142, p_wc_1=0.2222222222222222
P_0=0.00030121377997263036, P_1=0.00013548070246744226
predic class: [0]


## TF/IDF algorithm

In [324]:
# Realization model NB + TF-IDF
def NB_fit_tf(data_frame):

    """
    Naive Bayes + TF-IDF fit method.

    Side effect: adds (or overwrites) a 'words' column on data_frame holding
    the tokenized message of every row.

    Arguments:
    data_frame -- dataset (format pd.DataFrame, column 'message' contains documents, column 'labels' - labels of documents)
    Returns:
    [p_cl_0, p_cl_1] -- prior probability of classes (share of documents per class)
    [N_0, N_1] -- total number of words in each class
    V -- size of vocabulary (unique words from all classes)
    [freq_0, freq_1] -- word frequencies (Counter) in each class
    [idf_dic_0,idf_dic_1] -- for each class, a dict mapping a word to the number
                             of documents of that class which contain the word
    """

    data_frame['words']=data_frame.message.apply(split_string)

    vocabulary=set()
    freq_0=Counter()
    freq_1=Counter()
    doc_freq_0=Counter()
    doc_freq_1=Counter()
    cl_0_n=0
    cl_1_n=0

    # Single pass over the documents. The previous version re-filtered the
    # frame and scanned a word list for every (word, document) pair; counting
    # each document's distinct words once gives the same document frequencies.
    for words, label in zip(data_frame['words'], data_frame['labels']):
        vocabulary.update(words)
        if label==0:
            cl_0_n+=1
            freq_0.update(words)
            doc_freq_0.update(set(words))
        elif label==1:
            cl_1_n+=1
            freq_1.update(words)
            doc_freq_1.update(set(words))

    # Plain dicts keyed in the same order as the class word counters.
    idf_dic_0={w: doc_freq_0[w] for w in freq_0}
    idf_dic_1={w: doc_freq_1[w] for w in freq_1}

    p_cl_0=cl_0_n/data_frame.shape[0]
    p_cl_1=cl_1_n/data_frame.shape[0]
    V=len(vocabulary)
    N_0=sum(freq_0.values())
    N_1=sum(freq_1.values())

    return [p_cl_0, p_cl_1], [N_0,N_1], V, [freq_0, freq_1 ], [idf_dic_0,idf_dic_1]
    
def NB_predict_tf(docs,prob_classes, n_classes, V, freq,idf_dic, log=True, test_mode=False):
    """
    Naive Bayes + TF-IDF predict method: assign class 0 or 1 to every document.

    Every word occurrence contributes the weight
    (count of the word in the document / words in the document)
    * ln(class size / number of class documents containing the word);
    a word missing from idf_dic uses (class size - 0.00001) as the document count.
    The per-class score is the product of the weights and the class prior
    (or the sum of their logarithms when log is set).

    Arguments:
    docs -- collection of documents
    prob_classes -- prior probability of classes
    n_classes -- size of each class, used as the numerator of the IDF ratio
    V -- size of vocabulary (not used in the scoring)
    freq -- word frequencies in each class (not used in the scoring)
    idf_dic -- per-class dicts mapping a word to its document count
    log -- when true, accumulate natural logarithms instead of multiplying weights
    test_mode -- when true, print the final scores of every document

    Return:
    predict -- list of predicted classes (0 or 1), one per document
    """
    documents = np.array(docs)
    prior_0, prior_1 = prob_classes[0], prob_classes[1]
    size_0, size_1 = n_classes[0], n_classes[1]

    # Unpacked as before; the TF-IDF scoring below does not read them.
    freq_0, freq_1 = freq[0], freq[1]

    doc_count_0, doc_count_1 = idf_dic[0], idf_dic[1]

    predict = []

    for document in documents:
        tokens = split_string(document)
        tf_counts = Counter(tokens)

        # Neutral starting value: 0 for a sum of logs, 1 for a product.
        score_0 = score_1 = 0 if log else 1

        for token in tokens:
            weight_0 = (tf_counts[token]/len(tokens))*np.log(size_0/doc_count_0.get(token,(size_0-0.00001)))
            weight_1 = (tf_counts[token]/len(tokens))*np.log(size_1/doc_count_1.get(token,(size_1-0.00001)))
            if log:
                score_0 += np.log(weight_0)
                score_1 += np.log(weight_1)
            else:
                score_0 *= weight_0
                score_1 *= weight_1

        # Fold in the class priors.
        if log:
            score_0 = score_0 + np.log(prior_0)
            score_1 = score_1 + np.log(prior_1)
        else:
            score_0 = score_0 * prior_0
            score_1 = score_1 * prior_1

        if test_mode:
            print(f'P_0={score_0}, P_1={score_1}')

        # Ties go to class 1.
        predict.append(0 if score_0 > score_1 else 1)

    return predict

In [274]:
%time
#Operation takes about 3 hours
prob_classes, n_classes, V, freq, idf_dic=NB_fit_tf(train_data)

Wall time: 0 ns


In [278]:
# Persist the per-class document-frequency dictionaries so the slow fit need not be repeated.
with open('idf_dic.pickle', 'wb') as out_file:
    pickle.dump(idf_dic, out_file)

In [326]:
# Naive Bayes with TF-IDF algorithm.
# The IDF ratio is computed against the number of train documents per class,
# so those counts are passed in place of the per-class word totals.
docs_0 = train_data.loc[train_data.labels == 0].shape[0]
docs_1 = train_data.loc[train_data.labels == 1].shape[0]

predictions = NB_predict_tf(
    test_data.message, prob_classes, [docs_0, docs_1], V, freq, idf_dic, log=True
)
acc = accuracy(test_data.labels, predictions)
print(f'Accuracy for text classification with TF-IDF = {acc}')

Accuracy for text classification with TF-IDF = 90.54054054054053


TF-IDF weighting increases text classification accuracy on this test set (90.5% versus 88.3% for plain Naive Bayes).