In [1]:
import json
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as snb

from collections import defaultdict
from sklearn.model_selection import train_test_split 
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("datasets.csv")
columns = df.columns 
print(columns)
df['prsf'] = df['prefix'] + df['suffix']
df.head()

Index(['word_count', 'word', 'prefix', 'first_prefix', 'second_prefix',
       'suffix', 'second_suffix', 'first_suffix', 'adjective_specificity'],
      dtype='object')


Unnamed: 0,word_count,word,prefix,first_prefix,second_prefix,suffix,second_suffix,first_suffix,adjective_specificity,prsf
0,2,በላ,በላ,በ,ላ,በላ,በ,ላ,1,በላበላ
1,3,በላች,በላ,በ,ላ,ላች,ላ,ች,0,በላላች
2,4,ይበላል,ይበ,ይ,በ,ላል,ላ,ል,1,ይበላል
3,5,ትበላለች,ትበ,ት,በ,ለች,ለ,ች,0,ትበለች
4,2,ብላ,ብላ,ብ,ላ,ብላ,ብ,ላ,1,ብላብላ


In [17]:
class NaiveBayesClassifer:
    def __init__(self, words, labels):
        self.letter_frequency = defaultdict(float)
        self.nm = 0
        self.nf = 0

        for word, label in zip(words, labels):
            for letter in word:
                self.letter_frequency[(letter, label)] += 1.0
                self.nm += 1 if label == 1 else 0
                self.nf += 1 if label == 0 else 0
        
        self.v = len(set([letter for letter,label in self.letter_frequency.keys() if self.letter_frequency[(letter,label)] > 0]))
        self.f_class, self.m_class = labels.value_counts()
        self.logpior = np.log(self.m_class) - np.log(self.f_class)
        self.loglikelihood = defaultdict(float)
    
    def fit(self, X, y):
        for word in X:
            loglikelihood = 0
            for letter in word:
                freq_m = self.letter_frequency[(letter, 1)] 
                freq_f = self.letter_frequency[(letter, 0)]

                pw_m = (freq_m + 1) / (self.nm + self.v)
                pw_f = (freq_f + 1) / (self.nf + self.v)

                loglikelihood += np.log(pw_m) - np.log(pw_f)
            
            self.loglikelihood[word] = loglikelihood
    
    def predict(self, X):
        y_pred = []
        for word in X:
            pred = self.logpior + self.loglikelihood[word]
            y_pred.append(1 if pred > 0 else 0)
        
        return y_pred  

X, y = df['prsf'], df['adjective_specificity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)      
nbc = NaiveBayesClassifer(X_train, y_train)
nbc.fit(X_train, y_train)
y_pred = nbc.predict(X_test)
print('Accuracy: ', np.mean(np.array(y_pred) == y_test))

Accuracy:  0.6326530612244898


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ff44dff4-4ba1-40f3-adfa-172f20dbf513' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>