# Naive Bayes Classifier from scratch 

In [1]:
from math import exp, pi, sqrt
from sklearn.model_selection import train_test_split
from scipy.stats import norm
from math import log
import numpy as np
import pandas as pd

In [2]:
raw_data = pd.read_csv('Development Index.csv')
raw_data

Unnamed: 0,Population,Area (sq. mi.),Pop. Density,GDP ($ per capita),Literacy (%),Infant mortality,Development Index
0,9944201,1284000,7.7,1200,47.5,93.82,2
1,5450661,43094,126.5,31100,100.0,4.56,4
2,26783383,437072,61.3,1500,40.4,50.25,2
3,9439,102,92.5,3400,97.0,7.35,4
4,3431932,176220,19.5,12800,98.0,11.95,3
...,...,...,...,...,...,...,...
220,74777981,1127127,66.3,700,42.7,95.32,2
221,474413,2586,183.5,55100,100.0,4.81,4
222,1065842,5128,207.9,9500,98.6,24.31,3
223,3042004,111370,27.3,1000,57.5,128.87,1


In [3]:
raw_data.describe()

Unnamed: 0,Population,Area (sq. mi.),Pop. Density,GDP ($ per capita),Literacy (%),Infant mortality,Development Index
count,225.0,225.0,225.0,225.0,225.0,225.0,225.0
mean,28983600.0,602336.3,380.545778,9729.333333,83.987556,35.261956,2.968889
std,118387900.0,1797679.0,1667.386671,10053.936329,19.455371,35.453113,0.883333
min,7026.0,2.0,0.0,500.0,17.6,0.0,1.0
25%,439117.0,4167.0,29.3,1900.0,76.2,7.87,2.0
50%,5042920.0,86600.0,78.8,5600.0,93.0,20.97,3.0
75%,17654840.0,446550.0,188.5,15700.0,98.5,55.51,4.0
max,1313974000.0,17075200.0,16271.5,55100.0,100.0,191.19,4.0


## Drop unnecessary column

In [4]:
df = raw_data.drop(columns=['Pop. Density '])
df

Unnamed: 0,Population,Area (sq. mi.),GDP ($ per capita),Literacy (%),Infant mortality,Development Index
0,9944201,1284000,1200,47.5,93.82,2
1,5450661,43094,31100,100.0,4.56,4
2,26783383,437072,1500,40.4,50.25,2
3,9439,102,3400,97.0,7.35,4
4,3431932,176220,12800,98.0,11.95,3
...,...,...,...,...,...,...
220,74777981,1127127,700,42.7,95.32,2
221,474413,2586,55100,100.0,4.81,4
222,1065842,5128,9500,98.6,24.31,3
223,3042004,111370,1000,57.5,128.87,1


In [5]:
df = df.astype('float32')

In [6]:
df.dtypes

Population            float32
Area (sq. mi.)        float32
GDP ($ per capita)    float32
Literacy (%)          float32
Infant mortality      float32
Development Index     float32
dtype: object

## Sort the data into classes

In [7]:
y_true = df.iloc[:, -1:]
y_true

Unnamed: 0,Development Index
0,2.0
1,4.0
2,2.0
3,4.0
4,3.0
...,...
220,2.0
221,4.0
222,3.0
223,1.0


In [8]:
y_true.value_counts()

Development Index
3.0                  89
4.0                  71
2.0                  52
1.0                  13
dtype: int64

## Split the training and test sets

In [9]:
df.iloc[:, :-1], df.iloc[:, -1]

(     Population  Area (sq. mi.)  GDP ($ per capita)  Literacy (%)  \
 0     9944201.0       1284000.0              1200.0     47.500000   
 1     5450661.0         43094.0             31100.0    100.000000   
 2    26783384.0        437072.0              1500.0     40.400002   
 3        9439.0           102.0              3400.0     97.000000   
 4     3431932.0        176220.0             12800.0     98.000000   
 ..          ...             ...                 ...           ...   
 220  74777984.0       1127127.0               700.0     42.700001   
 221    474413.0          2586.0             55100.0    100.000000   
 222   1065842.0          5128.0              9500.0     98.599998   
 223   3042004.0        111370.0              1000.0     57.500000   
 224  22409572.0        239460.0              2200.0     74.800003   
 
      Infant mortality   
 0            93.820000  
 1             4.560000  
 2            50.250000  
 3             7.350000  
 4            11.950000  
 .

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:], df.iloc[:, -1], test_size=0.2, random_state=0)

In [50]:
X_train

Unnamed: 0,Population,Area (sq. mi.),GDP ($ per capita),Literacy (%),Infant mortality,Development Index
212,439117.0,163270.0,4000.0,93.000000,23.570000,3.0
33,221736.0,960.0,11400.0,96.699997,10.030000,3.0
170,33987.0,160.0,25000.0,100.000000,4.700000,4.0
83,10235455.0,78866.0,15700.0,99.900002,3.930000,4.0
201,452776.0,1780.0,8000.0,90.000000,8.600000,4.0
...,...,...,...,...,...,...
67,188078224.0,8511965.0,7600.0,86.400002,29.610001,3.0
192,27307134.0,447400.0,1700.0,99.300003,71.099998,2.0
117,13287.0,21.0,5000.0,99.000000,9.950000,4.0
47,127463608.0,377835.0,28200.0,99.000000,3.260000,4.0


In [13]:
Xy1 = X_train.loc[(X_train["Development Index"] == 1)]
Xy2 = X_train.loc[(X_train["Development Index"] == 2)]
Xy3 = X_train.loc[(X_train["Development Index"] == 3)]
Xy4 = X_train.loc[(X_train["Development Index"] == 4)]
Xy1.shape, Xy2.shape, Xy3.shape, Xy4.shape

((9, 6), (42, 6), (68, 6), (61, 6))

## Calculate Priors

In [14]:
def fit_distribution(data):
    mean = data.mean()
    sigma = data.std()
    print("Mean:", mean, "Sigma:", sigma)
    dist = norm(mean, sigma)
    return dist

In [15]:
priory1 = len(Xy1) / len(X_train)
priory2 = len(Xy2) / len(X_train)
priory3 = len(Xy3) / len(X_train)
priory4 = len(Xy4) / len(X_train)
priory1, priory2, priory3, priory4

(0.05, 0.23333333333333334, 0.37777777777777777, 0.3388888888888889)

In [17]:
# Create PDF HDI == 1
X1y1 = fit_distribution(Xy1.iloc[:, 0])
X2y1 = fit_distribution(Xy1.iloc[:, 1])
X3y1 = fit_distribution(Xy1.iloc[:, 2])
X4y1 = fit_distribution(Xy1.iloc[:, 3])
X5y1 = fit_distribution(Xy1.iloc[:, 4])
print()

# Create PDF HDI == 2
X1y2 = fit_distribution(Xy2.iloc[:, 0])
X2y2 = fit_distribution(Xy2.iloc[:, 1])
X3y2 = fit_distribution(Xy2.iloc[:, 2])
X4y2 = fit_distribution(Xy2.iloc[:, 3])
X5y2 = fit_distribution(Xy2.iloc[:, 4])
print()

# Create PDF HDI == 3
X1y3 = fit_distribution(Xy3.iloc[:, 0])
X2y3 = fit_distribution(Xy3.iloc[:, 1])
X3y3 = fit_distribution(Xy3.iloc[:, 2])
X4y3 = fit_distribution(Xy3.iloc[:, 3])
X5y3 = fit_distribution(Xy3.iloc[:, 4])
print()

# Create PDF HDI == 4
X1y4 = fit_distribution(Xy4.iloc[:, 0])
X2y4 = fit_distribution(Xy4.iloc[:, 1])
X3y4 = fit_distribution(Xy4.iloc[:, 2])
X4y4 = fit_distribution(Xy4.iloc[:, 3])
X5y4 = fit_distribution(Xy4.iloc[:, 4])
print()


Mean: 9840249.0 Sigma: 9304303.0
Mean: 462174.12 Sigma: 511906.9
Mean: 900.0 Sigma: 452.76926
Mean: 43.922222 Sigma: 16.077219
Mean: 131.08667 Sigma: 29.922348

Mean: 44148460.0 Sigma: 168546820.0
Mean: 547294.4 Sigma: 680784.25
Mean: 2197.6191 Sigma: 1830.4321
Mean: 66.38094 Sigma: 21.008015
Mean: 71.46644 Sigma: 14.415833

Mean: 38962436.0 Sigma: 162323230.0
Mean: 609764.6 Sigma: 1618710.6
Mean: 6247.0586 Sigma: 4383.295
Mean: 89.27795 Sigma: 11.173808
Mean: 24.581175 Sigma: 9.597111

Mean: 16897094.0 Sigma: 43395668.0
Mean: 534781.44 Sigma: 1983192.5
Mean: 20572.13 Sigma: 10259.372
Mean: 96.54918 Sigma: 6.8243833
Mean: 5.986886 Sigma: 2.2562215



In [18]:
def probability(X, prior, dists):
    prob = prior
    idx = 0
    for dist in dists:
        tmp = dist.pdf(X[idx])
        if tmp <= 0:
            tmp = 0.1
        res = log(tmp)
        
        prob = prob + res
        idx = idx + 1
    
    return prob

In [19]:
print(probability([9944201, 1284000, 1200, 47.5, 93.82], priory1, [X1y1, X2y1, X3y1, X4y1 ,X5y1]))
print(probability([9944201, 1284000, 1200, 47.5, 93.82], priory2, [X1y2, X2y2, X3y2, X4y2 ,X5y2]))
print(probability([9944201, 1284000, 1200, 47.5, 93.82], priory3, [X1y3, X2y3, X3y3, X4y3 ,X5y3]))
print(probability([9944201, 1284000, 1200, 47.5, 93.82], priory4, [X1y4, X2y4, X3y4, X4y4 ,X5y4]))

-48.33656133445865
-52.32134841027359
-84.25997819141132
-76.57784935623623


In [20]:
def predict(data):
    category_ls = []
    category_ls.append(probability(data, priory1, [X1y1, X2y1, X3y1, X4y1 ,X5y1]))
    category_ls.append(probability(data, priory2, [X1y2, X2y2, X3y2, X4y2 ,X5y2]))
    category_ls.append(probability(data, priory3, [X1y3, X2y3, X3y3, X4y3 ,X5y3]))
    category_ls.append(probability(data, priory4, [X1y4, X2y4, X3y4, X4y4 ,X5y4]))
    
    maximum = np.argmax(category_ls)
    return maximum + 1
    
    

In [21]:
predict([9944201, 1284000, 1200, 47.5, 93.82])

1

In [22]:
row1 = df.iloc[17]
row1

Population            2.828715e+07
Area (sq. mi.)        1.471810e+05
GDP ($ per capita)    1.400000e+03
Literacy (%)          4.520000e+01
Infant mortality      6.698000e+01
Development Index     2.000000e+00
Name: 17, dtype: float32

In [23]:
row = df.iloc[17, :-1]
row.to_numpy()

array([2.8287148e+07, 1.4718100e+05, 1.4000000e+03, 4.5200001e+01,
       6.6980003e+01], dtype=float32)

In [24]:
predict(row)

2

## Prepare to test and evaluate

In [26]:
y_true = X_test.iloc[:, -1]
X_test, y_true

(      Population  Area (sq. mi.)  GDP ($ per capita)  Literacy (%)  \
 111  107449528.0       1972550.0              9000.0     92.199997   
 101  147365344.0        144000.0              1900.0     43.099998   
 197      56361.0       2166086.0             20000.0     99.000000   
 183  142893536.0      17075200.0              8900.0     99.599998   
 75     8192880.0         83870.0             30000.0     98.000000   
 64    10293011.0        207600.0              6100.0     99.599998   
 145   18881360.0        185180.0              3300.0     76.900002   
 137    2602713.0         82880.0             23200.0     77.900002   
 207    7862944.0        112620.0              1100.0     40.900002   
 168    4610820.0        323802.0             37800.0    100.000000   
 143      89703.0           344.0              5000.0     98.000000   
 166    2279723.0         47000.0              1300.0     42.200001   
 107     905949.0         18270.0              5800.0     93.699997   
 8    

In [45]:
res = [predict(x) for x in X_test.iloc[:, :-1].to_numpy()]
res_np = np.array(res)
res_np, len(res_np)

(array([3, 2, 3, 4, 4, 3, 3, 1, 1, 4, 3, 1, 3, 4, 4, 1, 3, 4, 2, 3, 2, 3,
        1, 2, 3, 1, 3, 3, 3, 3, 2, 4, 4, 4, 3, 4, 4, 3, 4, 3, 2, 2, 4, 1,
        4], dtype=int64),
 45)

In [46]:
y_true_np = y_true.to_numpy()
y_true_np = y_true_np.astype('int64')
y_true_np, len(y_true)

(array([3, 2, 3, 3, 4, 3, 3, 3, 2, 4, 3, 1, 3, 4, 4, 1, 3, 3, 2, 3, 2, 3,
        2, 3, 3, 2, 2, 3, 3, 3, 2, 3, 4, 1, 3, 4, 4, 3, 4, 3, 2, 2, 4, 1,
        4], dtype=int64),
 45)

In [48]:
from sklearn.metrics import classification_report
target_names = ['1', '2', '3', '4']

print(classification_report(y_true_np, res_np, target_names=target_names))

              precision    recall  f1-score   support

           1       0.43      0.75      0.55         4
           2       0.86      0.60      0.71        10
           3       0.94      0.76      0.84        21
           4       0.71      1.00      0.83        10

    accuracy                           0.78        45
   macro avg       0.74      0.78      0.73        45
weighted avg       0.83      0.78      0.78        45



In [49]:
correct = 0
incorrect = 0
idx = 0
for item in y_true_np:
    if item == res_np[idx]:
        correct += 1
    else:
        incorrect += 1
    idx += 1

print("Correct:", correct, "Incorrect:", incorrect, "Total:", correct + incorrect)

Correct: 35 Incorrect: 10 Total: 45
