# Scikit-learn Gaussian Naive-Bayes HDI implementation

In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [2]:
# List the notebook's working directory to check that the CSV is present.
# `dir` is the Windows shell command, so this cell is not portable; on a POSIX
# shell the equivalent would be `!ls`.
!dir

 Volume in drive D is Windows
 Volume Serial Number is DECA-A6A6

 Directory of D:\CodeProject\naivebayes

11/06/2021  11:09    <DIR>          .
11/06/2021  11:09    <DIR>          ..
11/05/2021  22:51    <DIR>          .ipynb_checkpoints
11/05/2021  22:25             8,500 Development Index.csv
11/06/2021  11:09            52,713 scikit_nb.ipynb
               2 File(s)         61,213 bytes
               3 Dir(s)  726,164,078,592 bytes free


## Load Human Development Index (HDI) data

In [3]:
# Read the HDI dataset from the CSV that sits next to this notebook. The path is
# relative, so the kernel's working directory must be the notebook's folder.
raw_data = pd.read_csv('Development Index.csv')
# Bare last expression: render the frame as the cell's output.
raw_data

Unnamed: 0,Population,Area (sq. mi.),Pop. Density,GDP ($ per capita),Literacy (%),Infant mortality,Development Index
0,9944201,1284000,7.7,1200,47.5,93.82,2
1,5450661,43094,126.5,31100,100.0,4.56,4
2,26783383,437072,61.3,1500,40.4,50.25,2
3,9439,102,92.5,3400,97.0,7.35,4
4,3431932,176220,19.5,12800,98.0,11.95,3
...,...,...,...,...,...,...,...
220,74777981,1127127,66.3,700,42.7,95.32,2
221,474413,2586,183.5,55100,100.0,4.81,4
222,1065842,5128,207.9,9500,98.6,24.31,3
223,3042004,111370,27.3,1000,57.5,128.87,1


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column;
# useful here for seeing how widely the feature scales differ.
raw_data.describe()

Unnamed: 0,Population,Area (sq. mi.),Pop. Density,GDP ($ per capita),Literacy (%),Infant mortality,Development Index
count,225.0,225.0,225.0,225.0,225.0,225.0,225.0
mean,28983600.0,602336.3,380.545778,9729.333333,83.987556,35.261956,2.968889
std,118387900.0,1797679.0,1667.386671,10053.936329,19.455371,35.453113,0.883333
min,7026.0,2.0,0.0,500.0,17.6,0.0,1.0
25%,439117.0,4167.0,29.3,1900.0,76.2,7.87,2.0
50%,5042920.0,86600.0,78.8,5600.0,93.0,20.97,3.0
75%,17654840.0,446550.0,188.5,15700.0,98.5,55.51,4.0
max,1313974000.0,17075200.0,16271.5,55100.0,100.0,191.19,4.0


## Remove redundant columns

In [5]:
# Drop the population-density column into a new frame, leaving `raw_data` untouched.
# The label is spelled with a trailing space — presumably to match the CSV header
# exactly; confirm before "tidying" it.
df = raw_data.drop(labels=['Pop. Density '], axis='columns')
df

Unnamed: 0,Population,Area (sq. mi.),GDP ($ per capita),Literacy (%),Infant mortality,Development Index
0,9944201,1284000,1200,47.5,93.82,2
1,5450661,43094,31100,100.0,4.56,4
2,26783383,437072,1500,40.4,50.25,2
3,9439,102,3400,97.0,7.35,4
4,3431932,176220,12800,98.0,11.95,3
...,...,...,...,...,...,...
220,74777981,1127127,700,42.7,95.32,2
221,474413,2586,55100,100.0,4.81,4
222,1065842,5128,9500,98.6,24.31,3
223,3042004,111370,1000,57.5,128.87,1


## Prepare data for training

In [6]:
# Features: every column except the last one.
X = df.iloc[:, :-1]
# Target: the last column, kept as a one-column DataFrame (note the `-1:` slice).
y = df.iloc[:, -1:]


In [7]:
# Build the feature matrix straight from `df` (every column except the last, which
# holds the target). Converting `X` in place made this cell fail on a second run,
# because by then `X` is already an ndarray and has no `.to_numpy()` method.
X = df.iloc[:, :-1].to_numpy()
X, X.shape

(array([[9.9442010e+06, 1.2840000e+06, 1.2000000e+03, 4.7500000e+01,
         9.3820000e+01],
        [5.4506610e+06, 4.3094000e+04, 3.1100000e+04, 1.0000000e+02,
         4.5600000e+00],
        [2.6783383e+07, 4.3707200e+05, 1.5000000e+03, 4.0400000e+01,
         5.0250000e+01],
        ...,
        [1.0658420e+06, 5.1280000e+03, 9.5000000e+03, 9.8600000e+01,
         2.4310000e+01],
        [3.0420040e+06, 1.1137000e+05, 1.0000000e+03, 5.7500000e+01,
         1.2887000e+02],
        [2.2409572e+07, 2.3946000e+05, 2.2000000e+03, 7.4800000e+01,
         5.1430000e+01]]),
 (225, 5))

In [8]:
# Select the last column as a Series so `.to_numpy()` is already 1-D: no
# `squeeze()` is needed (which would collapse a one-row frame to a 0-d array),
# and the cell can be re-run because it no longer reads its own output.
y = df.iloc[:, -1].to_numpy()
y, y.shape

(array([2, 4, 2, 4, 3, 3, 4, 4, 4, 2, 3, 3, 3, 4, 2, 4, 3, 2, 4, 2, 4, 3,
        4, 1, 1, 3, 4, 1, 2, 3, 2, 4, 2, 3, 3, 2, 3, 3, 4, 4, 1, 4, 3, 4,
        2, 4, 2, 4, 4, 4, 2, 4, 3, 4, 3, 1, 3, 4, 3, 3, 3, 4, 4, 4, 3, 3,
        4, 3, 3, 2, 2, 3, 3, 2, 4, 4, 4, 3, 3, 3, 3, 2, 4, 4, 3, 2, 4, 2,
        3, 3, 2, 4, 1, 3, 3, 4, 3, 2, 4, 3, 4, 2, 1, 4, 4, 4, 3, 3, 3, 3,
        4, 3, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3, 2, 3, 4, 2, 4, 3, 2, 3, 4, 4,
        2, 2, 3, 3, 2, 3, 2, 3, 2, 4, 3, 3, 4, 3, 4, 4, 3, 3, 2, 3, 1, 3,
        3, 3, 1, 2, 2, 3, 3, 2, 2, 4, 2, 2, 1, 3, 4, 4, 4, 3, 2, 4, 2, 4,
        2, 3, 2, 3, 2, 1, 3, 3, 3, 3, 3, 2, 3, 3, 4, 2, 2, 4, 4, 3, 3, 3,
        4, 4, 3, 4, 3, 3, 4, 4, 2, 2, 1, 4, 2, 4, 3, 2, 4, 4, 4, 4, 3, 3,
        2, 4, 3, 1, 2], dtype=int64),
 (225,))

## Split data into training and test sets

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
# NOTE(review): echoing all four arrays floods the cell output; showing only
# their `.shape` would be easier to read.
(X_train, X_test, y_train, y_test)

(array([[4.39117000e+05, 1.63270000e+05, 4.00000000e+03, 9.30000000e+01,
         2.35700000e+01],
        [2.21736000e+05, 9.60000000e+02, 1.14000000e+04, 9.67000000e+01,
         1.00300000e+01],
        [3.39870000e+04, 1.60000000e+02, 2.50000000e+04, 1.00000000e+02,
         4.70000000e+00],
        [1.02354550e+07, 7.88660000e+04, 1.57000000e+04, 9.99000000e+01,
         3.93000000e+00],
        [4.52776000e+05, 1.78000000e+03, 8.00000000e+03, 9.00000000e+01,
         8.60000000e+00],
        [3.32412590e+07, 4.46550000e+05, 4.00000000e+03, 5.17000000e+01,
         4.16200000e+01],
        [3.19131900e+06, 7.82000000e+04, 6.30000000e+03, 9.26000000e+01,
         2.04700000e+01],
        [1.31397371e+09, 9.59696000e+06, 5.00000000e+03, 9.09000000e+01,
         2.41800000e+01],
        [4.00214000e+05, 3.16000000e+02, 1.77000000e+04, 9.28000000e+01,
         3.89000000e+00],
        [1.68458000e+05, 6.16000000e+02, 5.40000000e+03, 6.70000000e+01,
         1.35300000e+01],
        [4

## Apply Gaussian Naive Bayes Classifier

In [10]:
# Fit a Gaussian Naive Bayes model on the training split, then label the held-out rows.
# NOTE(review): the features are on very different scales (Population reaches ~1e9,
# Literacy tops out at 100). GaussianNB's `var_smoothing` adds a fraction of the
# largest feature variance to every variance, which may drown out the small-scale
# features — consider standardising or log-transforming the inputs; confirm by experiment.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (len(X_test), (y_pred != y_test).sum()))

Number of mislabeled points out of a total 45 points : 25


## Classification Metrics

In [12]:
# Alias the held-out labels under the name the metrics cell uses, then show them
# next to the predictions for a quick side-by-side look.
y_true = y_test
y_true, y_pred

(array([3, 2, 3, 3, 4, 3, 3, 3, 2, 4, 3, 1, 3, 4, 4, 1, 3, 3, 2, 3, 2, 3,
        2, 3, 3, 2, 2, 3, 3, 3, 2, 3, 4, 1, 3, 4, 4, 3, 4, 3, 2, 2, 4, 1,
        4], dtype=int64),
 array([3, 2, 4, 4, 4, 1, 1, 4, 1, 4, 1, 1, 1, 4, 4, 1, 1, 1, 2, 3, 2, 1,
        1, 1, 1, 1, 1, 1, 3, 1, 1, 4, 4, 1, 1, 4, 4, 4, 4, 1, 1, 3, 4, 1,
        4], dtype=int64))

In [15]:
# Take the report labels from the fitted model instead of hard-coding them, so the
# rows always line up with the classes the classifier knows — even if a class is
# missing from the test split, where a fixed name list would no longer match.
class_labels = list(gnb.classes_)
target_names = [str(label) for label in class_labels]
print(classification_report(y_true, y_pred, labels=class_labels, target_names=target_names))

              precision    recall  f1-score   support

           1       0.17      1.00      0.30         4
           2       1.00      0.30      0.46        10
           3       0.75      0.14      0.24        21
           4       0.67      1.00      0.80        10

    accuracy                           0.44        45
   macro avg       0.65      0.61      0.45        45
weighted avg       0.74      0.44      0.42        45

