In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
data = pd.read_csv(url, header=None)

columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
           'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
data.columns = columns
print(data.head())



   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   United-States   <=50

In [3]:
data = data.dropna()

data = pd.get_dummies(data, drop_first=True)

X = data.drop('income_ >50K', axis=1)
y = data['income_ >50K']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = GaussianNB()
model.fit(X_train, y_train)

GaussianNB()

In [5]:
y_pred = model.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100 ,"%")

predictions = model.predict(X_test.iloc[:5])
print("Predictions for the first 5 instances:", predictions)

sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)

Accuracy: 79.90173499155534 %
Predictions for the first 5 instances: [0 0 0 0 0]
Sensitivity: 0.32017823042647997
Specificity: 0.9512343180898422


In [6]:
print("Classes:", model.classes_)

Classes: [0 1]


In [7]:
new_data = X_test.iloc[0]  
posterior_probabilities = model.predict_proba([new_data])
probability_over_50k = posterior_probabilities[0][1]
print("Posterior probability of making over 50K a year:", probability_over_50k)

Posterior probability of making over 50K a year: 0.004445545856429435


