In [1]:
import sys
sys.path.append("..")

In [2]:
from collections import Counter

In [3]:
from ucimlrepo import fetch_ucirepo 

In [4]:
from recipes.naive_bayes_classifier import NaiveBayesClassifier
from recipes.cross_validation import cross_validate

# Sample missing values from distribution

In [5]:
congressional_voting_records = fetch_ucirepo(id=105) 
X = congressional_voting_records.data.features 
y = congressional_voting_records.data.targets 

In [6]:
X

Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,n,y,n,y,y,y,n,n,n,y,,y,y,y,n,y
1,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,
2,,y,y,,y,y,n,n,n,n,y,n,y,y,n,n
3,n,y,y,n,,y,n,n,n,n,y,n,y,n,n,y
4,y,y,y,n,y,y,n,n,n,n,y,,y,y,y,y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,n,n,y,y,y,y,n,n,y,y,n,y,y,y,n,y
431,n,n,y,n,n,n,y,y,y,y,n,n,n,n,n,y
432,n,,n,y,y,y,n,n,n,n,y,y,y,y,n,y
433,n,n,n,y,y,y,,,,,n,y,y,y,n,y


In [7]:
y

Unnamed: 0,Class
0,republican
1,republican
2,democrat
3,democrat
4,democrat
...,...
430,republican
431,democrat
432,republican
433,republican


In [8]:
y = y['Class']
y

0      republican
1      republican
2        democrat
3        democrat
4        democrat
          ...    
430    republican
431      democrat
432    republican
433    republican
434    republican
Name: Class, Length: 435, dtype: object

In [9]:
Counter(y)

Counter({'democrat': 267, 'republican': 168})

In [None]:
for attribute in X.columns:
    null_rows = X[attribute].isnull()
    X.loc[null_rows] = X.dropna().sample(null_rows.sum()).values

In [11]:
X

Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,n,n,y,n,n,y,y,y,y,y,n,n,y,n,n,y
1,y,y,y,n,n,n,y,y,y,y,y,n,n,n,n,y
2,n,n,n,y,y,y,n,n,n,y,n,y,n,y,n,y
3,y,n,y,n,n,n,y,y,y,n,y,n,n,n,y,y
4,y,y,y,n,n,n,y,y,y,n,n,n,n,n,y,y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,n,n,y,y,y,y,n,n,y,y,n,y,y,y,n,y
431,n,n,y,n,n,n,y,y,y,y,n,n,n,n,n,y
432,n,n,n,y,y,y,n,n,n,n,n,y,y,y,n,y
433,y,n,n,y,y,n,y,y,y,n,n,y,y,y,n,y


In [12]:
classifier = NaiveBayesClassifier()

In [13]:
cross_validate(X, y, classifier, 10)

([0.7906976744186046,
  0.7209302325581395,
  0.627906976744186,
  0.627906976744186,
  0.7906976744186046,
  0.627906976744186,
  0.7441860465116279,
  0.7209302325581395,
  0.7441860465116279,
  0.6744186046511628],
 0.7069767441860464)

# Fill missing values with '?'

In [14]:
congressional_voting_records = fetch_ucirepo(id=105) 
X = congressional_voting_records.data.features 
y = congressional_voting_records.data.targets['Class']

In [15]:
X = X.fillna('?')

In [16]:
cross_validate(X, y, classifier, 10)

([0.9534883720930233,
  0.9302325581395349,
  0.9534883720930233,
  0.8604651162790697,
  0.9069767441860465,
  0.8837209302325582,
  0.8837209302325582,
  0.9069767441860465,
  0.813953488372093,
  0.8604651162790697],
 0.8953488372093024)

# Fill missing values with mode

In [17]:
congressional_voting_records = fetch_ucirepo(id=105) 
X = congressional_voting_records.data.features 
y = congressional_voting_records.data.targets['Class']

In [18]:
X = X.fillna(X.mode().iloc[0])

In [19]:
cross_validate(X, y, classifier, 10)

([0.9534883720930233,
  0.9302325581395349,
  0.9534883720930233,
  0.9069767441860465,
  0.9302325581395349,
  0.8837209302325582,
  0.8837209302325582,
  0.9302325581395349,
  0.813953488372093,
  0.8604651162790697],
 0.9046511627906977)