In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
%matplotlib inline

In [2]:
class Feature(object):
    
    def process_probability(self):
        pass
    
    def get_probability(self,value):
        pass
    
    def get_conditional_probability(self,value):
        pass

In [3]:
class Discrete_feature(Feature):
    
    def __init__(self, data, label=None):
        self.data = data
        self.label = label
        self.conditional = None
        self.unique_label = None
        self.unique_features = None
        self.unique_probabs = None
        self.probability = None
        
    def set_label(self, label):
        self.label = label    
    
    def _values_and_frequencies(self,feature, unique_features):
        counts = np.zeros_like(unique_features)
        for i, uf in enumerate(unique_features):
            counts[i] = np.sum(feature == uf)
            
        freq = counts/counts.sum()
        return freq
    
    def _multinoulli_distribution(self,feature, ufeat, probabs):
        probabilities = np.zeros(len(feature))
        for uf, prob in zip(ufeat, probabs):
            probabilities[feature == uf] = prob
        return probabilities

        
    def process_probability(self):
        unique_vals = np.unique(self.label)
        
        splits = []
        for unique in unique_vals:
            feature_split = self.data[self.label==unique]
            splits.append(feature_split)
        probabs = {}
        
        unique_features, counts = np.unique(self.data, return_counts=True)
        self.probability = counts/counts.sum()
        
        for unique, labdata in zip(unique_vals, splits):
            prob = self._values_and_frequencies(labdata, unique_features)
            probabs[unique] = prob

        self.unique_label = unique_vals
        self.unique_features = unique_features
        self.unique_probabs = probabs

    
    def get_probability(self,value):
        try: len(value)
        except TypeError: value = np.array([value])
        return self._multinoulli_distribution(value, self.unique_features, self.probability)

    
    def get_conditional_probability(self, value):
        probabs = {}
        try: len(value)
        except TypeError: value = np.array([value])
            
        for uniq in self.unique_label:
            uprob = self.unique_probabs[uniq]
            t = self._multinoulli_distribution(value, self.unique_features, uprob)
            probabs[uniq] = t
        self.conditional = probabs
        return self.conditional
        

In [4]:
class Continuous_feature(Feature):
    def __init__(self, data, label=None):
        self.data = data
        self.label = label
        self.unique_label = None
        self.means_and_vars = None
        self.conditional = None
        self.mean = None
        self.var = None
        
        
    def _mean_and_variance(self,feature):
        return np.mean(feature), np.var(feature)

    def _gaussian_distribution(self,feature, mean, var):
        return 1/np.sqrt(2*np.pi*var)*np.e**-((feature - mean)**2/(2*var))

    def set_label(self, label):
        self.label = label

    def process_probability(self):
        self.mean, self.var = self._mean_and_variance(self.data)
        unique_vals = np.unique(self.label)
        splits = []
        for unique in unique_vals:
            feature_split = self.data[self.label==unique]
            splits.append(feature_split)
        means_and_vars = {}
        for unique, labdata in zip(unique_vals, splits):
            mean, var = self._mean_and_variance(labdata)
            means_and_vars[unique] = (mean, var)

        self.unique_label = unique_vals
        self.means_and_vars = means_and_vars
        
    def get_probability(self, value):
        return self._gaussian_distribution(value, self.mean, self.var)
        
    def get_conditional_probability(self, value):
        probabs = {}
        for unique in self.unique_label:
#             print(unique)
            mean, var = self.means_and_vars[unique]
            t = self._gaussian_distribution(value, mean, var)
            probabs[unique] = t
        self.conditional = probabs
        return self.conditional

In [5]:
class NaiveBayesClassifier(object):
    
    def __init__(self, label=None):
        self.features = []
        self.features_cont = []
        self.featureClass = []
        self.label = label
        self.featureName = []
        self.unique_label = []
        self.probab_label = []
    
    def add_feature(self, feature, is_continuous:bool, featurename=None):
        self.features.append(feature)
        self.features_cont.append(is_continuous)
        if featurename is None:
            featurename = 'feature-{}'.format(len(self.featureName))
        self.featureName.append(featurename)
            
        if is_continuous:
            self.featureClass.append(Continuous_feature(feature, self.label))
        else:
            self.featureClass.append(Discrete_feature(feature, self.label))


    def add_feature_from_dataframe(self, df):
        
        def is_feature_continuous(df):
            if type(df) is not pd.core.frame.DataFrame:
                df = pd.DataFrame(df)
            feature_types = []
            n_unique_values_threshold = 10

            for column in df.columns:
                unique_values = df[column].unique()
                example_value = unique_values[0]

                if (isinstance(example_value, str)) or (len(unique_values) <= n_unique_values_threshold):
                    feature_types.append(False)
                else:
                    feature_types.append(True)

            return feature_types

        columns = df.columns
        conti = is_feature_continuous(df)
        for col, conti, featurename in zip(columns, conti, list(df.columns)):
            self.add_feature(df[col].values, conti, featurename)
        return
        
        
    def set_label(self, label):
        self.label = label
        for fc in self.featureClass:
            fc.set_label(self.label)
        
    def compute_probabilities(self):
        self.unique_label, counts = np.unique(self.label, return_counts=True)
        self.probab_label = counts/counts.sum()
        for fc in self.featureClass:
            fc.process_probability()
    
    def predict(self, values):
#         if type(values) is list :
#             values = np.array(values)
#         if len(values.shape) == 1:
#             values = values.reshape(1,-1)
        
        # Preprocessing values for parallel prediction
        trans = []
        for fc in self.features_cont:
            trans.append([])
        for i, dat in enumerate(values):
            for j, feat in enumerate(dat):
                trans[j].append(feat)
        
        condProb = {}
        for label, pro in zip(self.unique_label, self.probab_label):
            condProb[label] = pro # ths is is prior probability
            
        for featvals, featclass in zip(trans, self.featureClass):
#             print(featvals)
            likelyhood = featclass.get_conditional_probability(featvals)
            evidence = featclass.get_probability(featvals)
#             print(likelyhood , evidence)    
            for key in likelyhood:
                val = likelyhood[key]
#                 print(key, val)
                condProb[key] = condProb[key]* val/evidence
                
        # Normalize conditional probablility
        sums = 0
        for key in condProb:
            sums = sums + condProb[key]
        label = []
        for key in condProb:
            condProb[key] = condProb[key]/sums
        probs = np.array(list(condProb.values()))
#         normalized = probs/probs.sum(axis=1)
#         print(condProb)
        maxes = probs.argmax(axis=0)
        prediction = self.unique_label[maxes]
        
        return prediction, condProb

In [6]:
df = pd.read_csv("./titanic.csv")
df['label'] = df.Survived
df = df.drop(["PassengerId",'Survived','Name', 'Ticket', 'Cabin'], axis=1)

#handling missing balues
median_age = df.Age.median()
mode_embarked = df.Embarked.mode()[0]

df = df.fillna({'Age': median_age, 'Embarked':mode_embarked})

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null object
label       891 non-null int64
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [8]:
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,label
0,3,male,22.0,1,0,7.25,S,0
1,1,female,38.0,1,0,71.2833,C,1
2,3,female,26.0,0,0,7.925,S,1
3,1,female,35.0,1,0,53.1,S,1
4,3,male,35.0,0,0,8.05,S,0


In [9]:
# changing string to categorical value

def scq_to_012(x):
    if x == 'S':
        return 0
    elif x == 'C':
        return 1
    else: return 2

df['Embarked'] = df['Embarked'].apply(scq_to_012)
df['Sex'] = df['Sex'].apply(lambda x: int(x!='male'))
df.head(), df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null int64
label       891 non-null int64
dtypes: float64(2), int64(6)
memory usage: 55.8 KB


(   Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  label
 0       3    0  22.0      1      0   7.2500         0      0
 1       1    1  38.0      1      0  71.2833         1      1
 2       3    1  26.0      0      0   7.9250         0      1
 3       1    1  35.0      1      0  53.1000         0      1
 4       3    0  35.0      0      0   8.0500         0      0, None)

In [10]:
df[df.columns[:-1]].head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,22.0,1,0,7.25,0
1,1,1,38.0,1,0,71.2833,1
2,3,1,26.0,0,0,7.925,0
3,1,1,35.0,1,0,53.1,0
4,3,0,35.0,0,0,8.05,0


In [11]:
list(df.columns)

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'label']

In [12]:
test_data = [[3,0,22.0,1,0,7.2500, 0], [1,1,35.0,1,0,53.1000,0]]
test_data = df[df.columns[:-1]].get_values()[:5].tolist()
nbc = NaiveBayesClassifier()
nbc.set_label(df['label'].values)
nbc.add_feature_from_dataframe(df[df.columns[:-1]])
nbc.compute_probabilities()
nbc.predict(test_data)

(array([0, 1, 0, 1, 0]),
 {0: array([0.93698822, 0.03021437, 0.73812607, 0.12925275, 0.97413386]),
  1: array([0.06301178, 0.96978563, 0.26187393, 0.87074725, 0.02586614])})

In [13]:
df[df.columns[:-1]].get_values()[:5]
# df.get_values()

array([[ 3.    ,  0.    , 22.    ,  1.    ,  0.    ,  7.25  ,  0.    ],
       [ 1.    ,  1.    , 38.    ,  1.    ,  0.    , 71.2833,  1.    ],
       [ 3.    ,  1.    , 26.    ,  0.    ,  0.    ,  7.925 ,  0.    ],
       [ 1.    ,  1.    , 35.    ,  1.    ,  0.    , 53.1   ,  0.    ],
       [ 3.    ,  0.    , 35.    ,  0.    ,  0.    ,  8.05  ,  0.    ]])

In [14]:
for fc, fn, cont in zip(nbc.featureClass, nbc.featureName, nbc.features_cont):
    print(fn,'=',['Discrete','Continuous',][int(cont)])
    if cont:
        print('mean,var',fc.mean, fc.var)
        print('label m&v',fc.means_and_vars)
    else:
        print('uniq feat',fc.unique_features)
        print('uniq prob',fc.probability)
        print('label prob',fc.unique_probabs)
    print()

Pclass = Discrete
uniq feat [1 2 3]
uniq prob [0.24242424 0.20650954 0.55106622]
label prob {0: array([0.14571949, 0.17668488, 0.67759563]), 1: array([0.39766082, 0.25438596, 0.34795322])}

Sex = Discrete
uniq feat [0 1]
uniq prob [0.64758698 0.35241302]
label prob {0: array([0.85245902, 0.14754098]), 1: array([0.31871345, 0.68128655])}

Age = Continuous
mean,var 29.36158249158249 169.32224856193815
label m&v {0: (30.028233151183972, 155.96504988370975), 1: (28.29143274853801, 188.9054309881673)}

SibSp = Discrete
uniq feat [0 1 2 3 4 5 8]
uniq prob [0.68237935 0.2345679  0.03142536 0.01795735 0.02020202 0.00561167
 0.00785634]
label prob {0: array([0.72495446, 0.17668488, 0.0273224 , 0.02185792, 0.0273224 ,
       0.00910747, 0.01275046]), 1: array([0.61403509, 0.32748538, 0.0380117 , 0.01169591, 0.00877193,
       0.        , 0.        ])}

Parch = Discrete
uniq feat [0 1 2 3 4 5 6]
uniq prob [0.76094276 0.13243547 0.08978676 0.00561167 0.00448934 0.00561167
 0.00112233]
label prob {

In [15]:
continuous = is_feature_continuous(df)
continuous

NameError: name 'is_feature_continuous' is not defined

In [None]:
df.columns