In [1]:
import math
import statistics
import collections
import itertools
import functools
import operator
import multiprocessing
import numpy as np
import scipy as sp
import scipy.stats
import pandas as pd
from matplotlib import pyplot as plt
from IPython.display import clear_output
%load_ext snakeviz

In [31]:
# Load the raw train/test CSV files, using the "PassengerId" column as the row index.
train = pd.read_csv("../data/raw/train.csv", index_col="PassengerId")
test = pd.read_csv("../data/raw/test.csv", index_col="PassengerId")

def enrich(ds):
    """Clean the "Cabin" column of `ds` in place and derive a "Desk" column from it.

    A cabin recorded as exactly "T" is turned into a missing value. "Desk" then
    holds the first character of every remaining cabin string and is missing
    wherever "Cabin" is missing.
    NOTE(review): the first character presumably encodes the ship deck - confirm.

    Args:
        ds: DataFrame with a "Cabin" column; it is modified in place.

    Returns:
        None.
    """
    def leading_char(cabin):
        return cabin[0]

    cabins = ds["Cabin"].replace("T", np.nan)
    ds["Cabin"] = cabins
    # Rows without a cabin are dropped here; assigning back to the frame aligns on
    # the index, which leaves those rows missing in "Desk".
    ds["Desk"] = cabins.dropna().apply(leading_char)

# enrich() edits each frame in place (rewrites "Cabin", adds "Desk") and returns nothing.
enrich(train)
enrich(test)

# Show the enriched training frame as the cell output.
train

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Desk
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,C
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,
...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,B
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,C


In [67]:
class Discrete:
    """Empirical probability distribution over a fixed, finite set of values.

    The admissible values are given at construction time; `fit` estimates each
    value's probability as its relative frequency in the data and `pdf` looks a
    probability up.
    """

    def __init__(self, values):
        """Store the admissible values; probabilities stay unset until `fit` runs.

        Args:
            values: container of admissible values (membership is tested with `in`).
        """
        self._values = values
        self._prob = None

    def fit(self, data):
        """Estimate the probability of every admissible value from `data`.

        Args:
            data: iterable of observations, each of which must be one of the
                admissible values.

        Raises:
            ValueError: if an observation is not an admissible value. The
                previously fitted probabilities are left untouched in that case.
        """
        # Counter reads 0 for unseen values without inserting keys.
        count = collections.Counter()
        for x in data:
            if x not in self._values:
                raise ValueError(f"{x} not in {self._values}")
            count[x] += 1
        total_count = sum(count.values())
        if total_count == 0:
            # No observations at all: relative frequencies are undefined (0 / 0).
            # Fall back to a uniform distribution instead of raising
            # ZeroDivisionError, so one empty subset cannot abort a whole fit.
            n_values = len(self._values)
            self._prob = {val: 1 / n_values for val in self._values}
            return
        self._prob = {
            val: count[val] / total_count
            for val in self._values
        }

    def pdf(self, x):
        """Return the fitted probability of `x`.

        Raises:
            KeyError: if `x` is not one of the admissible values.
            TypeError: if `fit` has not been called yet.
        """
        return self._prob[x]


class Model:
    """Classifier that multiplies per-feature class-conditional likelihoods with a class prior.

    Features are either "finite" (a `Discrete` distribution per class) or "gauss"
    (a normal distribution fitted per class with `scipy.stats.norm.fit`).
    Configure with `reset_features` and `reset_target`, fit with `relearn`, then
    query with `one_prob`, `prob` or `cond_prob`.
    """

    def __init__(self):
        # Everything stays unset until reset_features / reset_target / relearn run.
        self._target_name = None
        self._target_classes = None
        self._finite_feats = None
        self._gauss_feats = None
        self._conditional_p = None
        self._apriori = None

    def reset_features(self, **kwargs):
        """Replace the feature configuration.

        Args:
            **kwargs: feature name -> config dict. The config needs a "type" of
                "finite" (which also requires a "values" entry) or "gauss".

        Returns:
            self, so calls can be chained.

        Raises:
            NotImplementedError: for any other "type".
        """
        finite = {}
        gauss = []
        # Bind the fresh containers first: if a later entry is rejected, the
        # entries accepted before it are already stored on the model.
        self._finite_feats = finite
        self._gauss_feats = gauss
        for name, spec in kwargs.items():
            kind = spec["type"]
            if kind == "finite":
                finite[name] = {"values": spec["values"]}
                continue
            if kind == "gauss":
                gauss.append(name)
                continue
            raise NotImplementedError()
        return self

    def reset_target(self, name, classes):
        """Set the target column name and its classes; returns self for chaining."""
        self._target_classes = classes
        self._target_name = name
        return self

    def relearn(self, df):
        """Fit all class-conditional feature distributions and the class prior on `df`.

        Args:
            df: DataFrame holding the target column and every configured feature.
                Missing feature values are dropped per feature before fitting.
        """
        target = self._target_name
        table = {}
        self._conditional_p = table
        for label in self._target_classes:
            rows = df[df[target] == label]
            for name in self._gauss_feats:
                loc, scale = sp.stats.norm.fit(rows[name].dropna())
                table[label, name] = sp.stats.norm(loc, scale)
            for name, spec in self._finite_feats.items():
                dist = Discrete(spec["values"])
                # Stored before fitting, so the entry exists even if fit raises.
                table[label, name] = dist
                dist.fit(rows[name].dropna())
        prior = Discrete(self._target_classes)
        self._apriori = prior
        prior.fit(df[target].dropna())

    def one_prob(self, z):
        """Joint likelihood of one row, including its class.

        Args:
            z: Series holding the target value and, optionally, feature values.
                Missing entries are ignored, so absent features contribute no factor.

        Returns:
            Product of the available feature likelihoods (gauss features first,
            then finite ones) and the prior of the row's class.
        """
        observed = z.dropna()
        label = observed[self._target_name]
        likelihood = 1
        for name in [*self._gauss_feats, *self._finite_feats]:
            if name not in observed:
                continue
            likelihood *= self._conditional_p[label, name].pdf(observed[name])
        return likelihood * self._apriori.pdf(label)

    def prob(self, z):
        """Apply `one_prob` to every row of the DataFrame `z`."""
        return z.apply(self.one_prob, axis=1)

    def cond_prob(self, z):
        """Class probabilities for every row of `z`.

        For each target class the rows are evaluated as if they belonged to that
        class; the resulting joint likelihoods are then divided by their row sum.

        Returns:
            DataFrame with one column per target class.
        """
        joint = pd.DataFrame()
        for label in self._target_classes:
            hypothesis = z.assign(**{self._target_name: label})
            joint[label] = self.prob(hypothesis)
        return joint.div(joint.sum(axis=1), axis=0)

def mean_succ_prob(m, sample, target, splits=10):
    """Mean hold-out accuracy of `m` over contiguous splits of `sample`.

    The rows are cut into `splits` consecutive folds of `len(sample) // splits`
    rows each. For every fold the model is refitted on the remaining rows and
    its most probable class is compared with the true label. Leftover rows at
    the end (when the length is not divisible by `splits`) are only ever used
    for training.

    Args:
        m: model exposing `relearn(df)` and `cond_prob(df)`; `cond_prob` must
            return a DataFrame with one probability column per class label.
            The model is refitted in place on every fold.
        sample: DataFrame containing the features and the `target` column.
        target: name of the label column.
        splits: number of folds.

    Returns:
        Mean over the folds of the fraction of correctly predicted rows.
    """
    sz = len(sample.index) // splits
    res = []
    for split_id in range(splits):
        lo, hi = split_id * sz, (split_id + 1) * sz
        tra = pd.concat([sample[:lo], sample[hi:]])
        tes = sample[lo:hi]
        m.relearn(tra)
        pred = m.cond_prob(tes.drop(columns=[target]))
        # Capture labels and scores before the bookkeeping columns are added.
        class_labels = pred.columns.to_numpy()
        scores = pred.to_numpy(dtype=float)
        # Rows with no usable probability at all (e.g. 0 / 0 after normalising)
        # cannot be predicted; they are counted as wrong below.
        undecided = np.isnan(scores).all(axis=1)
        best_pos = np.nanargmax(
            np.where(undecided[:, None], -np.inf, scores), axis=1
        )
        # Translate the winning column position into its class label, so the
        # comparison also works for labels other than 0..k-1.
        pred["best_prediction"] = class_labels[best_pos]
        pred["actual"] = tes[target]
        pred["correct"] = (pred["best_prediction"] == pred["actual"]) & ~undecided
        res.append(np.mean(pred["correct"]))
    return np.mean(res)

# Feature configuration: "finite" features get a Discrete distribution per class,
# "gauss" features get a normal distribution fitted per class.
# NOTE(review): SibSp and Parch look like small integer counts but are modelled as
# gaussian here - confirm this is intended.
naive_bayes_v01 = Model().reset_features(**{
    "Sex": {"type": "finite", "values": ["male", "female"]},
    "Age": {"type": "gauss"},
    "Pclass": {"type": "finite", "values": [1, 2, 3]},
    "Desk": {"type": "finite", "values": ["A", "B", "C", "D", "E", "F", "G"]},
    "SibSp": {"type": "gauss"},
    "Parch": {"type": "gauss"},
}).reset_target("Survived", [0, 1])

# Mean hold-out accuracy over the default 10 contiguous splits of the training set.
mean_succ_prob(naive_bayes_v01, train, "Survived")

0.7808988764044944

In [68]:
# Refit on the full training set (the evaluation above left the model fitted on
# its last split), then export the predictions for the test passengers.
naive_bayes_v01.relearn(train)
(
    # Predict class 1 whenever its probability exceeds 0.5; a NaN probability
    # compares as False and is therefore written as 0.
    (naive_bayes_v01.cond_prob(test)[1] > 0.5)
    .rename("Survived")
    .astype(int)
    .to_csv("../models/naive_bayes_v01/naive_bayes_v01.csv")
)