# Probabilistic Generative Model

In [1]:
import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
from math import log
import os

### read and process dataset

In [2]:
# Load the raw train and test tables from CSV files under the relative "dataset/" directory.
train_data = pd.read_csv("dataset/train.csv")
test_data = pd.read_csv("dataset/test.csv")

In [3]:
# Report the training frame's (rows, columns) and preview its first rows.
print("train_data.shape : ", train_data.shape)
train_data.head()

train_data.shape :  (32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Report the test frame's (rows, columns) and preview its first rows.
print("test_data.shape : ", test_data.shape)
test_data.head()

test_data.shape :  (16281, 14)


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States


In [5]:
def preprocess_data_x(raw_data):
    """Turn a raw frame into a standardized, fully numeric feature frame.

    Steps:
      1. Drop "sex" (re-added below as a single 0/1 column) and, when present,
         the "income" label column.
      2. Split the remaining columns into text columns and all other columns.
      3. Prepend a "sex" indicator column (1 where the raw value is exactly
         " Female", 0 otherwise).
      4. One-hot encode the text columns with ``pd.get_dummies``.
      5. Cast everything to int64 and standardize each column with the mean
         and (sample) standard deviation of *this* frame.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain a "sex" column; may contain an "income" column.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns are "sex", the non-text columns, then the one-hot columns.

    Raises
    ------
    KeyError
        If ``raw_data`` has no "sex" column.
    """
    label_and_sex = ["sex", "income"] if "income" in raw_data.columns else ["sex"]
    data = raw_data.drop(label_and_sex, axis=1)

    # Treat both the legacy "object" dtype and pandas' dedicated string dtype
    # as text, so string columns are one-hot encoded regardless of how pandas
    # stored them when the CSV was read.
    list_object_column = [
        col
        for col in data.columns
        if pd.api.types.is_object_dtype(data[col].dtype)
        or pd.api.types.is_string_dtype(data[col].dtype)
    ]
    list_num_column = [col for col in data.columns if col not in list_object_column]

    object_data = data[list_object_column]
    # Copy before inserting: inserting into a column-selection of `data`
    # can trigger pandas' SettingWithCopyWarning.
    num_data = data[list_num_column].copy()

    # Encode sex as one column (male=0, female=1). The builtin `int` replaces
    # the `np.int` alias, which no longer exists in current NumPy releases.
    num_data.insert(0, "sex", (raw_data["sex"] == " Female").astype(int))

    # One indicator column per distinct value of each text column, e.g. a
    # column with values A, B, C becomes three 0/1 columns.
    object_data = pd.get_dummies(object_data)

    data = pd.concat([num_data, object_data], axis=1)
    data_x = data.astype("int64")
    # The small epsilon keeps constant columns (std == 0) from dividing by zero.
    data_x = (data_x - data_x.mean()) / (data_x.std() + 1e-8)

    return data_x

def preprocess_data_y(raw_data):
    """Build the binary label frame from the "income" column.

    Returns a one-column DataFrame named "income" (dtype int64) holding 1
    where the raw value is exactly " >50K" and 0 everywhere else.
    """
    is_high_income = raw_data["income"] == " >50K"
    labels = is_high_income.astype("int64")
    return pd.DataFrame(labels, columns=["income"])

In [6]:
# Build the design matrices. One-hot encoding only creates a column for a
# category that actually occurs in the frame being encoded, so the train and
# test frames can end up with different columns. This cell used to hard-code
# the removal of "native_country_ Holand-Netherlands" from the training frame
# (presumably because the test frame lacks it). Keeping the columns that exist
# in both frames, in training order, gives the same result for that case
# without depending on one specific category name.
train_features = preprocess_data_x(train_data)
test_features = preprocess_data_x(test_data)
shared_columns = [col for col in train_features.columns if col in test_features.columns]

x_train = train_features[shared_columns].values
x_test = test_features[shared_columns].values
y_train = preprocess_data_y(train_data).values

In [7]:
def shuffle(X, Y):
    """Return X and Y reordered by one shared random permutation of the rows.

    The permutation is drawn from NumPy's global random state, so rows of X
    and Y that were paired before the call are still paired afterwards.
    """
    row_order = np.arange(X.shape[0])
    np.random.shuffle(row_order)  # in-place permutation of the row indices
    shuffled_x = X[row_order]
    shuffled_y = Y[row_order]
    return shuffled_x, shuffled_y

def split_dataset(data_x, data_y, percentage, seed=None):
    """Randomly split paired arrays into a training part and a validation part.

    The row permutation is drawn inside this function instead of calling a
    module-level ``shuffle``: that name is also bound by
    ``from random import shuffle`` in the import cell, so which function runs
    would depend on cell execution order.

    Parameters
    ----------
    data_x, data_y : numpy.ndarray
        Arrays with the same number of rows; row i of each belongs together.
    percentage : float
        Fraction of rows (between 0 and 1) placed in the validation part.
        The validation size is ``int(n_rows * percentage)``.
    seed : int or None, optional
        When given, the permutation comes from a private
        ``np.random.RandomState(seed)`` and is reproducible. When None (the
        default) NumPy's global random state is used.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_valid, y_valid)``.

    Raises
    ------
    ValueError
        If the row counts differ or ``percentage`` is outside [0, 1].
    """
    n_rows = data_x.shape[0]
    if data_y.shape[0] != n_rows:
        raise ValueError(
            "data_x and data_y must have the same number of rows, got %d and %d"
            % (n_rows, data_y.shape[0])
        )
    if not 0.0 <= percentage <= 1.0:
        raise ValueError("percentage must be between 0 and 1, got %r" % (percentage,))

    valid_size = int(n_rows * percentage)

    rng = np.random if seed is None else np.random.RandomState(seed)
    order = rng.permutation(n_rows)
    data_x, data_y = data_x[order], data_y[order]

    # The first `valid_size` shuffled rows are held out; the rest are for training.
    x_train, x_valid = data_x[valid_size:], data_x[:valid_size]
    y_train, y_valid = data_y[valid_size:], data_y[:valid_size]

    return x_train, y_train, x_valid, y_valid

In [8]:
# Hold out 20% of the rows for validation. The split is random and driven by
# NumPy's global random state, so seed it first: otherwise every
# Restart & Run All produces a different split and different validation numbers.
np.random.seed(0)
X_train, Y_train, X_valid, Y_valid = split_dataset(x_train, y_train, 0.2)

In [9]:
# Spot-check a single row (index 100) of the training split.
X_train[100]

array([-0.70306053,  0.03067009,  0.44161671, -0.42005317, -0.14591824,
       -0.2166562 , -0.19740595, -0.24444643, -0.17429242,  3.81531739,
       -0.01466358, -1.51676898, -0.18838643, -0.2909312 , -0.20375831,
       -0.02073966, -0.1717506 , -0.19348364, -0.11609016, -0.07201489,
       -0.10164797, -0.14226961, -0.12664299, -0.18406093, -0.21053109,
       -0.44365023, -0.11334212,  1.44937493, -0.23637027, -0.03960681,
       -0.13419346, -0.53713599, -0.39750195, -0.02658653,  1.083594  ,
       -0.11403502, -0.69877302, -0.18028182, -0.17735539, -0.24493989,
       -0.36185587, -0.01662744, -0.37948933, -0.37773974, -0.17744748,
       -0.20957474,  3.90688696, -0.33553617, -0.06780059, -0.38165751,
       -0.14260628, -0.35531062, -0.17127623, -0.22710006,  1.21161301,
       -0.58513159, -0.17624701, -0.42933921, -0.34402702, -0.22492334,
       -0.09819935, -0.18154914, -0.32576322, -0.09161022,  0.41301335,
       -0.13502119, -0.02416283, -0.06107248, -0.04804805, -0.04

### train

In [10]:
def train(X_train, Y_train):
    """Estimate per-class means and a shared covariance matrix.

    Rows whose label equals 1 form class 1 and rows whose label equals 0 form
    class 2. Each class covariance is the scatter matrix divided by the class
    size; the shared covariance is their average weighted by class frequency.

    Returns
    -------
    tuple
        ``(mu1, mu2, sigma, n_class1, n_class2)``.
    """
    total_rows = X_train.shape[0]

    def _class_summary(label):
        # Row indices of the samples carrying this label.
        rows = np.nonzero(Y_train == label)[0]
        count = rows.shape[0]
        members = X_train[rows]
        center = members.mean(0)
        residual = members - center
        scatter = np.dot(residual.T, residual) / count
        return center, scatter, count

    mu1, sigma1, n_class1 = _class_summary(1)
    mu2, sigma2, n_class2 = _class_summary(0)

    weight1 = n_class1 / total_rows
    weight2 = n_class2 / total_rows
    sigma = weight1 * sigma1 + weight2 * sigma2

    return mu1, mu2, sigma, n_class1, n_class2

In [11]:
# Fit on the training split only. Fitting on the full x_train / y_train would
# include the rows held out as X_valid / Y_valid, so the validation score
# would be measured on data the model has already seen.
mu1, mu2, sigma, N1, N2 = train(X_train, Y_train)

### valid

In [13]:
# Check the shape of the class-1 mean vector returned by train().
print("mu1.shape : ", mu1.shape)

mu1.shape :  (106,)


In [None]:
def valid(X_valid, Y_valid, mu1, mu2, sigma, N1, N2):
    """Score the shared-covariance Gaussian generative classifier on held-out data.

    With one covariance matrix shared by both classes, the posterior is
    P(C1 | x) = sigmoid(w . x + b), where

        w = (mu1 - mu2)^T sigma^-1
        b = -1/2 mu1^T sigma^-1 mu1 + 1/2 mu2^T sigma^-1 mu2 + ln(N1 / N2)

    A sample is predicted as class 1 when w . x + b > 0, i.e. when the
    posterior exceeds 0.5.

    Parameters
    ----------
    X_valid : numpy.ndarray
        Feature rows to classify.
    Y_valid : numpy.ndarray
        True 0/1 labels, one per row of ``X_valid`` (flattened before comparing).
    mu1, mu2 : numpy.ndarray
        Mean vectors of class 1 and class 2 (label 0).
    sigma : numpy.ndarray
        Shared covariance matrix.
    N1, N2 : int
        Number of training samples in class 1 and class 2.

    Returns
    -------
    float
        Fraction of rows whose predicted label equals the true label.

    Raises
    ------
    ValueError
        If ``N1`` or ``N2`` is not positive (the class prior ratio is undefined).
    """
    if N1 <= 0 or N2 <= 0:
        raise ValueError("N1 and N2 must both be positive, got %r and %r" % (N1, N2))

    sigma_inv = inv(sigma) # get the inverse of sigma

    weight = np.dot(mu1 - mu2, sigma_inv)
    bias = (
        -0.5 * np.dot(np.dot(mu1, sigma_inv), mu1)
        + 0.5 * np.dot(np.dot(mu2, sigma_inv), mu2)
        + np.log(float(N1) / N2)
    )

    # Log-odds of class 1 for every row; the sign decides the predicted class.
    z = np.dot(X_valid, weight) + bias
    predictions = (z > 0).astype("int64")

    accuracy = float(np.mean(predictions == np.ravel(Y_valid)))
    return accuracy

### Ref
[http://www.linzehui.me/2018/09/16/](http://www.linzehui.me/2018/09/16/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E4%B8%8E%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0%E7%AE%97%E6%B3%95%E7%9F%A5%E8%AF%86/Lecture%204%20Classification%20%20Probabilistic%20Generative%20Model/)

[https://github.com/maplezzz/NTU_ML2017_Hung-yi-Lee_HW/blob/master/HW2/gp.py](https://github.com/maplezzz/NTU_ML2017_Hung-yi-Lee_HW/blob/master/HW2/gp.py)