# Week2 NaiveBayes

## Phân loại tình trạng bệnh lý (condition) với Bernoulli Naïve Bayes

In [15]:
import numpy as np
import pandas as pd
from numpy.conftest import dtype
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score



### 1. Đọc dữ liệu

- File `bernoulli_nb_symptoms.csv` chứa các triệu chứng (symptoms) của bệnh nhân và cột nhãn `condition`.
- Sử dụng `pd.read_csv()` để đọc dữ liệu vào DataFrame.
- Kiểm tra các cột và vài dòng đầu tiên để hiểu cấu trúc dữ liệu.

In [16]:
file_path = 'bernoulli_nb_symptoms.csv'
data = pd.read_csv(file_path)
print(data.head())

     condition  fever  chills  runny_nose  sneezing  cough  sore_throat  \
0  food_poison      1       0           1         0      0            0   
1          flu      0       0           0         0      0            0   
2          flu      0       0           0         0      1            0   
3  stomach_bug      0       0           0         0      1            1   
4  food_poison      0       0           0         0      0            0   

   headache  nausea  vomiting  ...  dehydration_signs  dry_mouth  dark_urine  \
0         0       0         0  ...                  0          0           0   
1         1       0         0  ...                  0          0           0   
2         0       0         0  ...                  0          1           0   
3         0       0         0  ...                  0          0           0   
4         0       0         0  ...                  0          0           0   

   rapid_heart_rate  sweating  itchy_eyes  watery_eyes  nasal_conges

### 2. Tách đặc trưng (X) và nhãn (y)

- `X`: tất cả các cột triệu chứng, bỏ cột `condition`.
- `y`: cột nhãn `condition`, mô tả tình trạng bệnh của từng mẫu.

In [17]:
target_feature = 'condition'
Y = data[target_feature]
labels = np.zeros((len(set(Y)), 2), dtype=Y.dtype)
id = 0
for label in set(Y):
    labels[id, 0] = label
    labels[id, 1] = (Y == label).sum()/len(Y)
    id += 1

In [18]:
print(labels)

[['allergy' np.float64(0.1184)]
 ['food_poison' np.float64(0.1064)]
 ['covid_like' np.float64(0.1532)]
 ['cold' np.float64(0.1576)]
 ['dehydration' np.float64(0.094)]
 ['flu' np.float64(0.1532)]
 ['stomach_bug' np.float64(0.1236)]
 ['migraine' np.float64(0.0936)]]


### 3. Chia dữ liệu Train/Test
- Tỉ lệ: 60% Train, 40% Test.

In [19]:
Train_size = round(len(data)*0.6)
X_Train = data[:Train_size]
Y_Train = data[target_feature][:Train_size]
X_Test = data[Train_size:].drop(target_feature, axis = 1).reset_index(drop=True, inplace=False)
Y_Test = data[target_feature][Train_size:].reset_index(drop=True, inplace=False)

### 4.Huấn luyện Bernoulli Naïve Bayes

- Tính các giá trị P_ik ứng với xác suất để trường thứ i nhận giá trị 1: P_ik := [|{xm in class k : x_im = xi}| ] / [N_k]
- Lấy tên mỗi trường (field) của dữ liệu và số lựa chọn ứng với trường đó
- BernoulliNB phù hợp với **dữ liệu nhị phân (0/1)**, ví dụ triệu chứng có hay không.
- Huấn luyện trên tập Train và chuẩn bị dự đoán trên tập Test.

#### Tính các giá trị P_ik ứng với xác suất để trường thứ i nhận giá trị 1: P_ik := [|{xm in class k : x_im = xi}| ] / [N_k]

In [20]:
def Pik_feature_per_class(X):
    X = np.array(X)
    count = (X == 1).sum()
    return count/len(X)

##### Lấy tên mỗi trường (field) của dữ liệu và số lựa chọn ứng với trường đó

In [23]:
def predict_output_label(X, x_input, p_labels, target_feature):
    p = np.log(np.array(p_labels[:, 1], dtype=float))

    for k in range(len(p)):
        Xk = X[X[target_feature] == p_labels[k, 0]]
        for i in range(len(x_input)):
            Pik = Pik_feature_per_class(Xk.iloc[:,i+1])
            if (Pik != 0):
                p[k] += np.log(x_input.iloc[i] * Pik + (1-x_input.iloc[i]) * (1-Pik))
    y_star = np.argmax(p)
    return p_labels[y_star, 0]

In [26]:
Y_pred = np.zeros(len(Y_Test), dtype=Y_Test.dtype)
for i in range(len(X_Test)):
    Y_pred[i] = predict_output_label(X_Train, X_Test.iloc[i, :], labels, target_feature)
print(Y_pred)

['migraine' 'food_poison' 'cold' 'stomach_bug' 'migraine' 'migraine'
 'cold' 'flu' 'flu' 'stomach_bug' 'stomach_bug' 'flu' 'allergy' 'cold'
 'cold' 'stomach_bug' 'covid_like' 'dehydration' 'cold' 'food_poison'
 'allergy' 'food_poison' 'food_poison' 'allergy' 'stomach_bug' 'migraine'
 'flu' 'covid_like' 'food_poison' 'cold' 'flu' 'cold' 'cold' 'flu'
 'stomach_bug' 'cold' 'stomach_bug' 'food_poison' 'migraine' 'cold' 'cold'
 'allergy' 'covid_like' 'covid_like' 'flu' 'flu' 'dehydration' 'cold'
 'cold' 'allergy' 'dehydration' 'cold' 'stomach_bug' 'covid_like'
 'allergy' 'cold' 'flu' 'food_poison' 'allergy' 'food_poison' 'flu'
 'allergy' 'allergy' 'allergy' 'allergy' 'food_poison' 'flu' 'flu' 'cold'
 'stomach_bug' 'food_poison' 'food_poison' 'flu' 'food_poison' 'flu'
 'allergy' 'migraine' 'cold' 'cold' 'covid_like' 'cold' 'allergy'
 'covid_like' 'food_poison' 'covid_like' 'covid_like' 'covid_like'
 'covid_like' 'migraine' 'dehydration' 'flu' 'migraine' 'food_poison'
 'covid_like' 'allergy' 

In [27]:
print(accuracy_score(Y_pred, Y_Test))

0.479


In [28]:
print(predict_output_label(X_Train, X_Train.iloc[3, 1:], labels, target_feature))

stomach_bug


## Richter's Predictor


### Exercise 1 Type: Categorical dùng Multinomail Naive Bayes

In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score

#### Đọc dữ liệu và combine hai bảng, lấy các thuộc tính như yêu cầu đề bài

In [29]:
train_values = pd.read_csv("train_values.csv")
train_labels = pd.read_csv("train_labels.csv")
test_values  = pd.read_csv("test_values.csv")
submission_format = pd.read_csv("submission_format.csv")

categorical_cols = [
    "land_surface_condition",
    "foundation_type",
    "roof_type",
    "ground_floor_type",
    "other_floor_type",
    "position",
    "plan_configuration",
    "legal_ownership_status"
]

# Ghép nhãn vào train
df = train_values[categorical_cols].copy()
df["damage_grade"] = train_labels["damage_grade"]

target_feature = "damage_grade"
unique_labels = np.unique(df[target_feature])
num_labels_M = {col: df[col].nunique() for col in categorical_cols}

In [30]:
def multinomial_pxik_feature_per_class(X, num_nomials_M, alpha=1.0):
    values, counts = np.unique(X, return_counts=True)
    total = len(X)

    probs = {}
    for v, count in zip(values, counts):
        probs[v] = (count + alpha) / (total + num_nomials_M * alpha)
    return probs


In [31]:
cond_probs = {c: {} for c in unique_labels}
for c in unique_labels:
    Xc = df[df[target_feature] == c]
    for col in categorical_cols:
        M = num_labels_M[col]
        cond_probs[c][col] = multinomial_pxik_feature_per_class(Xc[col], M, alpha=1.0)

# Tính prior P(y)
priors = {
    c: len(df[df[target_feature] == c]) / len(df)
    for c in unique_labels
}

In [32]:
def predict_naive_bayes(x_input):
    scores = {}
    for c in unique_labels:
        score = np.log(priors[c])
        for col in categorical_cols:
            val = x_input[col]
            prob = cond_probs[c][col].get(val, 1e-9)  # tránh chia 0
            score += np.log(prob)
        scores[c] = score
    return max(scores, key=scores.get)

In [34]:
y_pred_test = test_values[categorical_cols].apply(predict_naive_bayes, axis=1)
submission = submission_format.copy()
submission["damage_grade"] = y_pred_test.values
submission.to_csv("submission.csv", index=False)
y_pred_train = train_values[categorical_cols].apply(predict_naive_bayes, axis=1)
y_true_train = train_labels["damage_grade"].values
train_acc = accuracy_score(y_true_train, y_pred_train)

print("Train Accuracy:", train_acc)



Train Accuracy: 0.5675112528347934


##### Sklearn

In [27]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Đọc dữ liệu
train_values = pd.read_csv("train_values.csv")
train_labels = pd.read_csv("train_labels.csv")
test_values  = pd.read_csv("test_values.csv")
submission_format = pd.read_csv("submission_format.csv")

# Các cột categorical
categorical_cols = [
    "land_surface_condition",
    "foundation_type",
    "roof_type",
    "ground_floor_type",
    "other_floor_type",
    "position",
    "plan_configuration",
    "legal_ownership_status"
]

# Mã hóa categorical thành số nguyên
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_values[col] = le.fit_transform(train_values[col])
    test_values[col] = le.transform(test_values[col])  # áp dụng cùng encoder
    encoders[col] = le

# Train model Naive Bayes
X_train = train_values[categorical_cols]
y_train = train_labels["damage_grade"]

model = CategoricalNB()
model.fit(X_train, y_train)

# Dự đoán trên train để check accuracy
y_pred_train = model.predict(X_train)
acc = accuracy_score(y_train, y_pred_train)
print("Train Accuracy (CategoricalNB):", acc)

# Dự đoán trên test
y_pred_test = model.predict(test_values[categorical_cols])

# Xuất submission
submission = submission_format.copy()
submission["damage_grade"] = y_pred_test
submission.to_csv("submission.csv", index=False)

print(submission.head())


Train Accuracy (CategoricalNB): 0.5675112528347934
   building_id  damage_grade
0       300051             2
1        99355             2
2       890251             2
3       745817             1
4       421793             3


### Exercise 2 Type: Categorical dùng Bernoulli Naive Bayes

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

train_values = pd.read_csv("train_values.csv")
train_labels = pd.read_csv("train_labels.csv")
binary_cols = [
    "has_superstructure_adobe_mud",
    "has_superstructure_mud_mortar_stone",
    "has_superstructure_stone_flag",
    "has_superstructure_cement_mortar_stone",
    "has_superstructure_mud_mortar_brick",
    "has_superstructure_cement_mortar_brick",
    "has_superstructure_timber",
    "has_superstructure_bamboo",
    "has_superstructure_rc_non_engineered",
    "has_superstructure_rc_engineered",
    "has_superstructure_other",
    "has_secondary_use",
    "has_secondary_use_agriculture",
    "has_secondary_use_hotel",
    "has_secondary_use_rental",
    "has_secondary_use_institution",
    "has_secondary_use_school",
    "has_secondary_use_industry",
    "has_secondary_use_health_post",
    "has_secondary_use_gov_office",
    "has_secondary_use_use_police",
    "has_secondary_use_other"
]

X = train_values[binary_cols]
y = train_labels["damage_grade"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

class BernoulliNBManual:
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        self.class_priors_ = {}
        self.feature_probs_ = {}

        for c in self.classes_:
            Xc = X[y == c]
            self.class_priors_[c] = len(Xc) / len(X)

            probs = (Xc.sum(axis=0) + self.alpha) / (len(Xc) + 2*self.alpha)
            self.feature_probs_[c] = probs

    def predict_one(self, x):
        scores = {}
        for c in self.classes_:
            log_prob = np.log(self.class_priors_[c])
            probs = self.feature_probs_[c]
            for j, val in enumerate(x):
                if val == 1:
                    log_prob += np.log(probs.iloc[j])
                else:
                    log_prob += np.log(1 - probs.iloc[j])
            scores[c] = log_prob
        return max(scores, key=scores.get)

    def predict(self, X):
        return np.array([self.predict_one(row) for row in X.values])

model_manual = BernoulliNBManual(alpha=1.0)
model_manual.fit(X_train, y_train)
y_pred_manual = model_manual.predict(X_test)

print("Accuracy Bernoulli NB:", accuracy_score(y_test, y_pred_manual))

Accuracy Bernoulli NB: 0.5676698942198233


##### Sklearn

In [4]:
from sklearn.naive_bayes import BernoulliNB

model_sklearn = BernoulliNB(alpha=1.0)
model_sklearn.fit(X_train, y_train)
y_pred_sklearn = model_sklearn.predict(X_test)

print("Accuracy BernoulliNB:", accuracy_score(y_test, y_pred_sklearn))

Accuracy BernoulliNB: 0.5676698942198233


## Phân loại món ăn Multinomial Naive Bayes

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score

### Đọc dữ liệu

In [215]:
df = pd.read_csv("multinomial_nb_orders.csv")
print(df.head())

      cuisine     city device user_segment    promo fav_ingredient  \
0    american  city_16    web          new  promo_1          onion   
1        thai  city_10    ios       family  promo_7       cilantro   
2    american  city_18    ios      student  promo_1         cheese   
3    japanese  city_10    web       family  promo_3         ginger   
4  vietnamese  city_10    web       family     none     fish_sauce   

  spice_level day_of_week price_bucket  
0         hot         Wed         high  
1        mild         Wed          mid  
2         hot         Fri          mid  
3        mild         Tue         high  
4        mild         Wed          low  


In [216]:
target_feature = 'cuisine'
Y = df[target_feature]
labels = np.zeros((len(set(Y)), 2), dtype=Y.dtype)
# Liệt kê các nhãn (label) khác nhau (từ trường cuisine) k =1, 2, ... C
# và tính xác suất P_k của mỗi nhãn. Sử dụng nhãn thay cho chỉ số
id = 0
for label in set(Y):
    labels[id, 0] = label
    labels[id, 1] = (Y == label).sum()/len(Y)
    id += 1
labels

array([['chinese', np.float64(0.12366666666666666)],
       ['american', np.float64(0.13466666666666666)],
       ['vietnamese', np.float64(0.09933333333333333)],
       ['mexican', np.float64(0.102)],
       ['mediterranean', np.float64(0.082)],
       ['korean', np.float64(0.065)],
       ['japanese', np.float64(0.10133333333333333)],
       ['indian', np.float64(0.09333333333333334)],
       ['thai', np.float64(0.10333333333333333)],
       ['italian', np.float64(0.09533333333333334)]], dtype=object)

In [217]:
def get_distinct_value_in_fields():

# Get the column names as a Pandas Index object
    column_names_index = df.columns
    tmp_list = column_names_index.values
    num_nomials_per_fields = np.zeros((len(tmp_list ), 2), dtype=tmp_list.dtype)
    # Convert the Index object to a NumPy array
    num_nomials_per_fields[:, 0] = tmp_list
    for i in range(len(tmp_list)):
        X = df[tmp_list[i]]
        unique_elements = set(X)
        # Get the count of unique elements
        num_nomials_per_fields[i, 1] = len(unique_elements)
    return num_nomials_per_fields
num_labels_M = get_distinct_value_in_fields()
print(num_labels_M)

[['cuisine' 10]
 ['city' 20]
 ['device' 5]
 ['user_segment' 6]
 ['promo' 9]
 ['fav_ingredient' 18]
 ['spice_level' 5]
 ['day_of_week' 7]
 ['price_bucket' 4]]


In [218]:
def Pxik_feature_per_class(X, xi, num_nomials_M, alpla = 1.0):
    X = np.array(X)
    count = (X == xi).sum()
    return (count+alpla)/(len(X) + num_nomials_M*alpla)

In [219]:
def predict_output_label(X, x_input, p_labels, target_feature):
    num_labels_M = get_distinct_value_in_fields()
    #score for each class
    p = np.log(np.array(p_labels[:, 1], dtype=float))
    for k in range(len(p)):
        #print(X['cuisine']== p_labels[k, 0])
        Xk = X[X[target_feature] == p_labels[k, 0]]
        print(Xk.shape)
        for i in range(len(x_input)):
            p[k] += np.log(Pxik_feature_per_class(Xk.iloc[:, i+1],

            x_input.iloc[i], num_labels_M[i+1, 1]))

    y_star = np.argmax(p)
    return p_labels[y_star, 0]

In [220]:
Train_size = round(len(df)*0.7)
X_Train = df[:Train_size]
Y_Train = df[target_feature][:Train_size]
X_Test = df[Train_size:].drop(target_feature, axis = 1).reset_index(drop=True,
inplace=False)
Y_Test = df[target_feature][Train_size:].reset_index(drop=True, inplace=False)

In [221]:
Y_pred = np.zeros(len(Y_Test), dtype = Y_Test.dtype)
print(len(X_Test), len(Y_Test))
print(X_Test.index, Y_Test.index)
for i in range(len(X_Test)):
    Y_pred[i] = predict_output_label(X_Train,X_Test.iloc[i, :],labels, target_feature)
from sklearn.metrics import accuracy_score
print(accuracy_score(Y_pred, Y_Test))
print(predict_output_label(X_Train, X_Train.iloc[3, 1:], labels, target_feature) )

900 900
RangeIndex(start=0, stop=900, step=1) RangeIndex(start=0, stop=900, step=1)
(242, 9)
(295, 9)
(203, 9)
(211, 9)
(191, 9)
(134, 9)
(210, 9)
(194, 9)
(220, 9)
(200, 9)
(242, 9)
(295, 9)
(203, 9)
(211, 9)
(191, 9)
(134, 9)
(210, 9)
(194, 9)
(220, 9)
(200, 9)
(242, 9)
(295, 9)
(203, 9)
(211, 9)
(191, 9)
(134, 9)
(210, 9)
(194, 9)
(220, 9)
(200, 9)
(242, 9)
(295, 9)
(203, 9)
(211, 9)
(191, 9)
(134, 9)
(210, 9)
(194, 9)
(220, 9)
(200, 9)
(242, 9)
(295, 9)
(203, 9)
(211, 9)
(191, 9)
(134, 9)
(210, 9)
(194, 9)
(220, 9)
(200, 9)
(242, 9)
(295, 9)
(203, 9)
(211, 9)
(191, 9)
(134, 9)
(210, 9)
(194, 9)
(220, 9)
(200, 9)
(242, 9)
(295, 9)
(203, 9)
(211, 9)
(191, 9)
(134, 9)
(210, 9)
(194, 9)
(220, 9)
(200, 9)
(242, 9)
(295, 9)
(203, 9)
(211, 9)
(191, 9)
(134, 9)
(210, 9)
(194, 9)
(220, 9)
(200, 9)
(242, 9)
(295, 9)
(203, 9)
(211, 9)
(191, 9)
(134, 9)
(210, 9)
(194, 9)
(220, 9)
(200, 9)
(242, 9)
(295, 9)
(203, 9)
(211, 9)
(191, 9)
(134, 9)
(210, 9)
(194, 9)
(220, 9)
(200, 9)
(242, 9)
(295, 9