In [8]:
# Imports
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Initialize data

# Training data
data = [
    ['D1', 'Hà-Nội phở cháo-lòng Hà-Nội', 'B'],
    ['D2', 'Hà-Nội bún-chả phở ô-mai', 'B'],
    ['D3', 'Phở bánh-giò ô-mai', 'B'],
    ['D4', 'Sài-Gòn hủ-tiếu bánh-bò phở', 'N']
]

# Validation data
D5 = 'Hà-Nội Hà-Nội bún-chả hủ-tiếu'
D6 = 'Phở hủ-tiếu bánh-bò'

df = pd.DataFrame(data, columns=['No.', 'Text', 'Class'])
print(df)

# Problem summary: From the training data, we must classify D5 and D6 as 
# belonging to 'B' for Northen Vietnamese speech or 'N' for Southern Vietnamese speech

  No.                         Text Class
0  D1  Hà-Nội phở cháo-lòng Hà-Nội     B
1  D2     Hà-Nội bún-chả phở ô-mai     B
2  D3           Phở bánh-giò ô-mai     B
3  D4  Sài-Gòn hủ-tiếu bánh-bò phở     N


In [9]:
# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer on the training data and transform the text
X_train = vectorizer.fit_transform(df['Text'])

# Transform the validation data
X_val = vectorizer.transform([D5, D6])

# Convert to dense format for easy visualization
X_train_dense = X_train.toarray()
X_val_dense = X_val.toarray()

# Get the feature names (words) for reference
feature_names = vectorizer.get_feature_names_out()

# Print training data vectors and corresponding feature names
print("Training Data Vectors (Dense Format):")
print(pd.DataFrame(X_train_dense, columns=feature_names))

# Print validation data vectors
print("\nValidation Data Vectors (Dense Format):")
print(pd.DataFrame(X_val_dense, columns=feature_names))

Training Data Vectors (Dense Format):
   bánh  bò  bún  cháo  chả  giò  gòn  hà  hủ  lòng  mai  nội  phở  sài  tiếu
0     0   0    0     1    0    0    0   2   0     1    0    2    1    0     0
1     0   0    1     0    1    0    0   1   0     0    1    1    1    0     0
2     1   0    0     0    0    1    0   0   0     0    1    0    1    0     0
3     1   1    0     0    0    0    1   0   1     0    0    0    1    1     1

Validation Data Vectors (Dense Format):
   bánh  bò  bún  cháo  chả  giò  gòn  hà  hủ  lòng  mai  nội  phở  sài  tiếu
0     0   0    1     0    1    0    0   2   1     0    0    2    0    0     1
1     1   1    0     0    0    0    0   0   1     0    0    0    1    0     1


In [10]:
# Scikit-learn Multinomial Naive Bayes implemetation
clf = MultinomialNB()

# Fitting the model on training data
clf.fit(X_train, df['Class'])

# Predicting the class of D5 and D6
print('Predicting class of D5:', str(clf.predict(X_val[0])))
print('Predicting class of D6:', str(clf.predict(X_val[1])))

# Predicting the probability of D5 and D6 for each class
print('Probability of D5 in each class:', clf.predict_proba(X_val[0]))
print('Probability of D6 in each class:', clf.predict_proba(X_val[1]))

Predicting class of D5: ['B']
Predicting class of D6: ['N']
Probability of D5 in each class: [[0.97457435 0.02542565]]
Probability of D6 in each class: [[0.10329395 0.89670605]]
