# 베르누이 나이브베이즈

In [2]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
data = pd.read_csv("./spam.csv")
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
data['label'] = data['Category'].map({'spam' : 1, "ham" : 0})
data.head()

Unnamed: 0,Category,Message,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
X = data["Message"]
y = data['label']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 103)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(3900,) (1672,) (3900,) (1672,)


In [9]:
cv = CountVectorizer(max_features = 1000, binary = True)
x_train_cv = cv.fit_transform(x_train)

In [11]:
encoded = x_train_cv.toarray() # 행렬 형식
encoded

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [12]:
cv.inverse_transform(encoded[0])

[array(['and', 'come', 'down', 'face', 'feel', 'for', 'have', 'heart',
        'into', 'life', 'loved', 'making', 'me', 'my', 'on', 'smile',
        'sun', 'the', 'you'], dtype='<U15')]

In [13]:
cv.get_feature_names()

['000',
 '04',
 '0800',
 '08000839402',
 '08000930705',
 '08712460324',
 '10',
 '100',
 '1000',
 '10p',
 '11',
 '12hrs',
 '150',
 '150p',
 '150ppm',
 '16',
 '18',
 '1st',
 '20',
 '200',
 '2000',
 '2003',
 '250',
 '2day',
 '2lands',
 '2nd',
 '30',
 '350',
 '50',
 '500',
 '5000',
 '50p',
 '750',
 '800',
 '8007',
 '86688',
 '87066',
 'able',
 'about',
 'abt',
 'ac',
 'account',
 'across',
 'actually',
 'address',
 'aft',
 'after',
 'afternoon',
 'again',
 'age',
 'age16',
 'ago',
 'ah',
 'aight',
 'all',
 'almost',
 'alone',
 'already',
 'alright',
 'also',
 'always',
 'am',
 'amp',
 'an',
 'and',
 'angry',
 'another',
 'ans',
 'answer',
 'any',
 'anyone',
 'anything',
 'anytime',
 'anyway',
 'apply',
 'ard',
 'are',
 'area',
 'around',
 'as',
 'asap',
 'ask',
 'askd',
 'asked',
 'ass',
 'at',
 'attempt',
 'auction',
 'available',
 'await',
 'award',
 'awarded',
 'away',
 'awesome',
 'b4',
 'babe',
 'baby',
 'back',
 'bad',
 'bak',
 'balance',
 'bank',
 'bath',
 'bb',
 'bcoz',
 'be',
 'be

In [21]:
len(cv.get_feature_names()) #1000개의 데이터가 모두 나왔는지 확인

1000

## 베르누이 나이브베이즈 분류

In [14]:
nb_clf = BernoulliNB()

nb_clf.fit(x_train_cv, y_train)

BernoulliNB()

In [16]:
x_test_cv = cv.fit_transform(x_test)

In [18]:
encoded2 = x_test_cv.toarray()
encoded2

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
pred = nb_clf.predict(x_test_cv)

In [20]:
accuracy_score(y_test, pred) # 정확도를 높이고 싶으면 불용어 제거해주기

0.8086124401913876