## 베이즈 텍스트 분석

다루는 내용
--

- 메일이 스팸메일인지 아닌지 예측

데이터
--
- https://bit.ly/2WC9nvn

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

In [2]:
!curl -L https://bit.ly/2WC9nvn -o data/spam.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100   141  100   141    0     0    142      0 --:--:-- --:--:-- --:--:--   143
100   141  100   141    0     0    142      0 --:--:-- --:--:-- --:--:--   142

  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0

  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0

  0     0    0     0    0     0      0      0 --:--:--  0:00:03 --:--:--     0
 35  488k   35  174k    0     0  46509      0  0:00:10  0:00:03  0:00:07  222k
100  488k  100  488k    0     0   119k      0  0:00:04  0:00:04 --:--:--  479k


데이터 읽기
--

In [4]:
sms = pd.read_csv('data/spam.csv')
sms.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
sms.shape

(5574, 2)

In [6]:
sms['type'].value_counts()

ham     4827
spam     747
Name: type, dtype: int64

In [7]:
sms_corpus = sms['text']

단어-벡터 생성
--

In [8]:
vectorizer = CountVectorizer(min_df=10) # 10번 이하로 발생한 단어 무시
X = vectorizer.fit_transform(sms_corpus)
X.shape

(5574, 1018)

In [15]:
type(X), X[:5]

(scipy.sparse.csr.csr_matrix,
 <5x1018 sparse matrix of type '<class 'numpy.int64'>'
 	with 47 stored elements in Compressed Sparse Row format>)

In [10]:
y = sms['type'] 
y[:5]

0     ham
1     ham
2    spam
3     ham
4     ham
Name: type, dtype: object

In [17]:
X[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
z = [[0,0,0,1,0],
     [0,0,0,1,1],
     [0,1,0,0,1],
     [0,0,0,0,1]]
z= np.asarray(z)
z.argmax(axis=1)
    

array([3, 3, 1, 4], dtype=int64)

In [12]:
X[:5].toarray().argmax(axis=1) #axis = 0 : 컬럼기반으로 큰 수 탐색, asis = 1 : 행 기반으로 큰 수 탐색

array([ 92, 462, 866, 731, 393], dtype=int64)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y, test_size=0.25)

베이즈 알고리즘 적용
--

In [23]:
clf = GaussianNB() 
clf.fit(X_train, y_train)
print("Accuracy: {}".format(clf.score(X_test, y_test).round(4)))

Accuracy: 0.8164


In [20]:
y_predict = clf.predict(X_test) 
print(metrics.classification_report(y_test, y_predict)) 

              precision    recall  f1-score   support

         ham       0.98      0.80      0.88      1201
        spam       0.42      0.92      0.58       193

   micro avg       0.82      0.82      0.82      1394
   macro avg       0.70      0.86      0.73      1394
weighted avg       0.91      0.82      0.84      1394



In [21]:
print(metrics.confusion_matrix(y_test, y_predict))

[[961 240]
 [ 16 177]]


In [24]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC 
from sklearn.linear_model import SGDClassifier, LogisticRegression

In [27]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf.score(X_test, y_test)



0.9705882352941176

In [30]:
svc = SVC()
svc.fit(X_train, y_train)
svc.score(X_test, y_test)



0.9096126255380201

In [31]:
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
sgd.score(X_test, y_test)



0.9777618364418939

In [33]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)



0.9791965566714491