In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score


# Data 준비

deceptive review 와 truthful review 데이터를 읽어서 1-gram bag-of-words 훈련 세트와 테스트 세트 만들기 

- data 읽어오기

In [6]:
TRAIN_SPAM_REVIEW_PATH = "op_spam_v1.4/positive_polarity/deceptive_from_MTurk/all"
TRAIN_TRUE_REVIEW_PATH = "op_spam_v1.4/positive_polarity/truthful_from_TripAdvisor/all"

text_data = []
spam_count = 0
for root, dirs, files in os.walk(TRAIN_SPAM_REVIEW_PATH):
    for fname in files:
        full_fname = os.path.join(root, fname)
        with open(full_fname, 'r') as myfile:
            data=myfile.read().replace('\n', '')
            text_data.append(data)
        spam_count += 1
print(spam_count)

true_count = 0

for root, dirs, files in os.walk(TRAIN_TRUE_REVIEW_PATH):
    for fname in files:
        full_fname = os.path.join(root, fname)
        with open(full_fname, 'r') as myfile:
            data=myfile.read().replace('\n', '')
            text_data.append(data)
        true_count += 1
print(true_count)
label = np.array([1]*spam_count + [0] *true_count)
print(label.shape)

400
400
(800,)


- 1-gram bag-of-words 특성 vector로 변환하기

In [100]:
vectorizer = CountVectorizer(analyzer='word',ngram_range=(3, 3))
bag_of_words = vectorizer.fit(text_data,)
bag_of_words = vectorizer.transform(text_data)

In [101]:
print(vectorizer)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(3, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [102]:
print(bag_of_words.shape)

(800, 68293)


In [66]:
print(vectorizer.vocabulary_)

{'after': 1079, 'recent': 29678, 'week': 41378, 'stay': 33845, 'at': 4237, 'the': 35485, 'affinia': 1043, 'hotels': 18177, 'can': 7302, 'definitely': 10388, 'say': 31304, 'will': 42400, 'be': 5158, 'coming': 8935, 'back': 4768, 'they': 37062, 'offer': 25814, 'so': 32850, 'many': 22564, 'in': 18776, 'room': 30764, 'amenities': 1953, 'and': 2152, 'services': 31828, 'just': 20425, 'very': 39907, 'comfortable': 8821, 'relaxed': 29984, 'place': 28096, 'to': 37739, 'my': 23990, 'most': 23727, 'enjoyable': 12131, 'experience': 13073, 'hotel': 17882, 'was': 40496, 'amazing': 1857, 'customization': 10013, 'offered': 25845, 'would': 43254, 'recommend': 29762, 'anyone': 3335, 'looking': 21964, 'for': 14313, 'nice': 24622, 'after recent': 1124, 'recent week': 29685, 'week stay': 41394, 'stay at': 33855, 'at the': 4381, 'the affinia': 35540, 'affinia hotels': 1052, 'hotels can': 18186, 'can definitely': 7316, 'definitely say': 10424, 'say will': 31336, 'will be': 42407, 'be coming': 5182, 'coming b

In [67]:
bag_of_words.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [81]:
vectorizer.get_feature_names()

['00',
 '00 am',
 '00 and',
 '00 bucks',
 '00 but',
 '00 compared',
 '00 fare',
 '00 for',
 '00 how',
 '00 night',
 '00 nightly',
 '00 of',
 '00 per',
 '00 rate',
 '00 with',
 '000',
 '000 night',
 '000 plus',
 '00pm',
 '00pm we',
 '04',
 '04 05',
 '05',
 '05 greeted',
 '06',
 '06 04',
 '06 totally',
 '07',
 '07 and',
 '08',
 '08 we',
 '09',
 '09 chechecked',
 '09 through',
 '09 we',
 '10',
 '10 00',
 '10 000',
 '10 10',
 '10 120',
 '10 30',
 '10 45',
 '10 and',
 '10 day',
 '10 dollars',
 '10 for',
 '10 girls',
 '10 id',
 '10 in',
 '10 many',
 '10 mins',
 '10 minutes',
 '10 more',
 '10 star',
 '10 thru',
 '10 to',
 '10 walk',
 '10 we',
 '100',
 '100 00',
 '100 and',
 '100 days',
 '100 including',
 '100 night',
 '100 per',
 '100 percent',
 '100 the',
 '100 through',
 '100 wireless',
 '100 yards',
 '105',
 '105 on',
 '10pm',
 '10pm due',
 '10th',
 '10th and',
 '10th floor',
 '10th wedding',
 '10yo',
 '10yo daughter',
 '11',
 '11 30',
 '11 great',
 '11 not',
 '11 too',
 '11 yr',
 '1112',


- 훈련 세트와 테스트 세트 만들기 

In [108]:
X_train, X_test, y_train, y_test = train_test_split(bag_of_words, label, test_size=0.2, random_state=42)

In [109]:
print(len(y_train), "train +", len(y_test), "test")

640 train + 160 test


# SVM을 이용한 훈련

In [110]:
svm_clf = LinearSVC(C=1,loss="hinge",random_state=42)

In [111]:
print(svm_clf)

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=42, tol=0.0001, verbose=0)


In [112]:
svm_clf.fit(X_train,y_train)

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=42, tol=0.0001, verbose=0)

- 훈련 정확도 측정

In [113]:
score = cross_val_score(svm_clf, X_train, y_train, cv=5, verbose=3)
score.mean()

[CV]  ................................................................
[CV] ................................ , score=0.6953125, total=   0.0s
[CV]  ................................................................
[CV] ................................. , score=0.796875, total=   0.0s
[CV]  ................................................................
[CV] ................................ , score=0.7734375, total=   0.0s
[CV]  ................................................................
[CV] .................................. , score=0.78125, total=   0.0s
[CV]  ................................................................
[CV] ................................ , score=0.7265625, total=   0.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


0.7546875

- 확인

In [114]:
some_data = X_train[0:5]
some_labels = y_train[0:5]

In [115]:
print("Predict:", list(svm_clf.predict(some_data)))

Predict: [1, 0, 1, 1, 1]


In [116]:
print("Labels:", list(some_labels))

Labels: [1, 0, 1, 1, 1]


In [117]:
for i in range(400):
    a = svm_clf.predict(bag_of_words[i,:])
    b = label[i]
    #print(i)
    #print(a)
    #print(b)
    if a != b:
        print(a)
        print(b)
        

[0]
1
[0]
1
[0]
1
[0]
1
[0]
1
[0]
1
[0]
1
[0]
1
[0]
1
[0]
1
[0]
1
[0]
1
[0]
1
[0]
1


In [88]:
print(X_train[1,:])

  (0, 1079)	1
  (0, 1123)	1
  (0, 1328)	1
  (0, 1421)	1
  (0, 2152)	4
  (0, 2558)	1
  (0, 2614)	1
  (0, 2812)	1
  (0, 3027)	1
  (0, 3977)	2
  (0, 3981)	1
  (0, 3995)	1
  (0, 4675)	1
  (0, 4677)	1
  (0, 4902)	1
  (0, 4938)	1
  (0, 5324)	1
  (0, 5329)	1
  (0, 5608)	1
  (0, 5639)	1
  (0, 5828)	1
  (0, 5835)	1
  (0, 6078)	1
  (0, 6110)	1
  (0, 6418)	1
  :	:
  (0, 38535)	1
  (0, 38562)	1
  (0, 38729)	1
  (0, 38731)	1
  (0, 39269)	1
  (0, 39273)	1
  (0, 39854)	1
  (0, 39875)	1
  (0, 39907)	1
  (0, 39966)	1
  (0, 40496)	2
  (0, 40788)	1
  (0, 40985)	1
  (0, 41064)	1
  (0, 41066)	1
  (0, 41137)	2
  (0, 41154)	1
  (0, 41207)	1
  (0, 41633)	2
  (0, 41732)	1
  (0, 41758)	1
  (0, 41919)	1
  (0, 41979)	1
  (0, 43477)	2
  (0, 43502)	2


# 테스트 세트로 시스템을 평가

In [78]:
y_pred = svm_clf.predict(X_test)

In [79]:
from sklearn.metrics import precision_score, recall_score
print("Precision: {:.2f}%".format(100*precision_score(y_test,y_pred)))
print("Recall: {:.2f}%".format(100*recall_score(y_test,y_pred)))

Precision: 89.16%
Recall: 92.50%
