In [30]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC as svc

In [23]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.model_selection import GridSearchCV

In [27]:
from sklearn.metrics import make_scorer, roc_auc_score
from scipy import stats

# Data 준비

deceptive review 와 truthful review 데이터를 읽어서 1-gram bag-of-words 훈련 세트와 테스트 세트 만들기 

- data 읽어오기

In [2]:
TRAIN_SPAM_REVIEW_PATH = "/Users/sanghyub/review_data/positive_polarity/deceptive_from_MTurk/all"
TRAIN_TRUE_REVIEW_PATH = "/USers/sanghyub/review_data/positive_polarity/truthful_from_TripAdvisor/all"

text_data = []
spam_count = 0
for root, dirs, files in os.walk(TRAIN_SPAM_REVIEW_PATH):
    for fname in files:
        full_fname = os.path.join(root, fname)
        with open(full_fname, 'r') as myfile:
            data=myfile.read().replace('\n', '')
            text_data.append(data)
        spam_count += 1
print(spam_count)

true_count = 0

for root, dirs, files in os.walk(TRAIN_TRUE_REVIEW_PATH):
    for fname in files:
        full_fname = os.path.join(root, fname)
        with open(full_fname, 'r') as myfile:
            data=myfile.read().replace('\n', '')
            text_data.append(data)
        true_count += 1
print(true_count)
label = np.array([1]*spam_count + [0] *true_count)
print(label.shape)

400
400
(800,)


- 1-gram bag-of-words 특성 vector로 변환하기

In [105]:
vectorizer = CountVectorizer(analyzer='word',ngram_range=(2, 2))
bag_of_words = vectorizer.fit(text_data,)
bag_of_words = vectorizer.transform(text_data)

In [98]:
sorted(vectorizer.vocabulary_.items(),reverse=True )

[('zoo and', 43750),
 ('zoo', 43749),
 ('zone and', 43748),
 ('zone', 43747),
 ('zipped up', 43746),
 ('zipped', 43745),
 ('zest wow', 43744),
 ('zest was', 43743),
 ('zest that', 43742),
 ('zest restaurant', 43741),
 ('zest is', 43740),
 ('zest hotel', 43739),
 ('zest but', 43738),
 ('zest and', 43737),
 ('zest', 43736),
 ('zagat rated', 43735),
 ('zagat', 43734),
 ('yummy ve', 43733),
 ('yummy rooms', 43732),
 ('yummy martinis', 43731),
 ('yummy it', 43730),
 ('yummy breakfast', 43729),
 ('yummy before', 43728),
 ('yummy', 43727),
 ('yummo overall', 43726),
 ('yummo', 43725),
 ('yum and', 43724),
 ('yum', 43723),
 ('yr olds', 43722),
 ('yr', 43721),
 ('youve entered', 43720),
 ('youve decided', 43719),
 ('youve', 43718),
 ('yourself to', 43717),
 ('yourself of', 43716),
 ('yourself buffet', 43715),
 ('yourself', 43714),
 ('yours will', 43713),
 ('yours romantic', 43712),
 ('yours romance', 43711),
 ('yours jamie', 43710),
 ('yours grady', 43709),
 ('yours for', 43708),
 ('yours betwe

In [99]:
print(vectorizer)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [106]:
print(bag_of_words.shape)

(800, 38203)


In [52]:
print(vectorizer.vocabulary_)

{'after recent': 845, 'recent week': 25732, 'week stay': 36005, 'stay at': 29177, 'at the': 3892, 'the affinia': 30612, 'affinia hotels': 780, 'hotels can': 15700, 'can definitely': 6442, 'definitely say': 8971, 'say will': 27091, 'will be': 36961, 'be coming': 4576, 'coming back': 7807, 'back they': 4254, 'they offer': 32184, 'offer so': 22428, 'so many': 28392, 'many in': 19543, 'in room': 16451, 'room amenities': 26603, 'amenities and': 1611, 'and services': 2560, 'services just': 27497, 'just very': 17802, 'very comfortable': 34640, 'comfortable and': 7703, 'and relaxed': 2500, 'relaxed place': 25963, 'place to': 24449, 'to be': 32772, 'be my': 4640, 'my most': 20878, 'most enjoyable': 20516, 'enjoyable experience': 10371, 'experience at': 11189, 'affinia hotel': 779, 'hotel was': 15670, 'was the': 35589, 'the amazing': 30625, 'amazing customization': 1536, 'customization they': 8630, 'they offered': 32185, 'offered would': 22456, 'would recommend': 37792, 'recommend affinia': 2579

In [53]:
bag_of_words.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [54]:
vectorizer.get_feature_names()

['00 am',
 '00 and',
 '00 bucks',
 '00 but',
 '00 compared',
 '00 fare',
 '00 for',
 '00 how',
 '00 night',
 '00 nightly',
 '00 of',
 '00 per',
 '00 rate',
 '00 with',
 '000 night',
 '000 plus',
 '00pm we',
 '04 05',
 '05 greeted',
 '06 04',
 '06 totally',
 '07 and',
 '08 we',
 '09 chechecked',
 '09 through',
 '09 we',
 '10 00',
 '10 000',
 '10 10',
 '10 120',
 '10 30',
 '10 45',
 '10 and',
 '10 day',
 '10 dollars',
 '10 for',
 '10 girls',
 '10 id',
 '10 in',
 '10 many',
 '10 mins',
 '10 minutes',
 '10 more',
 '10 star',
 '10 thru',
 '10 to',
 '10 walk',
 '10 we',
 '100 00',
 '100 and',
 '100 days',
 '100 including',
 '100 night',
 '100 per',
 '100 percent',
 '100 the',
 '100 through',
 '100 wireless',
 '100 yards',
 '105 on',
 '10pm due',
 '10th and',
 '10th floor',
 '10th wedding',
 '10yo daughter',
 '11 30',
 '11 great',
 '11 not',
 '11 too',
 '11 yr',
 '1112 king',
 '116 we',
 '11th floor',
 '12 14',
 '12 and',
 '120 room',
 '122 tax',
 '125 night',
 '129 00',
 '129 without',
 '12p

- 훈련 세트와 테스트 세트 만들기 

In [96]:
label

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [81]:
X_train, X_test, y_train, y_test = train_test_split(bag_of_words, label, test_size=0.2, random_state=42)

In [82]:
print(len(y_train), "train +", len(y_test), "test")

640 train + 160 test


# SVM을 이용한 훈련
#### Linear SVC, SVC for Grid Search CV

In [84]:
svm_clf = LinearSVC(C=1,loss="hinge",random_state=42)

In [12]:
print(svm_clf)

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=42, tol=0.0001, verbose=0)


In [85]:
svm_clf.fit(X_train,y_train)

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=42, tol=0.0001, verbose=0)

In [32]:
svc_clf = svc(probability = True, random_state = 1)

In [38]:
print(svc_clf)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=1, shrinking=True, tol=0.001,
  verbose=False)


## Grid Search CV

In [33]:
auc = make_scorer(roc_auc_score)

In [35]:
grid_list = {"C": np.arange(2, 10, 2),
             "gamma": np.arange(0.1, 1, 0.2)}
 
grid_search = GridSearchCV(svc_clf, param_grid = grid_list, n_jobs = 4, cv = 5, scoring = auc) 
grid_search.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=1, shrinking=True, tol=0.001,
  verbose=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'C': array([2, 4, 6, 8]), 'gamma': array([0.1, 0.3, 0.5, 0.7, 0.9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=0)

In [None]:
grid_list = {"C": np.arange(2, 10, 2),
             "gamma": np.arange(0.1, 1, 0.2)}
 
grid_search = GridSearchCV(svm_clf, param_grid = grid_list, n_jobs = 4, cv = 3, scoring = auc) 
grid_search.fit(X_train, y_train)

- 훈련 정확도 측정

In [59]:
# n_gram:2
score = cross_val_score(svm_clf, X_train, y_train, cv=5, verbose=3)
score.mean()

[CV]  ................................................................
[CV] ................................. , score=0.796875, total=   0.0s
[CV]  ................................................................
[CV] ................................ , score=0.8359375, total=   0.0s
[CV]  ................................................................
[CV] ................................ , score=0.8515625, total=   0.0s
[CV]  ................................................................
[CV] ................................ , score=0.8046875, total=   0.0s
[CV]  ................................................................
[CV] .................................... , score=0.875, total=   0.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


0.8328125

In [86]:
# n_gram:1
score = cross_val_score(svm_clf, X_train, y_train, cv=5, verbose=3)
score.mean()

[CV]  ................................................................
[CV] .................................... , score=0.875, total=   0.0s
[CV]  ................................................................
[CV] ................................ , score=0.8046875, total=   0.0s
[CV]  ................................................................
[CV] .................................... , score=0.875, total=   0.0s
[CV]  ................................................................
[CV] ................................. , score=0.859375, total=   0.0s
[CV]  ................................................................
[CV] .................................... , score=0.875, total=   0.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished


0.8578125

In [37]:
# grid_search cv, SVC
score = cross_val_score(svc_clf, X_train, y_train, cv=5, verbose=3)
score.mean()

[CV]  ................................................................
[CV] .................................. , score=0.65625, total=   1.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV] ................................. , score=0.671875, total=   1.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.5s remaining:    0.0s


[CV] ................................ , score=0.5546875, total=   1.2s
[CV]  ................................................................
[CV] .................................. , score=0.78125, total=   1.2s
[CV]  ................................................................
[CV] ................................. , score=0.609375, total=   1.2s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.1s finished


0.6546875

- 확인

In [87]:
some_data = X_train[0:5]
some_labels = y_train[0:5]

In [88]:
print("Predict:", list(svm_clf.predict(some_data)))

Predict: [1, 0, 1, 1, 1]


In [41]:
print("Predict:", list(grid_search.predict(some_data)))

Predict: [1, 0, 1, 1, 1]


In [62]:
print("Labels:", list(some_labels))

Labels: [1, 0, 1, 1, 1]


In [89]:
for i in range(400):
    a = svm_clf.predict(bag_of_words[i,:])
    b = label[i]
    #print(i)
    #print(a)
    #print(b)
    if a != b:
        print(a)
        print(b)
        

[0]
1
[0]
1
[0]
1
[0]
1
[0]
1
[0]
1


In [90]:
print(X_train[1,:])

  (0, 278)	1
  (0, 309)	1
  (0, 361)	4
  (0, 459)	2
  (0, 531)	1
  (0, 574)	1
  (0, 609)	1
  (0, 630)	1
  (0, 649)	1
  (0, 684)	1
  (0, 747)	1
  (0, 751)	1
  (0, 844)	2
  (0, 873)	3
  (0, 912)	1
  (0, 1041)	1
  (0, 1048)	1
  (0, 1514)	1
  (0, 1597)	1
  (0, 1837)	1
  (0, 1883)	1
  (0, 1889)	1
  (0, 1909)	1
  (0, 2126)	1
  (0, 2130)	1
  :	:
  (0, 4175)	2
  (0, 4178)	1
  (0, 4444)	1
  (0, 4560)	2
  (0, 4596)	1
  (0, 4647)	1
  (0, 4672)	1
  (0, 4678)	1
  (0, 4768)	1
  (0, 4814)	1
  (0, 4927)	12
  (0, 4933)	1
  (0, 4955)	2
  (0, 5006)	1
  (0, 5052)	1
  (0, 5083)	1
  (0, 5177)	1
  (0, 5276)	1
  (0, 5291)	1
  (0, 5357)	2
  (0, 5367)	1
  (0, 5376)	2
  (0, 5406)	2
  (0, 5413)	1
  (0, 5531)	2


# 테스트 세트로 시스템을 평가

In [91]:
y_pred = svm_clf.predict(X_test)

In [42]:
y_pred = grid_search.predict(X_test)

In [21]:
# n_gram:3
from sklearn.metrics import precision_score, recall_score
print("Precision: {:.2f}%".format(100*precision_score(y_test,y_pred)))
print("Recall: {:.2f}%".format(100*recall_score(y_test,y_pred)))

Precision: 77.65%
Recall: 82.50%


In [95]:
y_test

array([0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0])

In [66]:
# n_gram:2
from sklearn.metrics import precision_score, recall_score
print("Precision: {:.2f}%".format(100*precision_score(y_test,y_pred)))
print("Recall: {:.2f}%".format(100*recall_score(y_test,y_pred)))

Precision: 85.37%
Recall: 87.50%


In [92]:
# n_gram:1
from sklearn.metrics import precision_score, recall_score
print("Precision: {:.2f}%".format(100*precision_score(y_test,y_pred)))
print("Recall: {:.2f}%".format(100*recall_score(y_test,y_pred)))

Precision: 85.06%
Recall: 92.50%


In [43]:
from sklearn.metrics import precision_score, recall_score
print("Precision: {:.2f}%".format(100*precision_score(y_test,y_pred)))
print("Recall: {:.2f}%".format(100*recall_score(y_test,y_pred)))

Precision: 71.43%
Recall: 6.25%


# 1개의 단어로 구분하는게 왜 성능이 좋을까??