## 라이브러리 설치

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive

/content/drive/MyDrive


In [None]:
# pip install konlpy
!git clone https://github.com/lovit/customized_konlpy.git
!pip install customized_konlpy

In [2]:
import json
import re
import pickle
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report
import ckonlpy
from ckonlpy.tag import Twitter
from ckonlpy.tag import Postprocessor
from ckonlpy.utils import load_wordset

In [5]:
twitter = ckonlpy.tag.Twitter()

  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


In [6]:
with open('dictionary_v1.pkl', 'rb') as f:
    dict_toadd = pickle.load(f)
for i in dict_toadd:
    twitter.add_dictionary(i,'Noun')

In [19]:
# passtags = {'Noun', 'Verb', 'Adjective', 'Adverb'}
# tokenizer = Postprocessor(base_tagger=twitter, passtags=passtags) # 품사 필터링 ON
tokenizer = Postprocessor(base_tagger=twitter) # 품사 필터링 OFF

#### 글로우픽 데이터 불러오기

In [7]:
train_num = 20000

In [8]:
with open('train_prepro.json', 'r', encoding='utf-8') as f:
    total_train = json.load(f, strict=False)
total_train = pd.DataFrame(total_train)
total_train = total_train.sample(frac=1).reset_index(drop=True)
total_train = total_train[:train_num]
total_train['label'] = total_train['label'].astype(float)
total_train['label'] = total_train['label'].astype(int)
print(total_train['label'].value_counts())

1    20000
Name: label, dtype: int64


#### 다른 플랫폼 데이터 불러오기 (인스타그램, 유튜브, 파우더룸, 블로그)

In [9]:
with open('test_prepro.json', 'r', encoding='utf-8') as f:
    total_test = json.load(f, strict=False)
total_test = pd.DataFrame(total_test)
total_test = total_test.sample(frac=1).reset_index(drop=True)
total_test['label'] = total_test['label'].astype(float)
total_test['label'] = total_test['label'].astype(int)
print(total_test['label'].value_counts())

 1    3945
-1    3152
Name: label, dtype: int64


- 테스트 셋의 일부 데이터를 훈련셋에 포함시킴.
- 훈련셋에 포함시킬 테스트셋의 비율은 frac 값을 조절해 변경할 수 있음.
- frac < 1일 경우 테스트셋으로 모델의 성능을 평가할 수 있음.
- frac=1 일 경우 모든 테스트셋의 1 레이블 데이터를 모델에 넣음. (최종 모델 구축 시)

In [10]:
total_test_pos = total_test[total_test['label'] == 1.]
total_test_neg = total_test[total_test['label'] == -1.]
test_pos_sampled = total_test_pos.sample(frac=1)
train_pos_sampled = total_train.sample(len(test_pos_sampled))
train_pos_remain = total_train.drop(train_pos_sampled.index)
test_pos_remain = total_test_pos.drop(test_pos_sampled.index)

In [11]:
total_train  = pd.concat([total_train, test_pos_sampled], axis=0).reset_index(drop=True)
total_test  = pd.concat([test_pos_remain, total_test_neg], axis=0).reset_index(drop=True) 

In [12]:
print(total_train['label'].value_counts())
print(total_test['label'].value_counts())

1    23945
Name: label, dtype: int64
-1    3152
Name: label, dtype: int64


In [13]:
train_text = total_train['text'].tolist()
train_labels = total_train['label'].tolist()
test_text = total_test['text'].tolist()
test_labels = total_test['label'].tolist()

In [14]:
X = total_train['text']
y = total_train['label']

In [8]:
stopwords=['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로',
           '자','에','와','한','하다','그리고','넘','네','을','랑','예요','오','여서','이에요','데',
           '에게','에서','라서','이라서','에요','와','만','나','로','이랑','내','엔','아','부터','수',
           '때','거','다','이다','이나','에도','것','고','게','인데','제','까지','에는','엔','이라'
          '약간','오늘','점','없이','자꾸','알','있','다니','또','어','딱','걍','더','중','니','저','면','듯']

In [10]:
# with open('stopwords.pkl', 'wb') as f:
#     pickle.dump(stopwords, f)

In [16]:
def tokenize_text(sample) :
    tokenized_corpus = []
    for sentence in sample:
        tmp = [i[0] for i in  tokenizer.pos(sentence)  if not i[0] in stopwords] 
        tokenized_corpus .append(' '.join(tmp))
    return tokenized_corpus

In [20]:
train_corp =tokenize_text(train_text)
test_corp = tokenize_text(test_text)

In [21]:
drop_train = [idx for idx, sentence in enumerate(train_corp) if len(sentence) < 1]
X_train = np.delete(train_corp, drop_train, axis=0)
y_train = np.delete(train_labels, drop_train, axis=0)
print(len(X_train))
print(len(y_train))

23933
23933


In [22]:
drop_test = [idx for idx, sentence in enumerate(test_corp) if len(sentence) < 1]
X_test = np.delete(test_corp, drop_test, axis=0)
y_test = np.delete(test_labels, drop_test, axis=0)
print(len(X_test))
print(len(y_test))

3151
3151


In [23]:
min_count = 2
ngram_range = (1, 1)

In [24]:
vectorizer = TfidfVectorizer(min_df=min_count,  ngram_range=ngram_range)
emb = vectorizer.fit_transform(X_train)
vocab = vectorizer.vocabulary_

In [28]:
with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

In [29]:
def get_test_emb_with_vocabfile(corpus, min_count, ngram_range, vocab):
  new_vectorizer = TfidfVectorizer(min_df=min_count, ngram_range=ngram_range, vocabulary=vocab)
  emb = new_vectorizer.fit_transform(corpus)
  return emb

In [30]:
train_matrix = emb
test_matrix = get_test_emb_with_vocabfile(X_test, min_count, ngram_range, vocab)

In [31]:
print(train_matrix.shape)
print(test_matrix.shape)

(23933, 15578)
(3151, 15578)


In [32]:
col_names = vectorizer.get_feature_names()
emb_df = pd.DataFrame(test_matrix.toarray(), columns=col_names); emb_df.iloc[:5, :10]



Unnamed: 0,가가,가게,가격,가격값,가계,가고,가구,가금,가급,가기
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### One-class SVM 학습 및 저장 

In [40]:
kernel = 'linear'
gamma = 'scale'
nu = 0.2

In [41]:
clf = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu)

In [42]:
clf.fit(train_matrix, y_train)
y_pred_train = clf.predict(train_matrix)

In [43]:
joblib.dump(clf, 'ocsvm_trained_0.2.joblib')

['ocsvm_trained_0.2.joblib']

In [44]:
clf_loaded = joblib.load('ocsvm_trained_0.2.joblib')

In [38]:
result = clf_loaded.predict(test_matrix)

#### 테스트셋으로 모델을  검증할 경우(테스트셋 없는 경우 X)

In [None]:
print('Data shape: ')
print(train_matrix.shape)
print(test_matrix.shape)
print('Settings: ')
print('kernel: ', kernel)
print('min_count: ', min_count)
print('ngram_range: ', ngram_range)
print('gamma: ', gamma)
print('nu: ', nu)
y_pred_test = clf.predict(test_matrix)
# y_pred_test = clf_loaded.predict(test_matrix)
results = confusion_matrix(y_test, y_pred_test)
print('Confusion Matrix :')
print(results) 
print('Accuracy Score for train: {:.2f} %'.format(accuracy_score(y_train, y_pred_train)*100)) 
print('Accuracy Score for test: {:.2f} %'.format(accuracy_score(y_test, y_pred_test)*100)) 
report = classification_report(y_test, y_pred_test, output_dict=True)
pd.DataFrame(report).transpose()

Data shape: 
(21958, 10898)
(5121, 10898)
Settings: 
kernel:  linear
min_count:  3
ngram_range:  (1, 1)
gamma:  scale
nu:  0.25
Confusion Matrix :
[[2685  463]
 [1082  891]]
Accuracy Score for train: 74.95 %
Accuracy Score for test: 69.83 %


Unnamed: 0,precision,recall,f1-score,support
-1,0.712769,0.852922,0.776573,3148.0
1,0.65805,0.451597,0.535618,1973.0
accuracy,0.698301,0.698301,0.698301,0.698301
macro avg,0.68541,0.65226,0.656095,5121.0
weighted avg,0.691687,0.698301,0.683738,5121.0
