## libraries

In [88]:
import pandas as pd
import numpy as np
import pickle
import re

from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score

## data

In [55]:
# Load the train and test splits from CSV. The absolute paths look like a
# Google Drive mount inside Colab — presumably the drive has to be mounted
# before this cell runs; confirm against the runtime setup.
# NOTE(review): the previews below show an 'Unnamed: 0' column, which usually
# means the CSVs were saved together with their index. Passing index_col=0
# would avoid carrying it around — confirm nothing downstream relies on it.
tr = pd.read_csv('/content/drive/MyDrive/2022 ADV/data/tr.csv')
te = pd.read_csv('/content/drive/MyDrive/2022 ADV/data/te.csv')

- train

In [56]:
# Preview the first three training rows to check which columns were loaded.
tr.head(3)

Unnamed: 0,tokenize_txt,label,tokenized_del_stopwords
0,"['네', '쇼핑', '입니다', '무엇', '을', '도와', '드릴까요', '지...",0,"['쇼핑', '입니다', '무엇', '도와', '드릴까요', '지금', '있', '..."
1,"['롯데', '월드', '를', '가려', '하', '는데요', '방법', '을',...",0,"['롯데', '월드', '가려', '는데요', '방법', '알려', '주', '세요..."
2,"['입니다', '제', '가', '얼마', '전', '에', '에서', '셔츠', ...",0,"['입니다', '제', '얼마', '전', '에서', '셔츠', '주문', '했',..."


- test

In [57]:
# Preview the first three test rows to check which columns were loaded.
te.head(3)

Unnamed: 0,tokenize_txt,label,tokenized_del_stopwords
0,"['네', '쇼핑', '입니다', '네', '여보세요', '네', '고객', '님'...",0,"['쇼핑', '입니다', '여보세요', '고객', '님', '무엇', '도와드릴까요..."
1,"['은행', '입니다', '무엇', '을', '도와', '드릴까요', '고객', '...",0,"['은행', '입니다', '무엇', '도와', '드릴까요', '고객', '님', '..."
2,"['쇼핑', '입니다', '무엇', '을', '도와드릴까요', '지금', '청바지'...",0,"['쇼핑', '입니다', '무엇', '도와드릴까요', '지금', '청바지', '나오..."


In [58]:
# Matches every run of characters that is neither an ASCII space nor Hangul
# (compatibility jamo ㄱ-ㅣ, which already contains the vowels ㅏ-ㅣ, or
# precomposed syllables 가-힣). Compiled once at definition time so that
# applying text_clearing to a whole column does not repeat the pattern lookup
# for every row.
_NON_HANGUL = re.compile('[^ ㄱ-ㅣ가-힣]+')


def text_clearing(text):
    """Remove every non-Hangul character from ``text`` and split it into tokens.

    Only the ASCII space survives as a separator: any other character that is
    not Hangul (digits, Latin letters, punctuation, tabs, newlines) is deleted
    outright, so tokens separated only by a tab or newline end up merged.

    Args:
        text: The string to clean. ``None`` and float NaN (what
            ``pandas.read_csv`` produces for an empty cell) are treated as
            missing text.

    Returns:
        list[str]: The whitespace-separated Hangul tokens; an empty list when
        ``text`` is missing or contains no Hangul.

    Raises:
        TypeError: If ``text`` is any other non-string object (for example a
            list that was already tokenised). This is kept as an error on
            purpose so that re-running the cleaning step on processed data
            fails loudly instead of silently returning empty lists.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Delete the non-Hangul runs, then split on the remaining spaces.
    return _NON_HANGUL.sub('', text).split()

In [59]:
# Replace the raw 'tokenize_txt' strings with the token lists produced by
# text_clearing. The function is passed directly; wrapping it in a lambda
# adds nothing.
# NOTE: this overwrites the column in place, so the cell is meant to run once
# per freshly loaded frame.
tr['tokenize_txt'] = tr['tokenize_txt'].apply(text_clearing)
te['tokenize_txt'] = te['tokenize_txt'].apply(text_clearing)

In [60]:
# Stopword tokens. del_stop_words below does not read this variable; it
# carries its own copy of the same words.
# NOTE(review): "듯" appears twice in this list.
stopwords = [
    "도", "는", "다", "의", "가", "이", "은", "한",
    "에", "하", "고", "을", "를", "인", "듯", "과",
    "와", "네", "들", "듯", "지", "임", "게",
]

In [61]:
# Default stopwords removed by del_stop_words. Stored as a frozenset so that
# it is built once (not on every call) and membership tests take constant time.
_DEFAULT_STOPWORDS = frozenset([
    "도", "는", "다", "의", "가", "이", "은", "한", "에", "하", "고",
    "을", "를", "인", "듯", "과", "와", "네", "들", "지", "임", "게",
])


def del_stop_words(text, stopwords=None):
    """Return the tokens of ``text`` that are not stopwords, in their original order.

    Args:
        text: Sequence of tokens to filter (a list of strings in this notebook).
            Tokens must be hashable.
        stopwords: Optional iterable of tokens to remove. When omitted, the
            default stopword set defined with this function is used, which
            matches the previous hard-coded list.

    Returns:
        list: A new list with every stopword occurrence removed; ``text``
        itself is not modified.
    """
    stop_set = _DEFAULT_STOPWORDS if stopwords is None else frozenset(stopwords)

    # Iterate over the tokens directly instead of indexing by position.
    return [token for token in text if token not in stop_set]

In [62]:
# Quick sanity check on a tiny example: '에' is in the stopword list, so only
# the other two tokens should come back.
del_stop_words(['집', '에', '갈래'])

['집', '갈래']

In [63]:
# Remove stopwords from every token list. This overwrites any
# 'tokenized_del_stopwords' column that was already present in the loaded CSVs.
tr['tokenized_del_stopwords'] = tr['tokenize_txt'].apply(del_stop_words)
te['tokenized_del_stopwords'] = te['tokenize_txt'].apply(del_stop_words)

In [64]:
# Join each filtered token list back into one space-separated string; this is
# the column that is later passed to the vectorizers' transform().
tr['tokenized_del_stopwords_sent'] = tr['tokenized_del_stopwords'].apply(' '.join)
te['tokenized_del_stopwords_sent'] = te['tokenized_del_stopwords'].apply(' '.join)

In [65]:
# Drop the intermediate 'tokenize_txt' column now that the stopword-filtered
# columns have been derived from it.
# Reassigning the result (rather than inplace=True) together with
# errors='ignore' keeps this cell re-runnable: a second execution is a no-op
# instead of raising KeyError because the column is already gone.
tr = tr.drop(columns=['tokenize_txt'], errors='ignore')
te = te.drop(columns=['tokenize_txt'], errors='ignore')
te.head(3)

Unnamed: 0,label,tokenized_del_stopwords,tokenized_del_stopwords_sent
0,0,"[쇼핑, 입니다, 여보세요, 고객, 님, 무엇, 도와드릴까요, 쇼핑, 에서, 신발,...",쇼핑 입니다 여보세요 고객 님 무엇 도와드릴까요 쇼핑 에서 신발 주문 할라니까 통합...
1,0,"[은행, 입니다, 무엇, 도와, 드릴까요, 고객, 님, 제, 수신, 거래, 골드, ...",은행 입니다 무엇 도와 드릴까요 고객 님 제 수신 거래 골드 플러스 고객 인데 인터...
2,0,"[쇼핑, 입니다, 무엇, 도와드릴까요, 지금, 청바지, 나오, 거, 하나, 주문, ...",쇼핑 입니다 무엇 도와드릴까요 지금 청바지 나오 거 하나 주문 려고 는데요 고객 님...


In [81]:
# Persist the preprocessed frames next to the input files.
# index=False omits the DataFrame index, but 'Unnamed: 0' (visible in the
# preview above) is an ordinary column and is still written.
# CSV stores the 'tokenized_del_stopwords' lists as text, so they have to be
# parsed again after reloading — the same situation text_clearing dealt with
# for the input files.
tr.to_csv('/content/drive/MyDrive/2022 ADV/data/tr_fin.csv', index=False)
te.to_csv('/content/drive/MyDrive/2022 ADV/data/te_fin.csv', index=False)

## encoder

- load encoder

In [70]:
# Load the pickled count vectorizer; it is only used through .transform()
# below, so it is presumably an already-fitted scikit-learn CountVectorizer —
# confirm with whoever produced the file.
# SECURITY: pickle.load can execute arbitrary code contained in the file, so
# only load pickles from a trusted source.
with open('/content/drive/MyDrive/2022 ADV/data/countvec.pkl', 'rb') as f:
    countvec = pickle.load(f)

In [69]:
# Load the pickled TF-IDF vectorizer; it is only used through .transform()
# below, so it is presumably an already-fitted scikit-learn TfidfVectorizer —
# confirm with whoever produced the file.
# SECURITY: pickle.load can execute arbitrary code contained in the file, so
# only load pickles from a trusted source.
with open('/content/drive/MyDrive/2022 ADV/data/tfidfvec.pkl', 'rb') as f:
    tfidfvec = pickle.load(f)

- transform via cnt_vec

In [82]:
# Vectorise the joined sentences of both splits with the loaded count
# vectorizer (transform only — the vectorizer is not refitted here).
X_tr_cnt, X_te_cnt = (
    countvec.transform(frame['tokenized_del_stopwords_sent'])
    for frame in (tr, te)
)

In [83]:
# Targets for the count-vector experiment: the 'label' column of each split.
y_tr_cnt, y_te_cnt = tr['label'], te['label']

- transform via tfidf_vec

In [84]:
# Vectorise the joined sentences of both splits with the loaded TF-IDF
# vectorizer (transform only — the vectorizer is not refitted here).
X_tr_tf, X_te_tf = (
    tfidfvec.transform(frame['tokenized_del_stopwords_sent'])
    for frame in (tr, te)
)

In [85]:
# Targets for the TF-IDF experiment: the same 'label' columns that the
# count-vector experiment uses.
y_tr_tf, y_te_tf = tr['label'], te['label']

## linearSVM (countvec)

- train model

In [104]:
# Fit a linear SVM on the count-vector features.
# random_state is fixed because LinearSVC's solver uses a pseudo-random
# number generator; without a seed the metrics reported below can change
# slightly from run to run.
model = LinearSVC(random_state=42)
model.fit(X_tr_cnt, y_tr_cnt)



LinearSVC()

- predict

In [105]:
# Predict test labels from the count-vector features.
# NOTE: `model` and `y_pred` are reused by the TF-IDF section further down,
# so this cell must run right after the count-vector fit above.
y_pred = model.predict(X_te_cnt)

In [106]:
# Score the count-vector model on the test split and report both metrics.
# NOTE(review): in the recorded output accuracy is far closer to 1 than F1,
# which suggests the positive class is rare — confirm the class balance
# before relying on accuracy.
acc, f1 = accuracy_score(y_te_cnt, y_pred), f1_score(y_te_cnt, y_pred)
print(
    f'count vec accuracy : {acc * 100} %',
    f'count vec f1 score : {f1}',
    sep='\n',
)

count vec accuracy : 99.92836034745231 %
count vec f1 score : 0.9574468085106383


## linearSVM (tfidfvec)

In [107]:
# Fit a linear SVM on the TF-IDF features (this rebinds `model`, replacing
# the count-vector model).
# random_state is fixed because LinearSVC's solver uses a pseudo-random
# number generator; without a seed the metrics reported below can change
# slightly from run to run.
model = LinearSVC(random_state=42)
model.fit(X_tr_tf, y_tr_tf)

LinearSVC()

- predict

In [108]:
# Predict test labels from the TF-IDF features.
# NOTE: this overwrites the `y_pred` of the count-vector section, and `model`
# must be the LinearSVC fitted on X_tr_tf just above.
y_pred = model.predict(X_te_tf)

In [109]:
# Score the TF-IDF model on the test split and report both metrics.
# NOTE(review): in the recorded output accuracy is far closer to 1 than F1,
# which suggests the positive class is rare — confirm the class balance
# before relying on accuracy.
acc, f1 = accuracy_score(y_te_tf, y_pred), f1_score(y_te_tf, y_pred)
print(
    f'tfidf vec accuracy : {acc * 100} %',
    f'tfidf vec f1 score : {f1}',
    sep='\n',
)

tfidf vec accuracy : 99.95522521715769 %
tfidf vec f1 score : 0.9732620320855615
