## 5.1 20 뉴스그룹 데이터 준비 및 특성 추출
### 5.1.1 데이터셋 확인 및 분리

In [2]:
from sklearn.datasets import fetch_20newsgroups

# 20개의 토픽 중 선택하고자 하는 토픽을 리스트로 생성
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# 학습 데이터셋을 가져옴
newsgroups_train = fetch_20newsgroups(subset='train',
                                    remove=('headers', 'footers', 'quotes'),
                                    categories=categories)

# 평가 데이터셋을 가져옴
newsgroups_test = fetch_20newsgroups(subset='test',
                                    remove=('headers', 'footers', 'quotes'),
                                    categories=categories)

print('#Train set size : {}'.format(len(newsgroups_train.data)))
print('#Test set size : {}'.format(len(newsgroups_test.data)))
print('#Selected categories : {}'.format(newsgroups_train.target_names))
print('Train labels : {}'.format(set(newsgroups_train.target)))

#Train set size : 2034
#Test set size : 1353
#Selected categories : ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
Train labels : {0, 1, 2, 3}


In [3]:
print('#Train set text samples :', newsgroups_train.data[0])
print('#Train set label samples :', newsgroups_train.target[0])
print('#Test set text samples :', newsgroups_test.data[0])
print('#Test set label samples :', newsgroups_test.target[0])

#Train set text samples : Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych
#Train set label samples : 1
#Test set text samples : TRry the SKywatch project in  Arizona.
#Test set label samples : 2


### 5.1.2 카운트 기반 특성 추출

In [4]:
X_train = newsgroups_train.data
y_train = newsgroups_train.target

X_test = newsgroups_test.data
y_test = newsgroups_test.target

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2000, 
                    min_df=5,  # 최소 이 개수만큼의 문서에 나타나야 한다.
                    max_df=0.5)  # 문서의 50%를 초과해 나타나는 단어들은 제외

X_train_cv = cv.fit_transform(X_train) # train set을 변환
print('Train set dimension:', X_train_cv.shape) 

X_test_cv = cv.transform(X_test) # test set을 변환
print('Test set dimension:', X_test_cv.shape)

Train set dimension: (2034, 2000)
Test set dimension: (1353, 2000)


In [5]:
for word, count in zip(cv.get_feature_names_out()[:100], X_train_cv[0].toarray()[0, :100]):
    print(word, ':', count, end=', ')

00 : 0, 000 : 0, 01 : 0, 04 : 0, 05 : 0, 10 : 0, 100 : 0, 1000 : 0, 11 : 0, 12 : 0, 128 : 0, 129 : 0, 13 : 0, 130 : 0, 14 : 0, 15 : 0, 16 : 0, 17 : 0, 18 : 0, 19 : 0, 1987 : 0, 1988 : 0, 1989 : 0, 1990 : 0, 1991 : 0, 1992 : 0, 1993 : 0, 20 : 0, 200 : 0, 202 : 0, 21 : 0, 22 : 0, 23 : 0, 24 : 0, 25 : 0, 256 : 0, 26 : 0, 27 : 0, 28 : 0, 2d : 0, 30 : 0, 300 : 0, 31 : 0, 32 : 0, 33 : 0, 34 : 0, 35 : 0, 39 : 0, 3d : 0, 40 : 0, 400 : 0, 42 : 0, 45 : 0, 50 : 0, 500 : 0, 60 : 0, 600 : 0, 65 : 0, 70 : 0, 75 : 0, 80 : 0, 800 : 0, 90 : 0, 900 : 0, 91 : 0, 92 : 0, 93 : 0, 95 : 0, _the : 0, ability : 0, able : 1, abortion : 0, about : 1, above : 0, absolute : 0, absolutely : 0, ac : 0, accept : 0, acceptable : 0, accepted : 0, access : 0, according : 0, account : 0, accurate : 0, across : 0, act : 0, action : 0, actions : 0, active : 0, activities : 0, activity : 0, acts : 0, actual : 0, actually : 0, ad : 0, add : 0, added : 0, addition : 0, additional : 0, address : 0, 

## 5.2 머신러닝과 문서 분류 과정에 대한 이해
1. 데이터 정제
2. 데이터 분리
3. 머신러닝 학습
4. 평가
5. 최종모형 도출
6. 예측

## 5.3 나이브베이즈 분류기를 이용한 문서 분류

In [7]:
from sklearn.naive_bayes import MultinomialNB

# 분류기 선언
NB_clf = MultinomialNB()

# train set을 이용해 분류기 학습
NB_clf.fit(X_train_cv, y_train)

# train set에 대한 예측 정확도를 확인
print('Train set score : {:.3f}'.format(NB_clf.score(X_train_cv, y_train)))

# test set에 대한 예측 정확도를 확인
print('Test set score : {:.3f}'.format(NB_clf.score(X_test_cv, y_test)))

Train set score : 0.824
Test set score : 0.732


In [9]:
print('#First document and label in test data:', X_test[0], y_test[0])
print('#Second document and label in test data:', X_test[1], y_test[1])

pred = NB_clf.predict(X_test_cv[:2])
print('#Predicted labels:', pred)
print('#Predicted categories:', newsgroups_train.target_names[pred[0]], newsgroups_train.target_names[pred[1]])

#First document and label in test data: TRry the SKywatch project in  Arizona. 2
#Second document and label in test data: The Vatican library recently made a tour of the US.
 Can anyone help me in finding a FTP site where this collection is 
 available. 1
#Predicted labels: [2 1]
#Predicted categories: sci.space comp.graphics


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

#CountVectorizer와 동일한 인수를 사용
tfidf = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.5) 
X_train_tfidf = tfidf.fit_transform(X_train) # train set을 변환
X_test_tfidf = tfidf.transform(X_test) # test set을 변환

NB_clf.fit(X_train_tfidf, y_train) #tfidf train set을 이용하여 분류기(classifier)를 새로 학습
print('Train set score: {:.3f}'.format(NB_clf.score(X_train_tfidf, y_train))) #train set에 대한 예측정확도를 확인
print('Test set score: {:.3f}'.format(NB_clf.score(X_test_tfidf, y_test))) #test set에 대한 예측정확도를 확인

Train set score: 0.862
Test set score: 0.741


In [11]:
print('#First document and label in test data:', X_test[0], y_test[0])
print('#Second document and label in test data:', X_test[1], y_test[1])

pred2 = NB_clf.predict(X_test_tfidf[:2])
print('#Predicted labels:', pred2)
print('#Predicted categories:', newsgroups_train.target_names[pred2[0]], newsgroups_train.target_names[pred2[1]])

#First document and label in test data: TRry the SKywatch project in  Arizona. 2
#Second document and label in test data: The Vatican library recently made a tour of the US.
 Can anyone help me in finding a FTP site where this collection is 
 available. 1
#Predicted labels: [2 1]
#Predicted categories: sci.space comp.graphics


In [15]:
# 카테고리별로 계수가 큰 10개 특성들을 추출
import numpy as np

def top10_features(classifier, vectorizer, categories):
    feature_names = np.asarray(vectorizer.get_feature_names_out())
    for i, category in enumerate(categories):
        # 역순으로 정렬하기 위해 계수에 음수를 취해서 정렬 후 앞에서부터 10개의 값을 반환
        top10 = np.argsort(-classifier.feature_log_prob_[i])[:10]  # coef_ 사라짐
        # 카테고리와 영향이 큰 특성 10개를 출력
        print("%s: %s" % (category, ", ".join(feature_names[top10])))

top10_features(NB_clf, tfidf, newsgroups_train.target_names)

alt.atheism: you, not, are, be, this, have, as, what, they, if
comp.graphics: you, on, graphics, this, have, any, can, or, with, thanks
sci.space: space, on, you, be, was, this, as, they, have, are
talk.religion.misc: you, not, he, are, as, this, be, god, was, they


## 5.4 로지스틱 회귀분석을 이용한 문서 분류

In [17]:
from sklearn.linear_model import LogisticRegression

LR_clf = LogisticRegression()
LR_clf.fit(X_train_tfidf, y_train)

print('Train set score : {:.3f}'.format(LR_clf.score(X_train_tfidf, y_train)))
print('Test set score : {:.3f}'.format(LR_clf.score(X_test_tfidf, y_test)))

Train set score : 0.930
Test set score : 0.734


### 5.4.1 릿지회귀를 이용한 과적합 방지

In [18]:
from sklearn.linear_model import RidgeClassifier

ridge_clf = RidgeClassifier()
ridge_clf.fit(X_train_tfidf, y_train)

print('Train set score : {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('Test set score : {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))

Train set score : 0.960
Test set score : 0.735


In [19]:
import numpy as np
from sklearn.model_selection import train_test_split

X_train_ridge, X_val_ridge, y_train_ridge, y_val_ridge = train_test_split(
    X_train_tfidf, y_train, test_size=0.2, random_state=42)

max_score = 0
max_alpha = 0
for alpha in np.arange(0.1, 10, 0.1): # alpha를 0.1부터 10까지 0.1씩 증가
    ridge_clf = RidgeClassifier(alpha=alpha) #릿지 분류기 선언
    ridge_clf.fit(X_train_ridge, y_train_ridge) #학습
    score = ridge_clf.score(X_val_ridge, y_val_ridge) #검정 데이터셋에 대해 정확도를 측정
    if score > max_score: #정확도가 이전의 정확도 최대값보다 크면 최대값을 변경한다.
        max_score = score
        max_alpha = alpha
print('Max alpha {:.3f} at max validation score {:.3f}'.format(max_alpha, max_score))

Max alpha 1.600 at max validation score 0.826


In [20]:
ridge_clf = RidgeClassifier(alpha=1.6) #릿지 분류기 선언
ridge_clf.fit(X_train_tfidf, y_train) #학습

print('Train set score: {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('Test set score: {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))

Train set score: 0.947
Test set score: 0.739


In [23]:
def top10_features(classifier, vectorizer, categories):
    feature_names = np.asarray(vectorizer.get_feature_names_out())
    for i, category in enumerate(categories):
        # 역순으로 정렬하기 위해 계수에 음수를 취해서 정렬 후 앞에서부터 10개의 값을 반환
        top10 = np.argsort(-classifier.coef_[i])[:10]
        # 카테고리와 영향이 큰 특성 10개를 출력
        print("%s: %s" % (category, ", ".join(feature_names[top10])))

top10_features(ridge_clf, tfidf, newsgroups_train.target_names)

alt.atheism: bobby, religion, atheism, atheists, motto, punishment, islam, deletion, islamic, satan
comp.graphics: graphics, computer, 3d, file, image, hi, 42, using, screen, looking
sci.space: space, orbit, nasa, spacecraft, moon, sci, launch, flight, funding, idea
talk.religion.misc: christian, christians, fbi, blood, order, jesus, objective, children, christ, hudson


### 5.4.2 라쏘 회귀를 이용한 특성 선택

In [24]:
from sklearn.linear_model import LogisticRegression

lasso_clf = LogisticRegression(penalty='l1', solver='liblinear', C=1)
lasso_clf.fit(X_train_tfidf, y_train)

print('#Train set score : {:.3f}'.format(lasso_clf.score(X_train_tfidf, y_train)))
print('#Test set score : {:.3f}'.format(lasso_clf.score(X_test_tfidf, y_test)))

# 계수 중에서 0이 아닌 것들의 개수를 출력
print('#Used feature count : {}'.format(np.sum(lasso_clf.coef_ != 0)), 'out of', X_train_tfidf.shape[1])


#Train set score : 0.819
#Test set score : 0.724
#Used feature count : 437 out of 2000


In [25]:
top10_features(lasso_clf, tfidf, newsgroups_train.target_names)

alt.atheism: bobby, atheism, atheists, islam, religion, islamic, motto, atheist, satan, vice
comp.graphics: graphics, image, 3d, file, computer, hi, video, files, looking, sphere
sci.space: space, orbit, launch, nasa, spacecraft, flight, moon, dc, shuttle, solar
talk.religion.misc: fbi, christian, christians, christ, order, jesus, children, objective, context, blood


## 5.5 결정트리 등을 이용한 기타 문서 분류 방법

In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

tree = DecisionTreeClassifier(random_state=7)
forest = RandomForestClassifier(random_state=7)
gb = GradientBoostingClassifier(random_state=7)

tree.fit(X_train_tfidf, y_train)
forest.fit(X_train_tfidf, y_train)
gb.fit(X_train_tfidf, y_train)

print('#Decision Tree Train set score : {:3f}'.format(tree.score(X_train_tfidf, y_train)))
print('#Decision Tree Test set score : {:.3f}'.format(tree.score(X_test_tfidf, y_test)))

print('#Random Forest Train set score : {:3f}'.format(forest.score(X_train_tfidf, y_train)))
print('#Random Forest Test set score : {:3f}'.format(forest.score(X_test_tfidf, y_test)))

print('#Gradient Bppsting Train set score : {:3f}'.format(gb.score(X_train_tfidf, y_train)))
print('#Gradient Bppsting Test set score : {:3f}'.format(gb.score(X_test_tfidf, y_test)))

#Decision Tree Train set score : 0.977384
#Decision Tree Test set score : 0.536
#Random Forest Train set score : 0.977384
#Random Forest Test set score : 0.685144
#Gradient Bppsting Train set score : 0.932645
#Gradient Bppsting Test set score : 0.696231


In [34]:
sorted_feature_importance = sorted(zip(tfidf.get_feature_names_out(), gb.feature_importances_), key=lambda x:x[1], reverse=True)

for feature, value in sorted_feature_importance[:40]:
    print('%s: %.3f' %(feature, value), end=', ')

space: 0.126, graphics: 0.080, atheism: 0.024, thanks: 0.023, file: 0.021, orbit: 0.020, jesus: 0.018, god: 0.018, hi: 0.017, nasa: 0.015, image: 0.015, files: 0.014, christ: 0.010, moon: 0.010, bobby: 0.010, launch: 0.010, looking: 0.010, christian: 0.010, atheists: 0.009, christians: 0.009, fbi: 0.009, 3d: 0.008, you: 0.008, not: 0.008, islamic: 0.007, religion: 0.007, spacecraft: 0.007, flight: 0.007, computer: 0.007, islam: 0.007, ftp: 0.006, color: 0.006, software: 0.005, atheist: 0.005, card: 0.005, people: 0.005, koresh: 0.005, his: 0.005, kent: 0.004, sphere: 0.004, 

## 5.6 성능을 높이는 방법

In [35]:
# 필요한 library들을 import
from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import re

RegTok = RegexpTokenizer("[\w']{3,}") # 정규포현식으로 토크나이저를 정의
english_stops = set(stopwords.words('english')) #영어 불용어를 가져옴

def tokenizer(text):
    tokens = RegTok.tokenize(text.lower())
    # stopwords 제외
    words = [word for word in tokens if (word not in english_stops) and len(word) > 2]
    # portr stemmer 적용
    features = (list(map(lambda token: PorterStemmer().stem(token),words)))
    return features

tfidf = TfidfVectorizer(tokenizer=tokenizer, max_features=2000, min_df=5, max_df=0.5) # 새로 정의한 토크나이저 사용
X_train_tfidf = tfidf.fit_transform(X_train) # train set을 변환
X_test_tfidf = tfidf.transform(X_test) # test set을 변환

#tfidf vector를 이용해서 분류기 학습
LR_clf = LogisticRegression() #분류기 선언
LR_clf.fit(X_train_tfidf, y_train) # train data를 이용하여 분류기를 학습
print('#Train set score: {:.3f}'.format(LR_clf.score(X_train_tfidf, y_train))) # train data에 대한 예측정확도 
print('#Test set score: {:.3f}'.format(LR_clf.score(X_test_tfidf, y_test))) # test data에 대한 예측정확도



#Train set score: 0.930
#Test set score: 0.751


In [36]:
len(LR_clf.coef_[0])

2000

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(tokenizer=tokenizer)

X_train_tfidf = tfidf.fit_transform(X_train) 
X_test_tfidf = tfidf.transform(X_test) 

ridge_clf = RidgeClassifier(alpha=2.4)
ridge_clf.fit(X_train_tfidf, y_train) #학습

NB_clf = MultinomialNB(alpha=0.01) 
NB_clf.fit(X_train_tfidf, y_train) 

print('#Train set dimension:', X_train_tfidf.shape) 
print('#Test set dimension:', X_test_tfidf.shape)

print('#Train set score: {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('#Test set score: {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))

print('#Train set score: {:.3f}'.format(NB_clf.score(X_train_tfidf, y_train))) 
print('#Test set score: {:.3f}'.format(NB_clf.score(X_test_tfidf, y_test)))



#Train set dimension: (2034, 20085)
#Test set dimension: (1353, 20085)
#Train set score: 0.969
#Test set score: 0.768
#Train set score: 0.971
#Test set score: 0.793


## 5.7 카운트 기반의 문제점과 N-gram을 이용한 보완
- BOW 방식은 단어들의 순서를 무시, 단어가 사용된 횟수를 기반으로 문서 벡터를 만듦
- BOW 방식을 그대로 쓰면서도 단어가 쓰여진 순서를 반영할 수 있는 방법

In [38]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

cachedStopWords = stopwords.words('english')
tfidf = TfidfVectorizer(
    token_pattern="[a-zA-Z']{3,}",
    decode_error='ignore',
    lowercase=True,
    stop_words=stopwords.words('english'),
    max_df=0.5,
    min_df=2
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(X_train_tfidf.shape)

(2034, 11483)


In [39]:
from sklearn.linear_model import RidgeClassifier
ridge_clf = RidgeClassifier()
ridge_clf.fit(X_train_tfidf, y_train)

print('Train set score : {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('Test set score : {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))

Train set score : 0.976
Test set score : 0.765


In [40]:
tfidf = TfidfVectorizer(
    token_pattern="[a-zA-Z']{3,}",
    lowercase=True,
    stop_words=stopwords.words('english'),
    ngram_range=(1,2),
    max_df=0.5,
    min_df=2
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(X_train_tfidf.shape)

(2034, 26550)


In [42]:
bigram_features = [ f for f in tfidf.get_feature_names_out() if len(f.split()) > 1 ]
print('bi-gram samples : {}'.format(bigram_features[:10]))

ridge_clf.fit(X_train_tfidf, y_train)

print('Train set score : {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('Test set score : {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))

bi-gram samples : ["'cause can't", "'em better", "'expected errors'", "'karla' next", "'nodis' password", "'official doctrine", "'ok see", "'sci astro'", "'what's moonbase", 'aas american']
Train set score : 0.976
Test set score : 0.773


In [43]:
tfidf = TfidfVectorizer(token_pattern= "[a-zA-Z']{3,}", 
                        decode_error ='ignore', 
                        lowercase=True, 
                        stop_words = stopwords.words('english'),
                        ngram_range=(1, 3),
                        max_df=0.5,
                        min_df=2)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(X_train_tfidf.shape)

trigram_features = [f for f in tfidf.get_feature_names_out() if len(f.split()) > 2]
print('tri-gram samples:', trigram_features[:10])

ridge_clf.fit(X_train_tfidf, y_train) #학습

print('Train set score: {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('Test set score: {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))

(2034, 32943)
Train set score: 0.976
Test set score: 0.775


## 5.8 한국어 문서의 분류
### 5.8.1 다음 영화 리뷰에 대한 영화 제목 예측

In [44]:
import pandas as pd
df = pd.read_csv('./data/daum_movie_review.csv')
df.head(5)

Unnamed: 0,review,rating,date,title
0,돈 들인건 티가 나지만 보는 내내 하품만,1,2018.10.29,인피니티 워
1,몰입할수밖에 없다. 어렵게 생각할 필요없다. 내가 전투에 참여한듯 손에 땀이남.,10,2018.10.26,인피니티 워
2,이전 작품에 비해 더 화려하고 스케일도 커졌지만.... 전국 맛집의 음식들을 한데 ...,8,2018.10.24,인피니티 워
3,이 정도면 볼만하다고 할 수 있음!,8,2018.10.22,인피니티 워
4,재미있다,10,2018.10.20,인피니티 워


In [45]:
df['title'].value_counts()

신과함께      4947
택시운전사     2322
인피니티 워    2042
범죄도시      1939
곤지암       1547
라라랜드      1150
코코         778
Name: title, dtype: int64

In [46]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['review'], df['title'], random_state=0)

print('#Train set size : {}'.format(len(X_train)))
print('#Test set size : {}'.format(len(X_test)))

#Train set size : 11043
#Test set size : 3682


In [48]:
from konlpy.tag import Okt
okt = Okt()

print(okt.morphs(X_train[1]))
print(okt.nouns(X_train[1]))

['몰입', '할수밖에', '없다', '.', '어렵게', '생각', '할', '필요없다', '.', '내', '가', '전투', '에', '참여', '한', '듯', '손', '에', '땀', '이남', '.']
['몰입', '생각', '내', '전투', '참여', '듯', '손', '땀', '이남']


In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

tfidf = TfidfVectorizer(
    tokenizer=okt.nouns,
    max_features=2000,
    min_df=5,
    max_df=0.5
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

print('#Train set score : {}'.format(clf.score(X_train_tfidf, y_train)))
print('#Test set score : {}'.format(clf.score(X_test_tfidf, y_test)))



#Train set score : 0.756406773521688
#Test set score : 0.6939163498098859


In [52]:
print("실제 영화 제목, 예측한 제목, 리뷰")

for content in zip(y_test[:10], clf.predict(X_test_tfidf[:10]), X_test[:10]):
    print(content)

실제 영화 제목, 예측한 제목, 리뷰
('범죄도시', '신과함께', '오랜만에 잼나는 영화 봤습니다.  다음에 더 재미있는 영화 기대하겠습니다.')
('범죄도시', '범죄도시', '조연들이 눈에 박힌다. 간만에 집중 ㅎ')
('코코', '코코', '대감동을 선사. 인사이드 아웃을 잇는 픽사의 감동스토리. 신과함께의 멕시코판이라고나할까요??')
('신과함께', '신과함께', '돈이 안아까웠던 영화ᆞᆞ  정말 좋았다')
('신과함께', '신과함께', '역시 김용화감독이 영화는 잘 만들어요. 이제 VFX 제작 부문도 헐리우드 수준 이상입니다.')
('택시운전사', '택시운전사', '민주화를 위해 힘써주신 분들께 감사하는 마음으로 살아야겠다.')
('신과함께', '신과함께', '잠만 자다 왔음')
('신과함께', '신과함께', '오랜만에 잼있고 좋은 영화를 봤다')
('범죄도시', '신과함께', '잼남')
('범죄도시', '인피니티 워', '대박~~')


In [75]:
pred_df = pd.DataFrame(columns=["실제 영화 제목", "예측한 제목", "리뷰"])
idx = 0

for i in range(10):
    real_title = y_test[i:i+1]
    pred_title = clf.predict(X_test_tfidf[i:i+1])
    review = X_test[i:i+1]
    
    pred_df.loc[idx] = [real_title, pred_title, review]
    
    idx += 1

pred_df

Unnamed: 0,실제 영화 제목,예측한 제목,리뷰
0,"10711 범죄도시 Name: title, dtype: object",[신과함께],10711 오랜만에 잼나는 영화 봤습니다. 다음에 더 재미있는 영화 기대하겠...
1,"9848 범죄도시 Name: title, dtype: object",[범죄도시],"9848 조연들이 눈에 박힌다. 간만에 집중 ㅎ Name: review, dt..."
2,"14545 코코 Name: title, dtype: object",[코코],14545 대감동을 선사. 인사이드 아웃을 잇는 픽사의 감동스토리. 신과함께의...
3,"9017 신과함께 Name: title, dtype: object",[신과함께],"9017 돈이 안아까웠던 영화ᆞᆞ 정말 좋았다 Name: review, dt..."
4,"8659 신과함께 Name: title, dtype: object",[신과함께],8659 역시 김용화감독이 영화는 잘 만들어요. 이제 VFX 제작 부문도 헐리...
5,"11692 택시운전사 Name: title, dtype: object",[택시운전사],11692 민주화를 위해 힘써주신 분들께 감사하는 마음으로 살아야겠다. Nam...
6,"5911 신과함께 Name: title, dtype: object",[신과함께],"5911 잠만 자다 왔음 Name: review, dtype: object"
7,"6409 신과함께 Name: title, dtype: object",[신과함께],"6409 오랜만에 잼있고 좋은 영화를 봤다 Name: review, dtype..."
8,"11275 범죄도시 Name: title, dtype: object",[신과함께],"11275 잼남 Name: review, dtype: object"
9,"10818 범죄도시 Name: title, dtype: object",[인피니티 워],"10818 대박~~ Name: review, dtype: object"


### 5.8.2 성능을 개선하기 위한 노력

In [78]:
# 명사 대신 모든 형태소를 사용
tfidf = TfidfVectorizer(tokenizer=okt.morphs, max_features=2000, min_df=5, max_df=0.5)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# 충분한 학습을 위해 max_iter를 1,000으로 설정, 기본은 100
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

print('#Train set score: {:.3f}'.format(clf.score(X_train_tfidf, y_train)))
print('#Test set score: {:.3f}'.format(clf.score(X_test_tfidf, y_test)))



#Train set score: 0.777
#Test set score: 0.695


In [79]:
def twit_tokenizer(text): #전체를 다 사용하는 대신, 명사, 동사, 형용사를 사용
    target_tags = ['Noun', 'Verb', 'Adjective']
    result = []
    for word, tag in okt.pos(text, norm=True, stem=True):
        if tag in target_tags:
            result.append(word)
    return result

tfidf = TfidfVectorizer(tokenizer=twit_tokenizer, max_features=2000, min_df=5, max_df=0.5)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

print('#Train set score: {:.3f}'.format(clf.score(X_train_tfidf, y_train)))
print('#Test set score: {:.3f}'.format(clf.score(X_test_tfidf, y_test)))



#Train set score: 0.784
#Test set score: 0.712


In [80]:
# 모든 형태소를 다 사용하고 품사를 알 수 있도록 하면?
def twit_tokenizer2(text):
    result = []
    for word, tag in okt.pos(text, norm=True, stem=True):
        result.append('/'.join([word, tag])) #단어의 품사를 구분할 수 있도록 함
    return result

print(twit_tokenizer2(X_train[1]))

['몰입/Noun', '하다/Verb', '없다/Adjective', './Punctuation', '어렵다/Adjective', '생각/Noun', '하다/Verb', '필요없다/Adjective', './Punctuation', '내/Noun', '가/Josa', '전투/Noun', '에/Josa', '참여/Noun', '한/Determiner', '듯/Noun', '손/Noun', '에/Josa', '땀/Noun', '이남/Noun', './Punctuation']


In [81]:
tfidf = TfidfVectorizer(tokenizer=twit_tokenizer2, max_features=2000, min_df=5, max_df=0.5)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

clf = LogisticRegression(max_iter=1000) 
clf.fit(X_train_tfidf, y_train)

print('#Train set score: {:.3f}'.format(clf.score(X_train_tfidf, y_train))) 
print('#Test set score: {:.3f}'.format(clf.score(X_test_tfidf, y_test)))



#Train set score: 0.789
#Test set score: 0.718


In [82]:
# 명사, 형용사, 동사만 선택하고 품사를 붙인 후 로지스틱 회귀분석 실시

def twit_tokenizer3(text):
    target_tags = ['Noun', 'Verb', 'Adjective']
    result = []
    for word, tag in okt.pos(text, norm=True, stem=True):
        if tag in target_tags:
            result.append('/'.join([word, tag]))
    return result

tfidf = TfidfVectorizer(tokenizer=twit_tokenizer3, max_features=2000, min_df=5, max_df=0.5)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

clf = LogisticRegression(max_iter=1000) 
clf.fit(X_train_tfidf, y_train)

print('#Train set score: {:.3f}'.format(clf.score(X_train_tfidf, y_train))) 
print('#Test set score: {:.3f}'.format(clf.score(X_test_tfidf, y_test)))



#Train set score: 0.784
#Test set score: 0.713


In [83]:
import numpy as np
from sklearn.model_selection import train_test_split

X_train_ridge, X_val_ridge, y_train_ridge, y_val_ridge = train_test_split(
    X_train_tfidf, y_train, test_size=0.2, random_state=42)

max_score = 0
max_alpha = 0
for alpha in np.arange(0.1, 10, 0.1): # alpha를 0.1부터 10까지 0.1씩 증가
    ridge_clf = RidgeClassifier(alpha=alpha) #릿지 분류기 선언
    ridge_clf.fit(X_train_ridge, y_train_ridge) #학습
    score = ridge_clf.score(X_val_ridge, y_val_ridge) #검정 데이터셋에 대해 정확도를 측정
    if score > max_score: #정확도가 이전의 정확도 최대값보다 크면 최대값을 변경한다.
        max_score = score
        max_alpha = alpha
print('#Max alpha {:.3f} at max validation score {:.3f}'.format(max_alpha, max_score))

#Max alpha 2.300 at max validation score 0.717


In [84]:
from sklearn.linear_model import RidgeClassifier

ridge_clf = RidgeClassifier(alpha=1.6)
ridge_clf.fit(X_train_tfidf, y_train)

print('#Ridge Train set score: {:.3f}'.format(ridge_clf.score(X_train_tfidf, y_train)))
print('#Ridge Test set score: {:.3f}'.format(ridge_clf.score(X_test_tfidf, y_test)))

from sklearn.linear_model import LogisticRegression
import numpy as np

lasso_clf = LogisticRegression(penalty='l1', solver='liblinear', C=0.5)
lasso_clf.fit(X_train_tfidf, y_train)

print('#Lasso Train set score: {:.3f}'.format(lasso_clf.score(X_train_tfidf, y_train)))
print('#Lasso Test set score: {:.3f}'.format(lasso_clf.score(X_test_tfidf, y_test)))
print('#Used features count: {}'.format(np.sum(lasso_clf.coef_ != 0)), 'out of', X_train_tfidf.shape[1])

#Ridge Train set score: 0.797
#Ridge Test set score: 0.715
#Lasso Train set score: 0.700
#Lasso Test set score: 0.695
#Used features count: 947 out of 2000


In [85]:
from sklearn.naive_bayes import MultinomialNB

NB_clf = MultinomialNB(alpha=0.1)
NB_clf.fit(X_train_tfidf, y_train)

print('Train set score: {:.3f}'.format(NB_clf.score(X_train_tfidf, y_train)))
print('Test set score: {:.3f}'.format(NB_clf.score(X_test_tfidf, y_test)))

Train set score: 0.773
Test set score: 0.711
