## 8.1 텍스트 처리용 IMDb 영화 리뷰 데이터 준비

In [None]:
# Unpack the gzip-compressed IMDb review archive.
import tarfile
with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar:
    # No target path is given, so members are extracted relative to the
    # current working directory.
    tar.extractall()

In [2]:
import pyprind
import pandas as pd
import os
import sys # 추가적인 모듈 불러오기

In [3]:
# NOTE(review): absolute, machine-specific location of the extracted
# archive; adjust to wherever `aclImdb/` lives on your machine.
basepath = '/Users/sunghyouk/study_room/aclImdb'

# Directory name -> numeric sentiment label.
labels = {'pos': 1, 'neg': 0}
# One progress tick per review file; written to stderr so it renders in
# the notebook.
pbar = pyprind.ProgBar(50000, stream=sys.stderr)

# Collect (review, sentiment) rows in a plain list and build the frame
# once at the end. Appending to a DataFrame inside the loop copies the
# whole frame on every iteration, and DataFrame.append no longer exists
# in pandas 2.x.
rows = []
for split_name in ('test', 'train'):
    for label_name in ('pos', 'neg'):
        path = os.path.join(basepath, split_name, label_name)
        # sorted() keeps the row order deterministic across platforms.
        for file_name in sorted(os.listdir(path)):
            with open(os.path.join(path, file_name),
                      'r', encoding='utf-8') as infile:
                rows.append((infile.read(), labels[label_name]))
            pbar.update()
df = pd.DataFrame(rows, columns=['review', 'sentiment'])

  df = df.append([[txt, labels[l]]],
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:05:45


In [4]:
import numpy as np

# Fix the RNG seed so the shuffled row order is reproducible.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)

# Persist the shuffled reviews; the index is not written to the file.
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [5]:
# Reload the shuffled reviews from disk and preview the first three rows.
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [6]:
# (rows, columns) of the reloaded frame.
df.shape

(50000, 2)

## 8.2 BoW 모델 소개

In [7]:
# Bag-of-words example
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two'
])
# Learn the vocabulary from the three sentences and build the count matrix.
bag = count.fit_transform(docs)

# Print the token -> integer column index mapping
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [8]:
# Print the feature vectors (one row per document, one column per token)
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [9]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight raw term counts with smoothed idf and L2-normalize each row.
tfidf = TfidfTransformer(use_idf=True,
                         norm='l2',
                         smooth_idf=True)
# Show two decimals when printing arrays.
np.set_printoptions(precision=2)
# Counts for the example sentences -> tf-idf weights, printed densely.
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


### 8.2.3 텍스트 데이터 정제

In [10]:
# Show the last 50 characters of the review stored at index label 0
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [11]:
# Remove unwanted characters (HTML markup, punctuation) but keep emoticons
import re
def preprocessor(text):
    """Normalize a raw review string.

    Steps:
      1. Delete complete HTML tags (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` from the
         tag-free text.
      3. Lower-case the text and collapse every run of non-word
         characters into a single space.
      4. Append the collected emoticons, with the ``-`` nose removed,
         after the cleaned text.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Cleaned text followed by the space-separated emoticons.
    """
    # The closing '>' is part of the pattern: without it the '>' of every
    # tag survives, and a lone '<' would swallow the rest of the review.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    faces = ' '.join(emoticons).replace('-', '')
    if faces:
        # Keep exactly one space between the text and the emoticons so
        # the last word is never fused with the first emoticon.
        cleaned = cleaned.rstrip() + ' ' + faces
    return cleaned

In [12]:
# Example 1: clean the 50-character tail of the review at index label 0
preprocessor(df.loc[0, 'review'][-50:])

'is seven title brazil not available'

In [13]:
# Example 2: a synthetic string with a closing tag and three emoticons
preprocessor("</a>This :) is :( a test :-)!")

' this is a test :) :( :)'

In [14]:
# Apply preprocessor() to every entry of the 'review' column
df['review'] = df['review'].apply(preprocessor)

### 8.2.4 문서를 토큰으로 나누기

In [15]:
def tokenizer(text):
    """Break ``text`` into tokens on runs of whitespace and return them as a list."""
    tokens = text.split()
    return tokens

In [16]:
# Porter stemmer from NLTK
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

In [17]:
# Download the NLTK stop-word corpus
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sunghyouk/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
# Remove stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Stem the sample sentence, keep at most its last ten stems, and drop
# every stem that appears in the English stop-word list.
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

## 8.3 문서 분류를 위한 로지스틱 회귀 모델 훈련

In [19]:
# Split by position: rows [0, 25000) train, rows [25000, end) test.
# `.loc[:25000]` slices by label and includes the end label, which put
# row 25000 into BOTH sets; `.iloc` is end-exclusive, so the two halves
# no longer overlap.
n_train = 25000
X_train = df['review'].iloc[:n_train].values
y_train = df['sentiment'].iloc[:n_train].values
X_test = df['review'].iloc[n_train:].values
y_test = df['sentiment'].iloc[n_train:].values

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
# Vectorizer for the pipeline; tokenizer, stop words and idf/normalization
# settings are filled in by the grid search below.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# Hyper-parameters explored by both halves of the search space.
shared_grid = {
    'vect__ngram_range': [(1, 1)],
    'vect__stop_words': [stop, None],
    'vect__tokenizer': [tokenizer, tokenizer_porter],
    'clf__penalty': ['l1', 'l2'],
    'clf__C': [1.0, 10.0, 100.0],
}
param_grid = [
    # First half: the shared settings only.
    dict(shared_grid),
    # Second half: same settings with idf weighting and normalization off.
    dict(shared_grid, vect__use_idf=[False], vect__norm=[None]),
]

classifier = LogisticRegression(solver='liblinear', random_state=0)
lr_tfidf = Pipeline([('vect', tfidf), ('clf', classifier)])
gs_lr_tfidf = GridSearchCV(
    lr_tfidf,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=1,
)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits




GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(lowercase=False)),
                                       ('clf',
                                        LogisticRegression(random_state=0,
                                                           solver='liblinear'))]),
             n_jobs=1,
             param_grid=[{'clf__C': [1.0, 10.0, 100.0],
                          'clf__penalty': ['l1', 'l2'],
                          'vect__ngram_range': [(1, 1)],
                          'vect__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've"...
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've",
                                                "you'll", "you'd", 'your',
 

In [22]:
# Report the best parameter combination found by the grid search.
print('최적의 매개변수 조합: %s ' %gs_lr_tfidf.best_params_)

최적의 매개변수 조합: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x7fe13e6ae040>} 


In [23]:
# Report the best score recorded by the grid search (scoring='accuracy', cv=5).
print('CV 정확도: %.3f'
      %gs_lr_tfidf.best_score_)

CV 정확도: 0.897


In [24]:
# Score the best pipeline from the grid search on the held-out test split.
clf = gs_lr_tfidf.best_estimator_
print('테스트 정확도: %.3f'
      %clf.score(X_test, y_test))

테스트 정확도: 0.899


## 8.4 대용량 데이터 처리: 온라인 알고리즘과 외부 메모리 학습

In [4]:
import numpy as np
import re
from nltk.corpus import stopwords

In [5]:
# English stop-word list from NLTK; tokenizer() below reads it through the
# global name `stop`.
stop = stopwords.words('english')

In [1]:
def tokenizer(text):
    """Clean a raw review and split it into stop-word-free tokens.

    HTML tags are removed, emoticons such as ``:)`` or ``:-D`` are
    collected, the text is lower-cased with every run of non-word
    characters collapsed to a space, and the emoticons (minus the ``-``
    nose) are appended. The result is split on whitespace and tokens
    found in the global ``stop`` collection are dropped.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Remaining tokens in order, emoticons last.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Search the original-case text: the pattern lists upper-case 'D' and
    # 'P', so lower-casing first would make ':D' and ':P' unmatchable.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    # The explicit ' ' separator keeps the last word from being fused with
    # the first emoticon; split() below absorbs any extra spaces.
    text = re.sub(r'[\W]+', ' ', text.lower()) \
        + ' ' + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV file.

    The file is expected to start with a header row, followed by rows
    whose first field is the review text and whose last field is the
    integer sentiment label.

    Parsing goes through the standard ``csv`` module, so quoted fields
    (embedded commas, doubled quotes, embedded newlines) are decoded
    properly and a final row without a trailing newline is still read.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Yields
    ------
    tuple of (str, int)
        Review text and its label.
    """
    import csv  # local import: the notebook's import cells do not load csv

    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(path, 'r', encoding='utf-8', newline='') as handle:
        reader = csv.reader(handle)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration inside the generator.
        next(reader, None)
        for row in reader:
            if not row:
                # Blank line in the file.
                continue
            yield row[0], int(row[-1])
            
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` ``(text, label)`` pairs from ``doc_stream``.

    Returns a ``(docs, y)`` pair of lists. If the stream runs dry early,
    whatever was collected so far is returned (possibly two empty lists).
    """
    batch_texts, batch_labels = [], []
    for _ in range(size):
        try:
            item = next(doc_stream)
        except StopIteration:
            # Stream exhausted: hand back the partial batch.
            break
        review, sentiment = item
        batch_texts.append(review)
        batch_labels.append(sentiment)
    return batch_texts, batch_labels

In [2]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hashing vectorizer with 2**21 feature columns; tokenization is delegated
# to tokenizer() defined in this section.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' and
# reject 'log' -- confirm against the installed version.
clf = SGDClassifier(loss='log', random_state=1, max_iter=1)
# Generator over the saved reviews; it is consumed incrementally by the
# cells that follow.
doc_stream = stream_docs(path='movie_data.csv')

In [6]:
import sys  # needed for sys.stderr; this section's import cells do not load it
import pyprind

# One progress tick per mini-batch.
pbar = pyprind.ProgBar(45, stream=sys.stderr)
# partial_fit() is given the full set of class labels on every call.
classes = np.array([0, 1])
# Train on up to 45 mini-batches of 1000 documents each.
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        # The stream ran out before 45 batches were read.
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:30


In [7]:
# Pull up to 5000 further documents from the same stream (the part the
# training loop did not consume) and score the classifier on them.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('정확도: %.3f' % clf.score(X_test, y_test))

정확도: 0.868


In [8]:
# Update the model with the evaluation documents as well.
clf = clf.partial_fit(X_test, y_test)

## 8.5 잠재 디리클레 할당을 사용한 토픽 모델링

* Latent Dirichlet Allocation (LDA)
* 선형 판별 분석 (LDA)와 약어가 같지만 다름

In [1]:
import pandas as pd
# Load the reviews as stored in movie_data.csv.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words input for topic modelling: built-in English stop words are
# removed, max_df=0.1 excludes terms that occur in more than 10% of the
# documents, and the vocabulary is capped at 5000 terms.
count = CountVectorizer(stop_words='english',
                        max_df=0.1,
                        max_features=5000)
X = count.fit_transform(df['review'].values)

In [3]:
from sklearn.decomposition import LatentDirichletAllocation
# Ten-topic LDA with a fixed seed, fitted with the 'batch' learning method.
lda = LatentDirichletAllocation(n_components=10,
                                random_state=123,
                                learning_method='batch')
# Fit on the count matrix and keep the per-document topic matrix.
X_topics = lda.fit_transform(X)

In [4]:
# One row per topic, one column per vocabulary term.
lda.components_.shape

(10, 5000)

In [5]:
# Number of highest-weighted words to print per topic.
n_top_words = 5
# Prefer get_feature_names_out(); fall back to the older
# get_feature_names() on scikit-learn versions that lack it, so the cell
# runs on either.
if hasattr(count, 'get_feature_names_out'):
    feature_names = count.get_feature_names_out()
else:
    feature_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("토픽 %d:" % (topic_idx + 1))
    # argsort() is ascending; the reversed slice picks the n_top_words
    # largest weights in descending order.
    top_indices = topic.argsort()[:-n_top_words - 1:-1]
    print(" ".join(feature_names[i] for i in top_indices))

토픽 1:
worst minutes awful script stupid
토픽 2:
family mother father children girl
토픽 3:
american war dvd music tv
토픽 4:
human audience cinema art sense
토픽 5:
police guy car dead murder
토픽 6:
horror house sex girl woman
토픽 7:
role performance comedy actor performances
토픽 8:
series episode war episodes tv
토픽 9:
book version original read novel
토픽 10:
action fight guy guys cool




In [6]:
# Column 5 of X_topics is the sixth topic (the "horror" one in the listing
# above); rank the documents by that column, highest first.
horror = X_topics[:, 5].argsort()[::-1]
# Print the first 300 characters of the three top-ranked reviews.
for iter_idx, movie_idx in enumerate(horror[:3]):
    print('\n공포 영화 #%d:' % (iter_idx + 1))
    print(df['review'][movie_idx][:300], '...')


공포 영화 #1:
House of Dracula works from the same basic premise as House of Frankenstein from the year before; namely that Universal's three most famous monsters; Dracula, Frankenstein's Monster and The Wolf Man are appearing in the movie together. Naturally, the film is rather messy therefore, but the fact that ...

공포 영화 #2:
Okay, what the hell kind of TRASH have I been watching now? "The Witches' Mountain" has got to be one of the most incoherent and insane Spanish exploitation flicks ever and yet, at the same time, it's also strangely compelling. There's absolutely nothing that makes sense here and I even doubt there  ...

공포 영화 #3:
<br /><br />Horror movie time, Japanese style. Uzumaki/Spiral was a total freakfest from start to finish. A fun freakfest at that, but at times it was a tad too reliant on kitsch rather than the horror. The story is difficult to summarize succinctly: a carefree, normal teenage girl starts coming fac ...
