## 8.2 ~ 8.3

In [2]:
# Download NLTK's 'punkt' tokenizer data; the tokenizing cells below need it to be present.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yoontaepark/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [26]:
# sent_tokenize splits a text into a list of sentences.
# The trailing backslashes below are only line continuations inside the string literal
# (they join the physical lines into one string); they are not sentence delimiters.
from nltk import sent_tokenize

text_sample = 'The Matrix is everywhere its all around us, here even in this room. \
                You can see it out your window or on your television.\
                You feel it when you go to work, or go to church or pay your taxes.'

# One list entry per detected sentence.
sentence = sent_tokenize(text=text_sample)
print(type(sentence), len(sentence))
print(sentence)

<class 'list'> 3
['The Matrix is everywhere its all around us, here even in this room.', 'You can see it out your window or on your television.', 'You feel it when you go to work, or go to church or pay your taxes.']


In [27]:
# word_tokenize splits one sentence into word tokens; punctuation marks become separate tokens.
from nltk import word_tokenize
sentence = "The Matrix is everywhere its all around us, here even in this room."
words = word_tokenize(sentence)
print(type(words), len(words))
print(words)

<class 'list'> 15
['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']


In [29]:
# integrated version
from nltk import word_tokenize, sent_tokenize

def tokenize_text(text):
    """Tokenize a document into sentences, then each sentence into words.

    Args:
        text: The raw document string.

    Returns:
        A list with one entry per sentence; each entry is the list of word
        tokens produced by ``word_tokenize`` for that sentence.
    """
    # Split into sentences first so that the word tokens stay grouped by sentence.
    return [word_tokenize(single_sentence) for single_sentence in sent_tokenize(text)]

# Same three-sentence sample as before; the backslashes only continue the string literal across lines.
text_sample = 'The Matrix is everywhere its all around us, here even in this room. \
                You can see it out your window or on your television.\
                You feel it when you go to work, or go to church or pay your taxes.'

# Result is a list of sentences, each a list of word tokens.
word_tokens = tokenize_text(text_sample)
print(type(word_tokens), len(word_tokens))
print(word_tokens)

<class 'list'> 3
[['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.'], ['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.'], ['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']]


In [34]:
# n-grams: slide a window of n consecutive tokens over the text, moving one token at a time.
# The input must already be tokenized into words.

from nltk import word_tokenize, sent_tokenize, ngrams

sentence = "The Matrix is everywhere its all around us, here even in this room."
words = word_tokenize(sentence)

# ngrams() hands back an iterator rather than a list, so it has to be consumed before printing.
all_ngrams = ngrams(words, 3)

# Store the result under its own name: binding it to `ngrams` would replace the imported
# function with a list, and any later ngrams(...) call in this kernel would fail.
trigrams = list(all_ngrams)
print(type(trigrams), len(trigrams))
print(trigrams)

<class 'list'> 13
[('The', 'Matrix', 'is'), ('Matrix', 'is', 'everywhere'), ('is', 'everywhere', 'its'), ('everywhere', 'its', 'all'), ('its', 'all', 'around'), ('all', 'around', 'us'), ('around', 'us', ','), ('us', ',', 'here'), (',', 'here', 'even'), ('here', 'even', 'in'), ('even', 'in', 'this'), ('in', 'this', 'room'), ('this', 'room', '.')]


In [35]:
# Download NLTK's stop-word corpus (used by the stop-word cells below).
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yoontaepark/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [40]:
# Inspect the English stop-word list: how many entries it has and what the first 20 look like.
english_stop_words = nltk.corpus.stopwords.words('english')
print('# of stop words(eng): ', len(english_stop_words))
print(english_stop_words[:20])

# of stop words(eng):  179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


In [50]:
import nltk

print('Before stop words:', word_tokens, len(word_tokens))

stopwords = nltk.corpus.stopwords.words('english')
# Set lookup is constant-time; testing membership against the list rescans it for every token.
stopword_set = set(stopwords)

# For each sentence: lowercase every token, then keep only the tokens that are not stop words.
# The stop-word list is lowercase, so tokens are lowercased before the comparison.
all_tokens = [
    [
        lowered
        for lowered in (token.lower() for token in sentence_tokens)
        if lowered not in stopword_set
    ]
    for sentence_tokens in word_tokens
]

print()
print('After stop words: ', all_tokens)

Before stop words: [['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.'], ['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.'], ['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']] 3

After stop words:  [['matrix', 'everywhere', 'around', 'us', ',', 'even', 'room', '.'], ['see', 'window', 'television', '.'], ['feel', 'go', 'work', ',', 'go', 'church', 'pay', 'taxes', '.']]


In [62]:
# Stemming: reduce each word to a root form.
# Author's note: the result is the same regardless of letter case.
# Some words do not come out as hoped (see 'happiest' / 'fanciest' in the output).
from nltk.stem import LancasterStemmer

stemmer = LancasterStemmer()

# Each group holds inflected forms of one word; one output line is printed per group.
word_groups = (
    ('working', 'works', 'worked'),
    ('amusing', 'amused', 'amusement'),
    ('happier', 'happiness', 'happiest'),
    ('fancy', 'fancier', 'fanciest'),
)
for group in word_groups:
    print(*map(stemmer.stem, group))

work work work
amus amus amus
happy happy happiest
fant fant fanciest


In [53]:
# Download the WordNet corpus; the WordNetLemmatizer cell below depends on it.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yoontaepark/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [79]:
# Lemmatization: look up the dictionary base form of a word (author's note: used more often than stemming).
# A part-of-speech tag has to be passed with each word; per the author's note the word is
# otherwise not recognised as intended, which is inconvenient.
# param pos: The Part Of Speech tag. Valid options are "n" for nouns, "v" for verbs, "a" for adjectives,
#             "r" for adverbs and  "s" for satellite adjectives.

from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

# (word, pos) pairs; one output line is printed per group.
tagged_groups = (
    (('amusing', 'v'), ('amuses', 'v'), ('amused', 'v')),
    (('happier', 'a'), ('happiest', 'a'), ('happiness', 'n')),
    (('fancier', 'a'), ('fanciest', 'a')),
)
for tagged_words in tagged_groups:
    print(*(lemma.lemmatize(token, tag) for token, tag in tagged_words))

amuse amuse amuse
happy happy happiness
fancy fancy


## BOW

In [84]:
# Two short sample documents for the bag-of-words examples.
# The backslashes only continue the string literals across lines.
text_sample_01 = 'The Matrix is everywhere its all around us, here even in this room. \
                  You can see it out your window or on your television. \
                  You feel it when you go to work, or go to church or pay your taxes.'
text_sample_02 = 'You take the blue pill and the story ends.  You wake in your bed and you believe whatever you want to believe\
                  You take the red pill and you stay in Wonderland and I show you how deep the rabbit-hole goes.'

# The corpus is simply the list of both documents, in order.
text = [text_sample_01, text_sample_02]

print(text, len(text))

['The Matrix is everywhere its all around us, here even in this room.                   You can see it out your window or on your television.                   You feel it when you go to work, or go to church or pay your taxes.', 'You take the blue pill and the story ends.  You wake in your bed and you believe whatever you want to believe                  You take the red pill and you stay in Wonderland and I show you how deep the rabbit-hole goes.'] 2


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Feature extraction by count vectorization: fit the vectorizer on the two sample documents.
cnt_vect = CountVectorizer()
cnt_vect.fit(text)

In [87]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a CountVectorizer on the sample documents; the bare fit() call as the last
# expression is what displays the fitted vectorizer in the cell output.
cnt_vect = CountVectorizer()
cnt_vect.fit(text)

CountVectorizer()

In [90]:
# Transform the documents with the fitted vectorizer; the result is shown as a sparse-matrix summary.
ftr_vect = cnt_vect.transform(text)
ftr_vect

<2x51 sparse matrix of type '<class 'numpy.int64'>'
	with 56 stored elements in Compressed Sparse Row format>

In [95]:
# 2 documents (2 rows) x 51 words as features.
# An entry such as "(0, 13)  2" means the word with feature index 13 occurs twice in document 0.
print(type(ftr_vect), ftr_vect.shape)
print(ftr_vect)

<class 'scipy.sparse.csr.csr_matrix'> (2, 51)
  (0, 0)	1
  (0, 2)	1
  (0, 6)	1
  (0, 7)	1
  (0, 10)	1
  (0, 11)	1
  (0, 12)	1
  (0, 13)	2
  (0, 15)	1
  (0, 18)	1
  (0, 19)	1
  (0, 20)	2
  (0, 21)	1
  (0, 22)	1
  (0, 23)	1
  (0, 24)	3
  (0, 25)	1
  (0, 26)	1
  (0, 30)	1
  (0, 31)	1
  (0, 36)	1
  (0, 37)	1
  (0, 38)	1
  (0, 39)	1
  (0, 40)	2
  :	:
  (1, 1)	4
  (1, 3)	1
  (1, 4)	2
  (1, 5)	1
  (1, 8)	1
  (1, 9)	1
  (1, 14)	1
  (1, 16)	1
  (1, 17)	1
  (1, 18)	2
  (1, 27)	2
  (1, 28)	1
  (1, 29)	1
  (1, 32)	1
  (1, 33)	1
  (1, 34)	1
  (1, 35)	2
  (1, 38)	4
  (1, 40)	1
  (1, 42)	1
  (1, 43)	1
  (1, 44)	1
  (1, 47)	1
  (1, 49)	7
  (1, 50)	1


In [103]:
# vocabulary_ maps each token to its feature (column) index in the matrix above.
print(cnt_vect.vocabulary_)

{'the': 38, 'matrix': 22, 'is': 19, 'everywhere': 11, 'its': 21, 'all': 0, 'around': 2, 'us': 41, 'here': 15, 'even': 10, 'in': 18, 'this': 39, 'room': 30, 'you': 49, 'can': 6, 'see': 31, 'it': 20, 'out': 25, 'your': 50, 'window': 46, 'or': 24, 'on': 23, 'television': 37, 'feel': 12, 'when': 45, 'go': 13, 'to': 40, 'work': 48, 'church': 7, 'pay': 26, 'taxes': 36, 'take': 35, 'blue': 5, 'pill': 27, 'and': 1, 'story': 34, 'ends': 9, 'wake': 42, 'bed': 3, 'believe': 4, 'whatever': 44, 'want': 43, 'red': 29, 'stay': 33, 'wonderland': 47, 'show': 32, 'how': 17, 'deep': 8, 'rabbit': 28, 'hole': 16, 'goes': 14}


In [None]:
# Limit the vocabulary to 5 features and drop English stop words.
# NOTE(review): this rebinds cnt_vect / ftr_vect; the next cell repeats the same
# experiment under the separate names cnt_vect2 / ftr_vect2.
cnt_vect = CountVectorizer(max_features=5, stop_words='english')
cnt_vect.fit(text)
ftr_vect = cnt_vect.transform(text)
print(type(ftr_vect), ftr_vect.shape)
print(cnt_vect.vocabulary_)

In [112]:
# Author's note: the built-in stop-word option only supports English.
# Keep at most 5 features after removing English stop words.
cnt_vect2 = CountVectorizer(max_features=5, stop_words='english')
# fit() returns the fitted vectorizer, so transform() can be chained onto it.
ftr_vect2 = cnt_vect2.fit(text).transform(text)

print(type(ftr_vect2), ftr_vect2.shape)
print(cnt_vect2.vocabulary_)

<class 'scipy.sparse.csr.csr_matrix'> (2, 5)
{'window': 4, 'pill': 1, 'wake': 2, 'believe': 0, 'want': 3}


In [121]:
# ngram_range=(smallest n, largest n); (2, 2) builds the vocabulary from bigrams only.
cnt_vect3 = CountVectorizer(ngram_range=(2,2))
# fit() returns the fitted vectorizer, so transform() can be chained onto it.
ftr_vect3 = cnt_vect3.fit(text).transform(text)

print(type(ftr_vect3), ftr_vect3.shape)
print(cnt_vect3.vocabulary_)

<class 'scipy.sparse.csr.csr_matrix'> (2, 74)
{'the matrix': 45, 'matrix is': 27, 'is everywhere': 23, 'everywhere its': 14, 'its all': 26, 'all around': 0, 'around us': 4, 'us here': 53, 'here even': 17, 'even in': 13, 'in this': 20, 'this room': 49, 'room you': 37, 'you can': 62, 'can see': 9, 'see it': 38, 'it out': 24, 'out your': 32, 'your window': 73, 'window or': 58, 'or on': 30, 'on your': 28, 'your television': 72, 'television you': 43, 'you feel': 63, 'feel it': 15, 'it when': 25, 'when you': 57, 'you go': 64, 'go to': 16, 'to work': 52, 'work or': 60, 'or go': 29, 'to church': 51, 'church or': 10, 'or pay': 31, 'pay your': 33, 'your taxes': 71, 'you take': 67, 'take the': 42, 'the blue': 44, 'blue pill': 8, 'pill and': 34, 'and the': 2, 'the story': 48, 'story ends': 41, 'ends you': 12, 'you wake': 68, 'wake in': 54, 'in your': 22, 'your bed': 70, 'bed and': 5, 'and you': 3, 'you believe': 61, 'believe whatever': 6, 'whatever you': 56, 'you want': 69, 'want to': 55, 'to beli

In [122]:
import numpy as np

# Small 2x3 dense matrix; the following cells encode it in sparse (COO) form.
dense = np.array([[3, 0, 1], [0, 2, 0]])

In [123]:
from scipy import sparse

# COO (coordinate) format: the k-th value data[k] sits at (row_pos[k], col_pos[k]).
# Here the non-zero entries are (0,0)=3, (0,2)=1 and (1,1)=2.
row_pos = np.array([0,0,1])
col_pos = np.array([0,2,1])
data = np.array([3,1,2])

# The matrix shape is not passed explicitly; it is inferred from the largest row/col index.
coordinates = (row_pos, col_pos)
sparse_coo = sparse.coo_matrix((data, coordinates))

In [125]:
# Print the COO matrix (one "(row, col) value" line per stored entry), then convert it
# back to a dense ndarray to confirm the round trip.
print(type(sparse_coo))
print(sparse_coo)

print()
dense01 = sparse_coo.toarray()
print(type(dense01),"\n", dense01)

<class 'scipy.sparse.coo.coo_matrix'>
  (0, 0)	3
  (0, 2)	1
  (1, 1)	2

<class 'numpy.ndarray'> 
 [[3 0 1]
 [0 2 0]]


In [126]:
from scipy import sparse

# Reference dense matrix that the sparse encodings below are meant to reproduce.
dense2 = np.array([
    [0, 0, 1, 0, 0, 5],
    [1, 4, 0, 3, 2, 5],
    [0, 6, 0, 3, 0, 0],
    [2, 0, 0, 0, 0, 0],
    [0, 0, 0, 7, 0, 8],
    [1, 0, 0, 0, 0, 0],
])

# The non-zero values, listed row by row.
data2 = np.array([1, 5, 1, 4, 3, 2, 5, 6, 3, 2, 7, 8, 1])

# Row and column position of each non-zero value, as two parallel arrays.
row_pos = np.array([0, 0, 1, 1, 1, 1, 1, 2, 2, 3, 4, 4, 5])
col_pos = np.array([2, 5, 0, 1, 3, 4, 5, 1, 3, 0, 3, 5, 0])

# Start index (within data2 / col_pos) of each distinct row value in row_pos,
# followed by the total number of stored values as the final entry.
row_pos_ind = np.array([0, 2, 7, 9, 10, 12, 13])

# COO format: values plus explicit (row, col) coordinates.
sparse_coo = sparse.coo_matrix((data2, (row_pos, col_pos)))

# CSR format: values, column indices, and the per-row start offsets.
sparse_csr = sparse.csr_matrix((data2, col_pos, row_pos_ind))

# Convert both back to dense and print them to check the encodings.
for label, matrix in (
    ('COO 변환된 데이터가 제대로 되었는지 다시 Dense로 출력 확인', sparse_coo),
    ('CSR 변환된 데이터가 제대로 되었는지 다시 Dense로 출력 확인', sparse_csr),
):
    print(label)
    print(matrix.toarray())

COO 변환된 데이터가 제대로 되었는지 다시 Dense로 출력 확인
[[0 0 1 0 0 5]
 [1 4 0 3 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]
CSR 변환된 데이터가 제대로 되었는지 다시 Dense로 출력 확인
[[0 0 1 0 0 5]
 [1 4 0 3 2 5]
 [0 6 0 3 0 0]
 [2 0 0 0 0 0]
 [0 0 0 7 0 8]
 [1 0 0 0 0 0]]


In [127]:
# Display the CSR matrix summary (shape, dtype, number of stored elements).
sparse_csr

<6x6 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [128]:
# There is no need to build the index arrays by hand as in the earlier cell:
# passing a dense array to coo_matrix / csr_matrix performs the conversion directly.
dense3 = np.array([
    [0, 0, 1, 0, 0, 5],
    [1, 4, 0, 3, 2, 5],
    [0, 6, 0, 3, 0, 0],
    [2, 0, 0, 0, 0, 0],
    [0, 0, 0, 7, 0, 8],
    [1, 0, 0, 0, 0, 0],
])

# The two conversions are independent of each other.
csr = sparse.csr_matrix(dense3)
coo = sparse.coo_matrix(dense3)

In [132]:
# Print the stored entries of the COO matrix as "(row, col) value" lines.
print(coo)

  (0, 2)	1
  (0, 5)	5
  (1, 0)	1
  (1, 1)	4
  (1, 3)	3
  (1, 4)	2
  (1, 5)	5
  (2, 1)	6
  (2, 3)	3
  (3, 0)	2
  (4, 3)	7
  (4, 5)	8
  (5, 0)	1


In [131]:
# Print the stored entries of the CSR matrix as "(row, col) value" lines.
print(csr)

  (0, 2)	1
  (0, 5)	5
  (1, 0)	1
  (1, 1)	4
  (1, 3)	3
  (1, 4)	2
  (1, 5)	5
  (2, 1)	6
  (2, 3)	3
  (3, 0)	2
  (4, 3)	7
  (4, 5)	8
  (5, 0)	1


## 8.4

In [133]:
# Load the example dataset; the subset can be changed, but here the whole set ('all') is used.
from sklearn.datasets import fetch_20newsgroups

news_data = fetch_20newsgroups(subset='all', random_state=156)

In [134]:
# List the fields available on the loaded dataset object.
print(news_data.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [153]:
import pandas as pd

# Wrap the target labels in a Series once and reuse it for the counts and the shape.
target_series = pd.Series(news_data.target)

# Number of documents per target class (sorted by class id), then the class names.
print('target 클래스의 값과 분포도 \n', target_series.value_counts().sort_index())
print('target 클래스의 이름들 \n', news_data.target_names)
# Displayed as the cell result: (number of classes, shape of the label vector).
len(news_data.target_names), target_series.shape

target 클래스의 값과 분포도 
 0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64
target 클래스의 이름들 
 ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


(20, (18846,))

In [154]:
# Show the first raw document; the text includes the mail headers and quoted lines.
print(news_data.data[0])

From: egreen@east.sun.com (Ed Green - Pixel Cruncher)
Subject: Re: Observation re: helmets
Organization: Sun Microsystems, RTP, NC
Lines: 21
Distribution: world
Reply-To: egreen@east.sun.com
NNTP-Posting-Host: laser.east.sun.com

In article 211353@mavenry.altcit.eskimo.com, maven@mavenry.altcit.eskimo.com (Norman Hamer) writes:
> 
> The question for the day is re: passenger helmets, if you don't know for 
>certain who's gonna ride with you (like say you meet them at a .... church 
>meeting, yeah, that's the ticket)... What are some guidelines? Should I just 
>pick up another shoei in my size to have a backup helmet (XL), or should I 
>maybe get an inexpensive one of a smaller size to accomodate my likely 
>passenger? 

If your primary concern is protecting the passenger in the event of a
crash, have him or her fitted for a helmet that is their size.  If your
primary concern is complying with stupid helmet laws, carry a real big
spare (you can put a big or small head in a big helmet, bu

In [155]:
# This dataset ships with predefined 'train' / 'test' subsets, so they are used directly;
# for other datasets something like train_test_split would presumably be needed.
from sklearn.datasets import fetch_20newsgroups

# Shared options: strip headers, footers and quotes so that only the body text remains.
fetch_options = dict(remove=('headers', 'footers', 'quotes'), random_state=156)

# subset='train': training documents and labels only.
train_news = fetch_20newsgroups(subset='train', **fetch_options)
X_train, y_train = train_news.data, train_news.target
print(type(X_train))

# subset='test': test documents and labels only.
test_news = fetch_20newsgroups(subset='test', **fetch_options)
X_test, y_test = test_news.data, test_news.target
print('학습 데이터 크기 {0} , 테스트 데이터 크기 {1}'.format(len(train_news.data) , len(test_news.data)))

<class 'list'>
학습 데이터 크기 11314 , 테스트 데이터 크기 7532


In [160]:
# Sanity check: documents and labels must have matching lengths within each split.
len(X_train), len(y_train), len(X_test), len(y_test)

(11314, 11314, 7532, 7532)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Feature extraction by count vectorization: fit on the training documents, then transform them.
cnt_vect = CountVectorizer()
cnt_vect.fit(X_train)
X_train_cnt_vect = cnt_vect.transform(X_train)

# Transform the test documents with the vectorizer that was fitted on the training data.
X_test_cnt_vect = cnt_vect.transform(X_test)

# Prints the shapes of both the train and the test feature matrices.
print('학습 데이터 Text의 CountVectorizer Shape:',X_train_cnt_vect.shape, X_test_cnt_vect.shape)

In [163]:
# Most important point: once the vectorizer is fitted on the training data, do not fit it
# again on the test set (the feature count would no longer match).

from sklearn.feature_extraction.text import CountVectorizer 

# fit() returns the fitted vectorizer itself.
cnt_vect = CountVectorizer().fit(X_train)

# Turn both document lists into row x column matrices (csr_matrix) using the same vocabulary.
X_train_cnt_vect, X_test_cnt_vect = (cnt_vect.transform(docs) for docs in (X_train, X_test))

print(X_train_cnt_vect.shape, X_test_cnt_vect.shape)

(11314, 101631) (7532, 101631)


In [170]:
# X_train is only a list of documents (rows without feature columns), so it is first turned
# into a row x column matrix by the vectorizer and that matrix is fitted together with y_train.
# LogisticRegression iteration-limit message: increase max_iter (default=100).
# NOTE(review): this cell keeps the default max_iter and hits the iteration limit (see the
# message in its output); the later cells pass max_iter=500.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_clf = LogisticRegression()
lr_clf.fit(X_train_cnt_vect, y_train)
pred = lr_clf.predict(X_test_cnt_vect)

# Accuracy on the held-out test subset.
print(accuracy_score(y_test, pred))

0.6050185873605948


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [181]:
# TF-IDF case: same flow as with CountVectorizer, only the vectorizer class changes.
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit on the training documents only, then reuse the fitted vectorizer for both splits.
tfidf_vect = TfidfVectorizer().fit(X_train)
X_train_tfidf_vect, X_test_tfidf_vect = (tfidf_vect.transform(docs) for docs in (X_train, X_test))

# fit() returns the fitted classifier itself.
lr_clf = LogisticRegression(max_iter=500).fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)

print(accuracy_score(y_test, pred))

0.6736590546999469


In [183]:
# Feature engineering on the vectorization side:
# English stop-word removal, unigrams + bigrams, and a max_df cap of 300.

tfidf_vect2 = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df = 300).fit(X_train)

# Both splits are transformed with the vectorizer fitted on the training documents.
X_train_tfidf_vect2, X_test_tfidf_vect2 = (tfidf_vect2.transform(docs) for docs in (X_train, X_test))

# fit() returns the fitted classifier itself.
lr_clf = LogisticRegression(max_iter=500).fit(X_train_tfidf_vect2, y_train)
pred = lr_clf.predict(X_test_tfidf_vect2)

print(accuracy_score(y_test, pred))

0.6922464152947424


In [185]:
# Hyperparameter tuning of the LogisticRegression regularisation parameter C.
# verbose=1 prints brief progress messages.
# NOTE(review): lr_clf is the classifier object created in the previous cell.

from sklearn.model_selection import GridSearchCV

params = {'C': [0.01, 0.1, 1, 5, 10]}
grid_cv_lr = GridSearchCV(lr_clf, param_grid = params, cv=3, scoring='accuracy', verbose=1)
grid_cv_lr.fit(X_train_tfidf_vect2, y_train)
# NOTE(review): the label says 'best estimator' but the value printed is best_params_.
print('best estimator: ', grid_cv_lr.best_params_)

# Predict on the test features through the fitted grid-search object.
pred = grid_cv_lr.predict(X_test_tfidf_vect2)

print(accuracy_score(y_test, pred))

Fitting 3 folds for each of 5 candidates, totalling 15 fits
best estimator:  {'C': 10}
0.7012745618693574


## Using a pipeline (note: it can be slower than running each step separately)

In [189]:
# Pipeline-only version.
# Everything goes into the Pipeline, so X_train does not have to be transformed separately.
from sklearn.pipeline import Pipeline

# Step 1: TF-IDF vectorization; step 2: logistic regression on the resulting features.
tfidf_step = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=300)
lr_step = LogisticRegression(C=10, max_iter=500)
pipeline = Pipeline([('tfidf_vect2', tfidf_step), ('lr_clf', lr_step)])

# The raw document lists are passed straight in; the pipeline vectorizes them internally.
pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)

print(accuracy_score(y_test, pred))

0.7012745618693574


In [190]:
## Final version
# Pipeline + GridSearchCV

from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('tfidf_vect2', TfidfVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression(max_iter=500)),
])

# Grid keys use the form <step name>__<parameter name> to address a parameter of a pipeline step.
params = {
    'tfidf_vect2__ngram_range': [(1,1), (1,2)],
    'tfidf_vect2__max_df': [100, 300],
    'lr_clf__C': [1, 10],
}

# The whole pipeline is passed in place of a single estimator.
grid_cv_pipe = GridSearchCV(
    pipeline,
    param_grid=params,
    cv=3,
    scoring='accuracy',
    verbose=1,
)
grid_cv_pipe.fit(X_train, y_train)
print('best param:{}, best score:{}'.format(grid_cv_pipe.best_params_, grid_cv_pipe.best_score_))

# Predict on the raw test documents through the fitted grid-search object.
pred = grid_cv_pipe.predict(X_test)

print(accuracy_score(y_test, pred))

Fitting 3 folds for each of 8 candidates, totalling 24 fits
best param:{'lr_clf__C': 10, 'tfidf_vect2__max_df': 300, 'tfidf_vect2__ngram_range': (1, 2)}, best score:0.7538456260353824
0.7012745618693574
