In [2]:
from sklearn.datasets import load_files
import numpy as np

reviews_train = load_files("/Users/wnd180/Downloads/aclImdb/train")
reviews_test = load_files("/Users/wnd180/Downloads/aclImdb/test")

text_train, y_train = reviews_train.data, reviews_train.target
text_test, y_test = reviews_test.data, reviews_test.target

text_train = [doc.replace(b"<br /", b" ") for doc in text_train]
text_test = [doc.replace(b"<br /", b" ") for doc in text_test]

print("학습 데이터의 문서 수: {}".format(len(text_train)))
print("학습 데이터의 문서 수: {}".format(len(text_test)))


학습 데이터의 문서 수: 25000
학습 데이터의 문서 수: 25000


In [3]:
print("클래스별 샘플 수 (학습 데이터): {}".format(np.bincount(y_train)))
print("클래스별 샘플 수 (테스트 데이터): {}".format(np.bincount(y_test)))

클래스별 샘플 수 (학습 데이터): [12500 12500]
클래스별 샘플 수 (테스트 데이터): [12500 12500]


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer().fit(text_train)
X_train = vect.transform(text_train)
print("X_train:\n{}".format(repr(X_train)))

X_train:
<25000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 3431196 stored elements in Compressed Sparse Row format>


In [5]:
features_names = vect.get_feature_names()
print("특성 개수:{}".format(len(features_names)))
print("첫 20개의 특성:\n{}".format(features_names[:20]))

특성 개수:74849
첫 20개의 특성:
['00', '000', '0000000000001', '00001', '00015', '000s', '001', '003830', '006', '007', '0079', '0080', '0083', '0093638', '00am', '00pm', '00s', '01', '01pm', '02']




In [6]:
print("20010에서 20030까지의 특성:\n{}".format(features_names[20010:20030]))
print("매 2000번째 특성:\n{}".format(features_names[::2000]))

20010에서 20030까지의 특성:
['dratted', 'draub', 'draught', 'draughts', 'draughtswoman', 'draw', 'drawback', 'drawbacks', 'drawer', 'drawers', 'drawing', 'drawings', 'drawl', 'drawled', 'drawling', 'drawn', 'draws', 'draza', 'dre', 'drea']
매 2000번째 특성:
['00', 'aesir', 'aquarian', 'barking', 'blustering', 'bête', 'chicanery', 'condensing', 'cunning', 'detox', 'draper', 'enshrined', 'favorit', 'freezer', 'goldman', 'hasan', 'huitieme', 'intelligible', 'kantrowitz', 'lawful', 'maars', 'megalunged', 'mostey', 'norrland', 'padilla', 'pincher', 'promisingly', 'receptionist', 'rivals', 'schnaas', 'shunning', 'sparse', 'subset', 'temptations', 'treatises', 'unproven', 'walkman', 'xylophonist']


In [7]:
# 불용어 제거
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print("불용어 개수: {}".format(len(ENGLISH_STOP_WORDS)))
#english_stop_words는 frozenset 클래스 -> 리스트로 변환해야 함.

print("불용어 일부:\n{}".format(list(ENGLISH_STOP_WORDS)[:20]))

불용어 개수: 318
불용어 일부:
['thin', 'any', 'within', 'afterwards', 'toward', 're', 'less', 'indeed', 'herself', 'else', 'six', 'why', 'fill', 'more', 'almost', 'too', 'much', 'twelve', 'a', 'she']


In [8]:
vect = CountVectorizer(min_df=5, stop_words="english").fit(text_train)
X_train = vect.transform(text_train)
print("불용어가 제거된 x_train: \n{}".format(repr(X_train)))

불용어가 제거된 x_train: 
<25000x26966 sparse matrix of type '<class 'numpy.int64'>'
	with 2149958 stored elements in Compressed Sparse Row format>


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?'
]

vect1 = CountVectorizer().fit(corpus)
tf = vect1.transform(corpus)
features_names = vect1.get_feature_names()
print("Term:{}".format(features_names[:]))
print(tf.toarray())
print()

vect2 = TfidfVectorizer().fit(corpus)
tfidf = vect2.transform(corpus)
print(tfidf.toarray())

Term:['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]

[[0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]
 [0.         0.27230147 0.         0.27230147 0.         0.85322574
  0.22262429 0.         0.27230147]
 [0.55280532 0.         0.         0.         0.55280532 0.
  0.28847675 0.55280532 0.        ]
 [0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]]


In [10]:
bards_words = ['The fool doth think he is wise,', 
'but the wise man knows himself to be a fool']

cv = CountVectorizer(ngram_range=(1,1)).fit(bards_words)
print("어휘사전의 크기",len(cv.vocabulary_))
print("어휘사전",cv.get_feature_names())
print("변환된 데이터", cv.transform(bards_words).toarray())

어휘사전의 크기 13
어휘사전 ['be', 'but', 'doth', 'fool', 'he', 'himself', 'is', 'knows', 'man', 'the', 'think', 'to', 'wise']
변환된 데이터 [[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


In [11]:
cv2 = CountVectorizer(ngram_range=(2,2)).fit(bards_words)
print("어휘사전의 크기",len(cv2.vocabulary_))
print("어휘사전",cv2.get_feature_names())
print("변환된 데이터", cv2.transform(bards_words).toarray())

어휘사전의 크기 14
어휘사전 ['be fool', 'but the', 'doth think', 'fool doth', 'he is', 'himself to', 'is wise', 'knows himself', 'man knows', 'the fool', 'the wise', 'think he', 'to be', 'wise man']
변환된 데이터 [[0 0 1 1 1 0 1 0 0 1 0 1 0 0]
 [1 1 0 0 0 1 0 1 1 0 1 0 1 1]]


In [12]:
cv1to3 = CountVectorizer(ngram_range=(1,3)).fit(bards_words)
print("어휘사전의 크기",len(cv1to3.vocabulary_))
print("어휘사전",cv1to3.get_feature_names())
print("변환된 데이터", cv1to3.transform(bards_words).toarray())

어휘사전의 크기 39
어휘사전 ['be', 'be fool', 'but', 'but the', 'but the wise', 'doth', 'doth think', 'doth think he', 'fool', 'fool doth', 'fool doth think', 'he', 'he is', 'he is wise', 'himself', 'himself to', 'himself to be', 'is', 'is wise', 'knows', 'knows himself', 'knows himself to', 'man', 'man knows', 'man knows himself', 'the', 'the fool', 'the fool doth', 'the wise', 'the wise man', 'think', 'think he', 'think he is', 'to', 'to be', 'to be fool', 'wise', 'wise man', 'wise man knows']
변환된 데이터 [[0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 1 1 1 0 0 0
  1 0 0]
 [1 1 1 1 1 0 0 0 1 0 0 0 0 0 1 1 1 0 0 1 1 1 1 1 1 1 0 0 1 1 0 0 0 1 1 1
  1 1 1]]


In [13]:
#char 로 해보기
analyzer = CountVectorizer(analyzer="char").fit(bards_words)
print("어휘사전 크기", len(analyzer.vocabulary_))
print("어휘사전", analyzer.get_feature_names())
print("변환된 데이터", analyzer.transform(bards_words).toarray())


어휘사전 크기 18
어휘사전 [' ', ',', 'a', 'b', 'd', 'e', 'f', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 's', 't', 'u', 'w']
변환된 데이터 [[6 1 0 0 1 3 1 4 3 1 1 0 1 3 2 3 0 1]
 [9 0 2 2 0 4 2 2 2 1 2 2 2 4 3 3 1 2]]


In [14]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

example_words = ['python','pythoner','pythoning','pythoned','puthonly']
for stem in example_words:
    print(ps.stem(stem))

python
python
python
python
puthonli


In [17]:
new_text =  "It is important to be very pythonly while you are pythoning with python"

words = word_tokenize(new_text)
print(words)

for w in words:
    print(ps.stem(w))

['It', 'is', 'important', 'to', 'be', 'very', 'pythonly', 'while', 'you', 'are', 'pythoning', 'with', 'python']
it
is
import
to
be
veri
pythonli
while
you
are
python
with
python
