<a href="https://colab.research.google.com/github/tjddyd2259/caba_nlp/blob/main/nlp10_Text_Mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
nltk.download('punkt')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
from nltk import sent_tokenize
text_sample = 'Regression analysis is primarily used for two conceptually distinct purposes. First, regression analysis is widely used for prediction and forecasting, where its use has substantial overlap with the field of machine learning. Second, in some situations regression analysis can be used to infer causal relationships between the independent and dependent variables.'
sentences = sent_tokenize(text=text_sample)
print(sentences)
print(type(sentences)), len(sentences)

['Regression analysis is primarily used for two conceptually distinct purposes.', 'First, regression analysis is widely used for prediction and forecasting, where its use has substantial overlap with the field of machine learning.', 'Second, in some situations regression analysis can be used to infer causal relationships between the independent and dependent variables.']
<class 'list'>


(None, 3)

In [3]:
# 단어 토큰화 (word_tokenize) : 공백, 콤마, 마침표, 개행문자, 정규표현식
from nltk import word_tokenize

sentences = 'Regression analysis is primarily used for two conceptually distinct purposes.'
words = word_tokenize(sentences)
print(words)
print(type(words)),len(words)

['Regression', 'analysis', 'is', 'primarily', 'used', 'for', 'two', 'conceptually', 'distinct', 'purposes', '.']
<class 'list'>


(None, 11)

In [4]:
# 문서에 대해서 모든 단어를 토큰화
from nltk import sent_tokenize, word_tokenize

def tokenize_text(text):
  sentences = sent_tokenize(text) # 문장별 분리 토큰
  word_tokens = [word_tokenize(sentence) for sentence in sentences] # 문장별 단어 토큰화 
  return word_tokens

word_tokens = tokenize_text(text_sample)
print(word_tokens)
print(type(word_tokens)) , len(word_tokens)

[['Regression', 'analysis', 'is', 'primarily', 'used', 'for', 'two', 'conceptually', 'distinct', 'purposes', '.'], ['First', ',', 'regression', 'analysis', 'is', 'widely', 'used', 'for', 'prediction', 'and', 'forecasting', ',', 'where', 'its', 'use', 'has', 'substantial', 'overlap', 'with', 'the', 'field', 'of', 'machine', 'learning', '.'], ['Second', ',', 'in', 'some', 'situations', 'regression', 'analysis', 'can', 'be', 'used', 'to', 'infer', 'causal', 'relationships', 'between', 'the', 'independent', 'and', 'dependent', 'variables', '.']]
<class 'list'>


(None, 3)

In [5]:
# 스톱워드 제거 : the , is , a , will 와 같이 문맥적으로 큰 의미가 없는 단어를 제거
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# NLTK english stopwords 갯수 확인
print(len(nltk.corpus.stopwords.words('english')))
print(nltk.corpus.stopwords.words('english')[:20])

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


In [7]:
# stopwords 필터링을 통한 제거
import nltk
stopwords = nltk.corpus.stopwords.words('english')
all_tokens = []
for sentence in word_tokens:
  filtered_words = []
  for word in sentence:
    word = word.lower()
    if word not in stopwords:
      filtered_words.append(word)
  all_tokens.append(filtered_words)
print(all_tokens)

[['regression', 'analysis', 'primarily', 'used', 'two', 'conceptually', 'distinct', 'purposes', '.'], ['first', ',', 'regression', 'analysis', 'widely', 'used', 'prediction', 'forecasting', ',', 'use', 'substantial', 'overlap', 'field', 'machine', 'learning', '.'], ['second', ',', 'situations', 'regression', 'analysis', 'used', 'infer', 'causal', 'relationships', 'independent', 'dependent', 'variables', '.']]


In [8]:
# 문법적 또는 의미적으로 변화하는 단어의 원형을 찾는 방법
# Stemmer(LancasterStemmer)
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()
print(stemmer.stem('working'),stemmer.stem('works'),stemmer.stem('worked'))
print(stemmer.stem('amusing'),stemmer.stem('aumses'),stemmer.stem('amused'))
print(stemmer.stem('fancier'),stemmer.stem('fancist'))

work work work
amus aums amus
fant fant


In [9]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [10]:
# Lemmatizetion(WordNetLemmatizer) : 정확한 원형 단어 추출을 위해 단어의 품사를 직접 입력
from nltk.stem.wordnet import WordNetLemmatizer

lemma = WordNetLemmatizer()
print(lemma.lemmatize('working','v'),lemma.lemmatize('works','v'),lemma.lemmatize('worked','v'))
print(lemma.lemmatize('amusing','v'),lemma.lemmatize('aumses','v'),lemma.lemmatize('amused','v'))
print(lemma.lemmatize('fancier','a'),lemma.lemmatize('fancist','a'))

work work work
amuse aumses amuse
fancy fancist


In [11]:
import numpy as np
num_samples = 100
height = 71
width = 71
num_classes = 100

import tensorflow as tf
from keras.applications import Xception
import datetime
start = datetime.datetime.now()

model = Xception(weights = None,
                 input_shape =( height,width,3),
                 classes = num_classes)
model.compile(loss = 'categorical_crossentropy',
              optimizer = 'rmsprop')
x=np.random.random((num_samples,height,width,3))
y=np.random.random((num_samples,num_classes))

model.fit(x,y,epochs=3,batch_size=16)
model.save('my_model.h5')
end = datetime.datetime.now()
time_delta = end - start

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [12]:
print('걸린 시간: {}초'.format(time_delta.seconds))

걸린 시간: 47초


In [13]:
import numpy as np
num_samples = 100
height = 71
width = 71
num_classes = 100

import tensorflow as tf
from keras.applications import Xception
import datetime
start = datetime.datetime.now()

with tf.device('/cpu:0'):

  model = Xception(weights = None,
                  input_shape =( height,width,3),
                  classes = num_classes)
  model.compile(loss = 'categorical_crossentropy',
                optimizer = 'rmsprop')
  x=np.random.random((num_samples,height,width,3))
  y=np.random.random((num_samples,num_classes))

model.fit(x,y,epochs=3,batch_size=16)
model.save('my_model.h5')
end = datetime.datetime.now()
time_delta = end - start

Epoch 1/3


KeyboardInterrupt: ignored

In [None]:
print('걸린 시간: {}초'.format(time_delta.seconds))

In [None]:
!pip install konlpy

In [None]:
from konlpy.tag import Okt
okt = Okt()
print(okt.morphs('단독입찰보다 복수입찰의 경우'))

In [None]:
print(okt.nouns('유일하게 항공기 체계 종합개발 경험을 갖고 있는 kai는'))

In [None]:
print(okt.nouns('나는 프로젝트를 하고있는데 너무 어렵다'))

In [None]:
print(okt.phrases('날카로운 분석과 신뢰감 있는 진행으로'))

In [None]:
print(okt.pos('이것도 되나욬ㅋㅋ',norm=True))

In [None]:
print(okt.pos('이것도 되나욬ㅋㅋ',norm=True,stem=True))

In [None]:
print(okt.pos('이것도 되나욬ㅋㅋ',norm=True,stem=True,join=True))

In [None]:
print(okt.pos('아름다운 꽃과 파란 하늘',norm=True))

In [None]:
a = okt.pos('아름다운 꽃과 파란 하늘')
b = []
c = []
for i in range(len(a)):
  if a[i][1] == 'Adjective':
    b.append(a[i][0])
  else :
    c.append(a[i][0])



In [None]:
b = []
for i,j in a:
  if j == 'Noun':
    b.append(i)
b

In [None]:
list1 = okt.nouns('나는 오늘 방콕에 가고싶다.')
list2 = okt.pos('나는 오늘 방콕에 갔다.',norm=True,stem=True)
list3 = okt.morphs('친절한 코치와 재미있는 친구들이 있는 도장에 가고 싶다')
list4 = okt.pos('나는 오늘도 장에 가고싶다',norm=True,stem=True,join=True)


In [None]:
from sklearn.datasets import fetch_20newsgroups

news_data = fetch_20newsgroups(subset='all',random_state=0)
news_data.keys()

In [None]:
import pandas as pd
print(news_data.target)
a = pd.Series(news_data.target).unique()
sorted(a)

In [None]:
train_news = fetch_20newsgroups(subset='train',remove=('header','footer','quotes'),random_state=0)
X_train = train_news.data
y_train = train_news.target
test_news = fetch_20newsgroups(subset='test',remove=('header','footer','quotes'),random_state=0)
X_test = test_news.data
y_test = test_news.target
print(len(X_train),len(X_test))

In [None]:
print(news_data.target_names)
print(pd.Series(y_test).value_counts().sort_index())

In [None]:
# 피처 벡터화 변환
from sklearn.feature_extraction.text import CountVectorizer
cnt_vect = CountVectorizer()
cnt_vect.fit(X_train)
X_train_cnt_vect = cnt_vect.transform(X_train)
# 학습 데이터로 fit()된 Countervectorizer를 이용, 테스트 데이터 피처 벡터화 변환 
# (피처 개수가 동일해야 함)
X_test_cnt_vect = cnt_vect.transform(X_test)
print(X_train_cnt_vect.shape)
print(X_test_cnt_vect.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression()
lr.fit(X_train_cnt_vect,y_train)
pred = lr.predict(X_test_cnt_vect)
accuracy_score(y_test,pred)


In [None]:
# 피처 벡터화 변환 : TF-IDF 벡터화
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train_tfidf_vect,y_train)
pred = lr.predict(X_test_tfidf_vect)
accuracy_score(y_test,pred)

In [None]:
# stop words 필터링 추가 , ngram을 기본(1,1)에서 (1,2)로 max_df=300 으로 변경해 피처 벡터화 적용
tfidf_vect = TfidfVectorizer(stop_words='english',ngram_range=(1,2),max_df=300)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train_tfidf_vect,y_train)
pred = lr.predict(X_test_tfidf_vect)
accuracy_score(y_test,pred)

In [None]:
from sklearn.model_selection import GridSearchCV
# 최적 c값 도출 튜닝 수행 , cv 는 3 Fold셋으로 설정
params = {'C':[5,10]}
gcv_lr = GridSearchCV(lr,param_grid=params,cv=3,scoring='accuracy',verbose=1)
gcv_lr.fit(X_train_tfidf_vect,y_train)
print(gcv_lr.best_params_)
lr_pred = gcv_lr.predict(X_test_tfidf_vect)
print(accuracy_score(y_test,lr_pred))

In [None]:
# 사이킷런 파이프라인 
# TfidfVectorizer 객체를 tfidf_vect 객체명으로 LogisticRegression 객체를 lr_clf 객체명으로 생성하는 
# pipeline 생성
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
                     ('tfidf_vect',TfidfVectorizer(stop_words='english',
                                                   ngram_range=(1,2),max_df=300)),
                     ('lr',LogisticRegression(C=10))])
pipeline.fit(X_train,y_train)
pred = pipeline.predict(X_test)
print(accuracy_score(y_test,pred))


In [None]:
# 사이킷런 파이프라인과   GridSearchCV와의 결합 
pipeline = Pipeline([
                     ('tfidf_vect',TfidfVectorizer(stop_words='english')),
                     ('lr',LogisticRegression())
])
params = {'tfidf_vect__ngram_range':[(1,1),(1,2),(1,3)],
          'tfidf_vect__max_df':[100,300,700],
          'lr__C':[1,5,10]}
grid_cv_pipe = GridSearchCV(pipeline,param_grid=params,cv=3,scoring='accuracy',verbose=1)
grid_cv_pipe.fit(X_train,y_train)
print(grid_cv_pipe.best_params_,grid_cv_pipe.best_score_)
pred = grid_cv_pipe.predict(X_test)
print(accuracy_score(y_test,pred))


In [15]:
pwd

'/content'

In [16]:
import pandas as pd 
import numpy as np

train_df = pd.read_csv('ratings_train.txt',sep='\t')
test_df = pd.read_csv('ratings_test.txt',sep='\t')

In [17]:
import re
train_df = train_df.fillna(' ')
train_df['document'] = train_df['document'].apply(lambda x:re.sub(r'\d+',' ',x))
test_df = test_df.fillna(' ')
test_df['document'] = test_df['document'].apply(lambda x:re.sub(r'\d+',' ',x))

In [20]:
!pip install konlpy



In [33]:
train_df

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
...,...,...,...
149995,6222902,인간이 문제지.. 소는 뭔죄인가..,0
149996,8549745,평점이 너무 낮아서...,1
149997,9311800,이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?,0
149998,2376369,청춘 영화의 최고봉.방황과 우울했던 날들의 자화상,1


In [21]:

from konlpy.tag import Okt

okt = Okt()
def okt_tokenizer(text):
    tokens_ko = okt.morphs(text)
    return tokens_ko
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

tfidf_vect = TfidfVectorizer(tokenizer=okt_tokenizer, ngram_range=(1,2),min_df=3,max_df=0.9)
tfidf_vect.fit(train_df.document)
tfidf_matrix_train = tfidf_vect.transform(train_df.document)



KeyboardInterrupt: ignored

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

lr_clf = LogisticRegression
params = {'C':[1,3.5,4.5,5.5,10]}
grid_cv = GridSearchCV(lr_clf,param_grid=params,cv=3,scoring='accuracy',verbose=1)
grid_cv.fit(tfidf_matrix_train,train_df.label)
print(grid_cv.best_params_,round(grid_cv.best_score_,4))

In [25]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

cats = ['rec.motorcycles','rec.sport.baseball','comp.graphics','comp.windows.x',
        'talk.politics.mideast','soc.religion.christian','sci.electronics','sci.med']
      
news_df = fetch_20newsgroups(subset='all',remove=('headers','footers','quotes'),
                             categories = cats, random_state=0)
count_vect = CountVectorizer(max_df = 0.95,max_features=1000,min_df=2,stop_words='english',
                             ngram_range=(1,2))
feat_vect = count_vect.fit_transform(news_df.data)
print('CountVectorizer Shape:',feat_vect.shape)

CountVectorizer Shape: (7862, 1000)


In [28]:
lda = LatentDirichletAllocation(n_components=8,random_state=0)
lda.fit(feat_vect)


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=8, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [29]:
print(lda.components_.shape)

(8, 1000)


In [30]:
lda.components_

array([[3.60992018e+01, 1.35626798e+02, 2.15751867e+01, ...,
        3.02911688e+01, 8.66830093e+01, 6.79285199e+01],
       [1.25199920e-01, 1.44401815e+01, 1.25045596e-01, ...,
        1.81506995e+02, 1.25097844e-01, 9.39593286e+01],
       [3.34762663e+02, 1.25176265e-01, 1.46743299e+02, ...,
        1.25105772e-01, 3.63689741e+01, 1.25025218e-01],
       ...,
       [3.60204965e+01, 2.08640688e+01, 4.29606813e+00, ...,
        1.45056650e+01, 8.33854413e+00, 1.55690009e+01],
       [1.25128711e-01, 1.25247756e-01, 1.25005143e-01, ...,
        9.17278769e+01, 1.25177668e-01, 3.74575887e+01],
       [5.49258690e+01, 4.47009532e+00, 9.88524814e+00, ...,
        4.87048440e+01, 1.25034678e-01, 1.25074632e-01]])

In [31]:
feature_names = count_vect.get_feature_names()
len(feature_names)

1000

In [35]:
def display_topics(model,feature_names,no_top_words):
  for topic_index,topic in enumerate(model.components_):
    print('Topic #', topic_index)
    topic_word_indexes = topic.argsort()[::-1]
    top_indexes = topic_word_indexes[:no_top_words]
    
    feature_concat = ' '.join([feature_names[i] for i in top_indexes])
    print(feature_concat)

feature_names = count_vect.get_feature_names()

display_topics(lda,feature_names,15)
  

Topic # 0
year 10 game medical health team 12 20 disease cancer 1993 games years patients good
Topic # 1
don just like know people said think time ve didn right going say ll way
Topic # 2
image file jpeg program gif images output format files color entry 00 use bit 03
Topic # 3
like know don think use does just good time book read information people used post
Topic # 4
armenian israel armenians jews turkish people israeli jewish government war dos dos turkey arab armenia 000
Topic # 5
edu com available graphics ftp data pub motif mail widget software mit information version sun
Topic # 6
god people jesus church believe christ does christian say think christians bible faith sin life
Topic # 7
use dos thanks windows using window does display help like problem server need know run


문서 군집화
- 비슷한 텍스트 구성의 문서를 군집화하여 같은 카테고리 소속으로 분류
- 학습 데이터 세트가 필요없는 비지도학습 기반으로 동작

In [5]:
!pwd

/content


In [10]:
!pip install konlpy

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.3MB/s 
[?25hCollecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 8.3MB/s 
[?25hCollecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/cd/a5/9781e2ef4ca92d09912c4794642c1653aea7607f473e156cf4d423a881a1/JPype1-1.2.1-cp37-cp37m-manylinux2010_x86_64.whl (457kB)
[K     |████████████████████████████████| 460kB 36.2MB/s 
Installing collected packages: bea

In [6]:
# glob 모듈의 glob 함수는 path로 지정한 디렉토리 밑에 있는 모든 .data 파일들의 파일명을 리스트로 반환

import pandas as pd
import glob, os

path = 'sample_data/topics'
all_files = glob.glob(os.path.join(path,'*.data'))

filename_list = []
opinion_text = []

for file_ in all_files:
  df = pd.read_table(file_, index_col = None, header=0,encoding='latin1')
  filename_ = file_.split('/')[-1]
  filename = filename_.split('.')[0]

  filename_list.append(filename)
  opinion_text.append(df.to_string())

document_df = pd.DataFrame({'filename':filename_list,'opinion_text':opinion_text})
document_df.head()

Unnamed: 0,filename,opinion_text
0,price_holiday_inn_london,...
1,service_holiday_inn_london,...
2,mileage_honda_accord_2008,...
3,performance_honda_accord_2008,...
4,interior_toyota_camry_2007,...


In [7]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [11]:
from nltk.stem import WordNetLemmatizer
import string

remove_punct_dict = dict((ord(punct),None)for punct in string.punctuation)

lemma = WordNetLemmatizer()
def LemTokens(tokens):
  return [lemma.lemmatize(token)for token in tokens]
def LemNormalize(text):
  return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

In [13]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(tokenizer=LemNormalize,stop_words='english',
                             ngram_range=(1,2),min_df=0.05,max_df=0.85)
feature_vect = tfidf_vect.fit_transform(document_df['opinion_text'])

  'stop_words.' % sorted(inconsistent))


In [15]:
from sklearn.cluster import KMeans

km_cluster = KMeans(n_clusters=5,max_iter=10000,random_state=0)
km_cluster.fit(feature_vect)
cluster_label = km_cluster.labels_
cluster_centers = km_cluster.cluster_centers_
document_df['cluster_label'] = cluster_label
document_df.head() 

Unnamed: 0,filename,opinion_text,cluster_label
0,price_holiday_inn_london,...,2
1,service_holiday_inn_london,...,2
2,mileage_honda_accord_2008,...,0
3,performance_honda_accord_2008,...,0
4,interior_toyota_camry_2007,...,0


In [16]:
print(document_df[document_df['cluster_label']==0])
print(document_df[document_df['cluster_label']==1])
print(document_df[document_df['cluster_label']==2])
print(document_df[document_df['cluster_label']==3])
print(document_df[document_df['cluster_label']==4])


                          filename  ... cluster_label
2        mileage_honda_accord_2008  ...             0
3    performance_honda_accord_2008  ...             0
4       interior_toyota_camry_2007  ...             0
25   gas_mileage_toyota_camry_2007  ...             0
27       quality_toyota_camry_2007  ...             0
30  transmission_toyota_camry_2007  ...             0
32         seats_honda_accord_2008  ...             0
35      interior_honda_accord_2008  ...             0
36       comfort_honda_accord_2008  ...             0
39       comfort_toyota_camry_2007  ...             0

[10 rows x 3 columns]
                           filename  ... cluster_label
5    satellite_garmin_nuvi_255W_gps  ...             1
14       speed_garmin_nuvi_255W_gps  ...             1
21             screen_ipod_nano_8gb  ...             1
22       voice_garmin_nuvi_255W_gps  ...             1
23      screen_garmin_nuvi_255W_gps  ...             1
24    accuracy_garmin_nuvi_255W_gps  ...             

In [17]:

km_cluster = KMeans(n_clusters=3,max_iter=10000,random_state=0)
km_cluster.fit(feature_vect)
cluster_label = km_cluster.labels_
cluster_centers = km_cluster.cluster_centers_
document_df['cluster_label'] = cluster_label
document_df.sort_values(by='cluster_label')

Unnamed: 0,filename,opinion_text,cluster_label
25,gas_mileage_toyota_camry_2007,...,0
30,transmission_toyota_camry_2007,...,0
32,seats_honda_accord_2008,...,0
35,interior_honda_accord_2008,...,0
27,quality_toyota_camry_2007,...,0
39,comfort_toyota_camry_2007,...,0
36,comfort_honda_accord_2008,...,0
4,interior_toyota_camry_2007,...,0
3,performance_honda_accord_2008,...,0
2,mileage_honda_accord_2008,...,0


In [18]:
cluster_centers = km_cluster.cluster_centers_
print(cluster_centers.shape)
print(cluster_centers)

(3, 4611)
[[0.         0.00092551 0.         ... 0.         0.         0.        ]
 [0.         0.00099499 0.00174637 ... 0.         0.00183397 0.00144581]
 [0.01005322 0.         0.         ... 0.00706287 0.         0.        ]]


In [26]:
def get_cluster_details(cluster_model, cluster_data,feature_names,clusters_num,top_n_features=10):
  cluster_details = {}
  centroid_feature_ordered_ind = cluster_model.cluster_centers_.argsort()[::-1]
  for cluster_num in range(clusters_num):
    cluster_details[cluster_num] = {}
    cluster_details[cluster_num]['cluster'] = cluster_num

    top_feature_indexes = centroid_feature_ordered_ind[cluster_num,:top_n_features]
    top_features = [feature_names[ind] for ind in top_feature_indexes]

    top_feature_values = cluster_model.cluster_centers_[cluster_num,top_feature_indexes].tolist()

    cluster_details[cluster_num]['top_features'] = top_features
    cluster_details[cluster_num]['top_features_value'] = top_feature_values
    filenames = cluster_data[cluster_data['cluster_label']==cluster_num]['filename']
    cluster_details[cluster_num]['filenames'] = filenames
  
  return cluster_details

def print_cluster_details(cluster_details):
  for cluster_num, cluster_detail in cluster_details.items():
    print('### Cluster {0}'.format(cluster_num))
    print('Top features:',cluster_detail['top_features'])
    print('Review 파일명:',cluster_detail['filenames'][:7])

feature_names = tfidf_vect.get_feature_names()
cluster_details = get_cluster_details(cluster_model=km_cluster,
                                      cluster_data=document_df,
                                      feature_names=feature_names,
                                      clusters_num = 3, top_n_features=10)
print_cluster_details(cluster_details)


### Cluster 0
Top features: ['£6', 'served', 'francisco', 'fran', 'service 111', 'service 33', 'service 34', 'service 53', 'service 57', 'foot']
Review 파일명: 2          mileage_honda_accord_2008
3      performance_honda_accord_2008
4         interior_toyota_camry_2007
25     gas_mileage_toyota_camry_2007
27         quality_toyota_camry_2007
30    transmission_toyota_camry_2007
32           seats_honda_accord_2008
Name: filename, dtype: object
### Cluster 1
Top features: ['0 5', 'iphone', 'intrusive', 'interstate', 'interior trim', 'interior roomy', 'interior quality', 'interior nice', 'interior new', 'interior luxurious']
Review 파일명: 0            price_holiday_inn_london
1          service_holiday_inn_london
7         location_holiday_inn_london
12    service_swissotel_hotel_chicago
13     location_bestwestern_hotel_sfo
15             food_swissotel_chicago
17            food_holiday_inn_london
Name: filename, dtype: object
### Cluster 2
Top features: ['0 5', 'navigator', 'near kensingt