<a href="https://colab.research.google.com/github/tjddyd2259/caba_nlp/blob/main/nlp10_Text_Mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import nltk
nltk.download('punkt')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
from nltk import sent_tokenize
text_sample = 'Regression analysis is primarily used for two conceptually distinct purposes. First, regression analysis is widely used for prediction and forecasting, where its use has substantial overlap with the field of machine learning. Second, in some situations regression analysis can be used to infer causal relationships between the independent and dependent variables.'
sentences = sent_tokenize(text=text_sample)
print(sentences)
print(type(sentences)), len(sentences)

['Regression analysis is primarily used for two conceptually distinct purposes.', 'First, regression analysis is widely used for prediction and forecasting, where its use has substantial overlap with the field of machine learning.', 'Second, in some situations regression analysis can be used to infer causal relationships between the independent and dependent variables.']
<class 'list'>


(None, 3)

In [4]:
# 단어 토큰화 (word_tokenize) : 공백, 콤마, 마침표, 개행문자, 정규표현식
from nltk import word_tokenize

sentences = 'Regression analysis is primarily used for two conceptually distinct purposes.'
words = word_tokenize(sentences)
print(words)
print(type(words)),len(words)

['Regression', 'analysis', 'is', 'primarily', 'used', 'for', 'two', 'conceptually', 'distinct', 'purposes', '.']
<class 'list'>


(None, 11)

In [5]:
# 문서에 대해서 모든 단어를 토큰화
from nltk import sent_tokenize, word_tokenize

def tokenize_text(text):
  sentences = sent_tokenize(text) # 문장별 분리 토큰
  word_tokens = [word_tokenize(sentence) for sentence in sentences] # 문장별 단어 토큰화 
  return word_tokens

word_tokens = tokenize_text(text_sample)
print(word_tokens)
print(type(word_tokens)) , len(word_tokens)

[['Regression', 'analysis', 'is', 'primarily', 'used', 'for', 'two', 'conceptually', 'distinct', 'purposes', '.'], ['First', ',', 'regression', 'analysis', 'is', 'widely', 'used', 'for', 'prediction', 'and', 'forecasting', ',', 'where', 'its', 'use', 'has', 'substantial', 'overlap', 'with', 'the', 'field', 'of', 'machine', 'learning', '.'], ['Second', ',', 'in', 'some', 'situations', 'regression', 'analysis', 'can', 'be', 'used', 'to', 'infer', 'causal', 'relationships', 'between', 'the', 'independent', 'and', 'dependent', 'variables', '.']]
<class 'list'>


(None, 3)

In [6]:
# 스톱워드 제거 : the , is , a , will 와 같이 문맥적으로 큰 의미가 없는 단어를 제거
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# NLTK english stopwords 갯수 확인
print(len(nltk.corpus.stopwords.words('english')))
print(nltk.corpus.stopwords.words('english')[:20])

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


In [8]:
# stopwords 필터링을 통한 제거
import nltk
stopwords = nltk.corpus.stopwords.words('english')
all_tokens = []
for sentence in word_tokens:
  filtered_words = []
  for word in sentence:
    word = word.lower()
    if word not in stopwords:
      filtered_words.append(word)
  all_tokens.append(filtered_words)
print(all_tokens)

[['regression', 'analysis', 'primarily', 'used', 'two', 'conceptually', 'distinct', 'purposes', '.'], ['first', ',', 'regression', 'analysis', 'widely', 'used', 'prediction', 'forecasting', ',', 'use', 'substantial', 'overlap', 'field', 'machine', 'learning', '.'], ['second', ',', 'situations', 'regression', 'analysis', 'used', 'infer', 'causal', 'relationships', 'independent', 'dependent', 'variables', '.']]


In [9]:
# 문법적 또는 의미적으로 변화하는 단어의 원형을 찾는 방법
# Stemmer(LancasterStemmer)
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()
print(stemmer.stem('working'),stemmer.stem('works'),stemmer.stem('worked'))
print(stemmer.stem('amusing'),stemmer.stem('aumses'),stemmer.stem('amused'))
print(stemmer.stem('fancier'),stemmer.stem('fancist'))

work work work
amus aums amus
fant fant


In [10]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [11]:
# Lemmatizetion(WordNetLemmatizer) : 정확한 원형 단어 추출을 위해 단어의 품사를 직접 입력
from nltk.stem.wordnet import WordNetLemmatizer

lemma = WordNetLemmatizer()
print(lemma.lemmatize('working','v'),lemma.lemmatize('works','v'),lemma.lemmatize('worked','v'))
print(lemma.lemmatize('amusing','v'),lemma.lemmatize('aumses','v'),lemma.lemmatize('amused','v'))
print(lemma.lemmatize('fancier','a'),lemma.lemmatize('fancist','a'))

work work work
amuse aumses amuse
fancy fancist


In [12]:
import numpy as np
num_samples = 100
height = 71
width = 71
num_classes = 100

import tensorflow as tf
from keras.applications import Xception
import datetime
start = datetime.datetime.now()

model = Xception(weights = None,
                 input_shape =( height,width,3),
                 classes = num_classes)
model.compile(loss = 'categorical_crossentropy',
              optimizer = 'rmsprop')
x=np.random.random((num_samples,height,width,3))
y=np.random.random((num_samples,num_classes))

model.fit(x,y,epochs=3,batch_size=16)
model.save('my_model.h5')
end = datetime.datetime.now()
time_delta = end - start

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [13]:
print('걸린 시간: {}초'.format(time_delta.seconds))

걸린 시간: 48초


In [14]:
import numpy as np
num_samples = 100
height = 71
width = 71
num_classes = 100

import tensorflow as tf
from keras.applications import Xception
import datetime
start = datetime.datetime.now()

with tf.device('/cpu:0'):

  model = Xception(weights = None,
                  input_shape =( height,width,3),
                  classes = num_classes)
  model.compile(loss = 'categorical_crossentropy',
                optimizer = 'rmsprop')
  x=np.random.random((num_samples,height,width,3))
  y=np.random.random((num_samples,num_classes))

model.fit(x,y,epochs=3,batch_size=16)
model.save('my_model.h5')
end = datetime.datetime.now()
time_delta = end - start

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [15]:
print('걸린 시간: {}초'.format(time_delta.seconds))

걸린 시간: 11초


In [16]:
!pip install konlpy



In [17]:
from konlpy.tag import Okt
okt = Okt()
print(okt.morphs('단독입찰보다 복수입찰의 경우'))

['단독', '입찰', '보다', '복수', '입찰', '의', '경우']


In [18]:
print(okt.nouns('유일하게 항공기 체계 종합개발 경험을 갖고 있는 kai는'))

['항공기', '체계', '종합', '개발', '경험']


In [19]:
print(okt.nouns('나는 프로젝트를 하고있는데 너무 어렵다'))

['나', '프로젝트']


In [20]:
print(okt.phrases('날카로운 분석과 신뢰감 있는 진행으로'))

['날카로운 분석', '날카로운 분석과 신뢰감', '날카로운 분석과 신뢰감 있는 진행', '분석', '신뢰', '진행']


In [21]:
print(okt.pos('이것도 되나욬ㅋㅋ',norm=True))

[('이', 'Determiner'), ('것', 'Noun'), ('도', 'Josa'), ('되나요', 'Verb'), ('ㅋㅋ', 'KoreanParticle')]


In [22]:
print(okt.pos('이것도 되나욬ㅋㅋ',norm=True,stem=True))

[('이', 'Determiner'), ('것', 'Noun'), ('도', 'Josa'), ('되다', 'Verb'), ('ㅋㅋ', 'KoreanParticle')]


In [23]:
print(okt.pos('이것도 되나욬ㅋㅋ',norm=True,stem=True,join=True))

['이/Determiner', '것/Noun', '도/Josa', '되다/Verb', 'ㅋㅋ/KoreanParticle']


In [24]:
print(okt.pos('아름다운 꽃과 파란 하늘',norm=True))

[('아름다운', 'Adjective'), ('꽃', 'Noun'), ('과', 'Josa'), ('파란', 'Noun'), ('하늘', 'Noun')]


In [25]:
a = okt.pos('아름다운 꽃과 파란 하늘')
b = []
c = []
for i in range(len(a)):
  if a[i][1] == 'Adjective':
    b.append(a[i][0])
  else :
    c.append(a[i][0])



In [26]:
b = []
for i,j in a:
  if j == 'Noun':
    b.append(i)
b

['꽃', '파란', '하늘']

In [27]:
list1 = okt.nouns('나는 오늘 방콕에 가고싶다.')
list2 = okt.pos('나는 오늘 방콕에 갔다.',norm=True,stem=True)
list3 = okt.morphs('친절한 코치와 재미있는 친구들이 있는 도장에 가고 싶다')
list4 = okt.pos('나는 오늘도 장에 가고싶다',norm=True,stem=True,join=True)


In [28]:
from sklearn.datasets import fetch_20newsgroups

news_data = fetch_20newsgroups(subset='all',random_state=0)
news_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [29]:
import pandas as pd
print(news_data.target)
a = pd.Series(news_data.target).unique()
sorted(a)

[ 6  1 15 ...  0  5  8]


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [30]:
train_news = fetch_20newsgroups(subset='train',remove=('header','footer','quotes'),random_state=0)
X_train = train_news.data
y_train = train_news.target
test_news = fetch_20newsgroups(subset='test',remove=('header','footer','quotes'),random_state=0)
X_test = test_news.data
y_test = test_news.target
print(len(X_train),len(X_test))

11314 7532


In [31]:
print(news_data.target_names)
print(pd.Series(y_test).value_counts().sort_index())

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
0     319
1     389
2     394
3     392
4     385
5     395
6     390
7     396
8     398
9     397
10    399
11    396
12    393
13    396
14    394
15    398
16    364
17    376
18    310
19    251
dtype: int64


In [32]:
# 피처 벡터화 변환
from sklearn.feature_extraction.text import CountVectorizer
cnt_vect = CountVectorizer()
cnt_vect.fit(X_train)
X_train_cnt_vect = cnt_vect.transform(X_train)
# 학습 데이터로 fit()된 Countervectorizer를 이용, 테스트 데이터 피처 벡터화 변환 
# (피처 개수가 동일해야 함)
X_test_cnt_vect = cnt_vect.transform(X_test)
print(X_train_cnt_vect.shape)
print(X_test_cnt_vect.shape)

(11314, 120756)
(7532, 120756)


In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression()
lr.fit(X_train_cnt_vect,y_train)
pred = lr.predict(X_test_cnt_vect)
accuracy_score(y_test,pred)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7550451407328731

In [34]:
# 피처 벡터화 변환 : TF-IDF 벡터화
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train_tfidf_vect,y_train)
pred = lr.predict(X_test_tfidf_vect)
accuracy_score(y_test,pred)

0.7841210833775889

In [35]:
# stop words 필터링 추가 , ngram을 기본(1,1)에서 (1,2)로 max_df=300 으로 변경해 피처 벡터화 적용
tfidf_vect = TfidfVectorizer(stop_words='english',ngram_range=(1,2),max_df=300)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train_tfidf_vect,y_train)
pred = lr.predict(X_test_tfidf_vect)
accuracy_score(y_test,pred)

0.774429102496017

In [37]:
from sklearn.model_selection import GridSearchCV
# 최적 c값 도출 튜닝 수행 , cv 는 3 Fold셋으로 설정
params = {'C':[5,10]}
gcv_lr = GridSearchCV(lr,param_grid=params,cv=3,scoring='accuracy',verbose=1)
gcv_lr.fit(X_train_tfidf_vect,y_train)
print(gcv_lr.best_params_)
lr_pred = gcv_lr.predict(X_test_tfidf_vect)
print(accuracy_score(y_test,lr_pred))

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative sol

{'C': 10}
0.7980616038236856


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# 사이킷런 파이프라인 
# TfidfVectorizer 객체를 tfidf_vect 객체명으로 LogisticRegression 객체를 lr_clf 객체명으로 생성하는 
# pipeline 생성
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
                     ('tfidf_vect',TfidfVectorizer(stop_words='english',
                                                   ngram_range=(1,2),max_df=300)),
                     ('lr',LogisticRegression(C=10))])
pipeline.fit(X_train,y_train)
pred = pipeline.predict(X_test)
print(accuracy_score(y_test,pred))


In [None]:
# 사이킷런 파이프라인과   GridSearchCV와의 결합 
pipeline = Pipeline([
                     ('tfidf_vect',TfidfVectorizer(stop_words='english')),
                     ('lr',LogisticRegression())
])
params = {'tfidf_vect__ngram_range':[(1,1),(1,2),(1,3)],
          'tfidf_vect__max_df':[100,300,700],
          'lr__C':[1,5,10]}
grid_cv_pipe = GridSearchCV(pipeline,param_grid=params,cv=3,scoring='accuracy',verbose=1)
grid_cv_pipe.fit(X_train,y_train)
print(grid_cv_pipe.best_params_,grid_cv_pipe.best_score_)
pred = grid_cv_pipe.predict(X_test)
print(accuracy_score(y_test,pred))
