# Text Mining
- 텍스트로부터 정보를 추출해 내는 기법

In [1]:
!pip install wordcloud



In [2]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK data packages used in this notebook: the "punkt"
# tokenizer package and the "stopwords" corpus.
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Stopwords (불용어)
- 언어마다 분석에 큰 의미가 없는 단어(관사, 전치사, 조사 등)가 많음.
  - 영어: at, the, a, an...
  - 한국어: 은, 는, 이, 가, ...
- 불용어 사전을 이용해서 단어를 제거

In [3]:
# Preview the first 10 entries of the English stopword list.
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [4]:
from nltk import word_tokenize

# Sample English review text used by the tokenization and stopword-removal
# cells that follow.
sample = "I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!"
print(sample)

I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!


In [5]:
# For English text, convert to lowercase first, then run preprocessing.
sample_lower = sample.lower()
# Show the token list produced by word_tokenize for the lowercased text.
print(word_tokenize(sample_lower))

['i', 'feel', 'so', 'lucky', 'to', 'have', 'found', 'this', 'used', '(', 'phone', 'to', 'us', '&', 'not', 'used', 'hard', 'at', 'all', ')', ',', 'phone', 'on', 'line', 'from', 'someone', 'upgraded', 'and', 'sold', 'this', 'one', '.', 'my', 'son', 'liked', 'his', 'old', 'one', 'that', 'finally', 'fell', 'apart', 'after', '2.5+', 'years', 'and', 'did', "n't", 'want', 'an', 'upgrade', '!', '!', 'thank', 'you', 'seller', ',', 'we', 'really', 'appreciate', 'it', '&', 'your', 'honesty', 're', ':', 'said', 'used', 'phone.i', 'recommend', 'this', 'seller', 'very', 'highly', '&', 'would', 'but', 'from', 'them', 'again', '!', '!']


In [6]:
# Remove stopwords
sample_word_tokens = word_tokenize(sample_lower)

# Build the stopword set once, outside the loop. Calling
# stopwords.words('english') inside the condition rebuilds the whole list for
# every token and then scans it linearly; a set gives O(1) membership tests.
english_stopwords = set(stopwords.words('english'))

# Corpus
sample_without_stopwords = []

for token in sample_word_tokens:
    if token not in english_stopwords:
        sample_without_stopwords.append(token)

print(sample_without_stopwords)

['feel', 'lucky', 'found', 'used', '(', 'phone', 'us', '&', 'used', 'hard', ')', ',', 'phone', 'line', 'someone', 'upgraded', 'sold', 'one', '.', 'son', 'liked', 'old', 'one', 'finally', 'fell', 'apart', '2.5+', 'years', "n't", 'want', 'upgrade', '!', '!', 'thank', 'seller', ',', 'really', 'appreciate', '&', 'honesty', ':', 'said', 'used', 'phone.i', 'recommend', 'seller', 'highly', '&', 'would', '!', '!']


In [8]:
# Using Comprehension
sample_word_tokens = word_tokenize(sample_lower)

# Build the stopword set once. Calling stopwords.words('english') inside the
# comprehension condition would rebuild the list for every token and scan it
# linearly; a set gives O(1) membership tests.
english_stopwords = set(stopwords.words('english'))

sample_without_stopwords = [
    token for token in sample_word_tokens if token not in english_stopwords
]
print(sample_without_stopwords)

['feel', 'lucky', 'found', 'used', '(', 'phone', 'us', '&', 'used', 'hard', ')', ',', 'phone', 'line', 'someone', 'upgraded', 'sold', 'one', '.', 'son', 'liked', 'old', 'one', 'finally', 'fell', 'apart', '2.5+', 'years', "n't", 'want', 'upgrade', '!', '!', 'thank', 'seller', ',', 'really', 'appreciate', '&', 'honesty', ':', 'said', 'used', 'phone.i', 'recommend', 'seller', 'highly', '&', 'would', '!', '!']


# WordCloud

In [9]:
# draw_wordcloud()

# spaCy

In [10]:
import spacy
from collections import Counter
# Shows the current progress of a loop while it runs
from tqdm import tqdm

# save
# import pickle

# with open('tokens_preprocessed.pickle', 'wb') as f:
#     pickle.dump(tokens_preprocessed, f, pickle.HIGHEST_PROTOCOL)

# # load
# import pickle

# with open('./tokens_preprocessed.pickle', 'rb') as f:
#   tokens_preprocessed = pickle.load(f)

# tokens_preprocessed[:3]

## Bigram & Trigram
- Phrase Modeling

In [11]:
# Uses the gensim package, which bundles several natural language processing libraries.
from gensim.models.phrases import Phrases, Phraser

# Toy corpus: each string is one sentence of whitespace-separated tokens.
sentences = [
    '피카츄 라이츄 파이리 꼬부기',
    '피카츄 라이츄 의 진화 전 포켓몬 입니다',
    '피카츄 라이츄 파이리 꼬부기',
    '피카츄 라이츄 파이리 는 친구 입니다',
]

# Split every sentence on whitespace to get one token list per sentence.
word_lsts = list(map(str.split, sentences))
word_lsts

[['피카츄', '라이츄', '파이리', '꼬부기'],
 ['피카츄', '라이츄', '의', '진화', '전', '포켓몬', '입니다'],
 ['피카츄', '라이츄', '파이리', '꼬부기'],
 ['피카츄', '라이츄', '파이리', '는', '친구', '입니다']]

In [13]:
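# min_count : minimum number of times a bigram must occur
# threshold : the smaller the value, the more readily two tokens are joined. Default is 10.0; must be positive.
#             With a small value such as 0.01, almost every pair is treated as a compound.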
# bigram = Phrases(word_lsts, min_count=1, threshold=1, delimiter=b"_")
# bigram_phraser = Phraser(bigram)

# for w_l in word_lsts:
#   bigram_s = bigram_phraser[w_l]

#   print("유니그램 : {}".format(w_l))
#   print("바이그램 : {}".format(bigram_s))
#   print("-"*40)

# trigram : apply phrase detection once more on top of the bigram output.
#  bigram_phraser[word_lsts] : the bigram-transformed version of every sentence
# trigram = Phrases(bigram_phraser[word_lsts], min_count=1, threshold=1.0, delimiter=b"_")
# trigram_phraser = Phraser(trigram)

# bigram_vocab_set = set(bigram.vocab.keys())
# trigram_vocab_set = set(trigram.vocab.keys())

# print("== Bigram Model vocab")
# print({vocab.decode('utf-8') for vocab in bigram_vocab_set})
# print("--"*20)
# print("== Trigram Model vocab")
# print({vocab.decode('utf-8') for vocab in trigram_vocab_set})
# print("--"*20)