### 1.1 실습용 영문기사 수집
- 온라인 기사를 바로 수집하여 실습데이터로 사용  
https://www.forbes.com/sites/adrianbridgwater/2019/04/15/what-drove-the-ai-renaissance/

In [2]:
import requests
from bs4 import BeautifulSoup

In [4]:
url = 'https://www.forbes.com/sites/adrianbridgwater/2019/04/15/what-drove-the-ai-renaissan'
response = requests.get(url)
soup = BeautifulSoup(response.text,'html.parser')

In [5]:
eng_news = soup.select('p') #[class="speakable-paragraph"]')
eng_text = eng_news[3].get_text()
eng_text

'Loni Love, Comedian and TV Personality'

### 1.2 영문 토큰화
- https://www.nltk.org/api/nltk.tokenize.html

In [6]:
!pip install nltk



In [10]:
#word_tokenize() : 단어와 구두점(온점(.), 컴마(,), 물음표(?), 세미콜론(;), 느낌표(!) 등과 같은 기호)으로 구 
import nltk 
nltk.download('punkt')
from nltk.tokenize import word_tokenize 
text = 'Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks.' 
word_tokens = word_tokenize(text) 
print(word_tokens) 


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']


In [12]:
#word_tokenize() : 단어와 구두점(온점(.), 컴마(,), 물음표(?), 세미콜론(;), 느낌표(!) 등과 같은 기호)으로 구 
import nltk 
from nltk.tokenize import word_tokenize 
token1 = word_tokenize(eng_text) 
print(token1) 

['Loni', 'Love', ',', 'Comedian', 'and', 'TV', 'Personality']


In [13]:
#WordPunctTokenizer() : 알파벳과 알파벳이 아닌문자를 구분하여 토큰화
import nltk
from nltk.tokenize import WordPunctTokenizer
text = 'Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks.'
wordpuncttoken = WordPunctTokenizer().tokenize(text)
print(wordpuncttoken)

['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']


In [14]:
#TreebankWordTokenizer() : 정규표현식에 기반한 토큰화
import nltk
from nltk.tokenize import TreebankWordTokenizer
text = 'Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks.'
treebankwordtoken = TreebankWordTokenizer().tokenize(text)
print(treebankwordtoken)

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']


### 1.3 영문 품사 부착 (PoS Tagging)
-분리한 토큰마다 품사를 부착한다  
https://www.nltk.org/api/nltk.tag.html  
태크목록 : https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/

In [15]:
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
taggedToken = pos_tag(word_tokens)
print(taggedToken)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[('Good', 'JJ'), ('muffins', 'NNS'), ('cost', 'VBP'), ('$', '$'), ('3.88', 'CD'), ('in', 'IN'), ('New', 'NNP'), ('York', 'NNP'), ('.', '.'), ('Please', 'NNP'), ('buy', 'VB'), ('me', 'PRP'), ('two', 'CD'), ('of', 'IN'), ('them', 'PRP'), ('.', '.'), ('Thanks', 'NNS'), ('.', '.')]


### 1.4 개체명 인식 (NER, Named Entity Recognition)
-http://www.nltk.org/api/nltk.chunk.html

In [17]:
nltk.download('words')
nltk.download('maxent_ne_chunker')

from nltk import ne_chunk
neToken = ne_chunk(taggedToken)
print(neToken)

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
(S
  (GPE Good/JJ)
  muffins/NNS
  cost/VBP
  $/$
  3.88/CD
  in/IN
  (GPE New/NNP York/NNP)
  ./.
  Please/NNP
  buy/VB
  me/PRP
  two/CD
  of/IN
  them/PRP
  ./.
  Thanks/NNS
  ./.)


### 1.5 원형 복원
- 각 토큰의 원형을 복원하여 표준화 한다.

#### 1.5.1 어간추출 (Stemming)
- 규칙에 기반 하여 토큰을 표준화
- ning제거, ful 제거 등  
https://www.nltk.org/api/nltk.stem.html  
규칙상세 : https://tartarus.org/martin/PorterStemmer/def.txt  

In [22]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
print("running -> " + ps.stem("running"))
# print("beautiful -> " + ps stem("beautiful"))
print("believes -> " + ps.stem("believes"))
print("using -> " + ps.stem("using"))
print("conversation -> " + ps.stem("conversation"))
print("organization -> " + ps.stem("organization"))
print("studies -> " + ps.stem("studies"))

running -> run
believes -> believ
using -> use
conversation -> convers
organization -> organ
studies -> studi


#### 1.5.2 표제어 추출 (Lemmatization)
품사정보를 보존하여 토큰을 표준화  
http://www.nltk.org/api/nltk.stem.html?highlight=lemmatizer

In [23]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()
print("running -> " + wl.lemmatize("running"))
print("beautiful -> " + wl.lemmatize("beautiful"))
print("believes -> " + wl.lemmatize("believes"))
print("using -> " + wl.lemmatize("using"))
print("conversation -> " + wl.lemmatize("conversation"))
print("organization -> " + wl.lemmatize("organization"))
print("studies -> " + wl.lemmatize("studies"))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
running -> running
beautiful -> beautiful
believes -> belief
using -> using
conversation -> conversation
organization -> organization
studies -> study


### 1.6 불용어 처리 (Stopword)

In [25]:
stopPos = ['IN', 'CC', 'UH', 'TO', 'MD', 'DT', 'VBZ','VBP']
# 최빈어 조회. 최빈어를 조회하여 불용어 제거 대상을 선정
from collections import Counter
Counter(taggedToken).most_common()

stopWord = [',','be','able']
word = []
for tag in taggedToken:
  if tag[1] not in stopPos:
    if tag[0] not in stopWord:
      word.append(tag[0])
print(word)

['Good', 'muffins', '$', '3.88', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'them', '.', 'Thanks', '.']


### 1.7 영문 텍스트 전처리 종합

In [27]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('words')
nltk.download('maxent_ne_chunker')
nltk.download('wordnet')

from nltk.tokenize import TreebankWordTokenizer
sumtoken = TreebankWordTokenizer().tokenize("Obama loves fried chicken of KFC")
print(sumtoken)
from nltk import pos_tag
sumTaggedToken = pos_tag(sumtoken)
print(sumTaggedToken)
from nltk import ne_chunk
sumNeToken = ne_chunk(sumTaggedToken)
print(sumNeToken)
from nltk.stem import PorterStemmer
ps = PorterStemmer()
print("loves -> " + ps.stem("loves"))
print("fried -> " + ps.stem("fried"))
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()
print("loves -> " + wl.lemmatize("loves"))
print("fried -> " + wl.lemmatize("fried"))
#불용어 처리
sumStopPos = ['IN']
sumStopWord = ['fried']
word = []
for tag in sumTaggedToken:
  if tag[1] not in sumStopPos:
    if tag[0] not in sumStopWord:
      word.append(tag[0])
print(word)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
['Obama', 'loves', 'fried', 'chicken', 'of', 'KFC']
[('Obama', 'NNP'), ('loves', 'VBZ'), ('fried', 'VBN'), ('chicken', 'NN'), ('of', 'IN'), ('KFC', 'NNP')]
(S
  (GPE Obama/NNP)
  loves/VBZ
  fried/VBN
  chicken/NN
  of/IN
  (ORGANIZATION KFC/NNP))
loves -> love
fried -> fri
loves -> love
fried -> fried
['Obama', 'loves', 'chicken', 'KFC']


### 2.2 한글 토큰화 및 형태소 분석
- 한글 자연어처리기 비교  
https://konlpy.org/ko/latest/morph/

In [28]:
# konlpy 설치
!pip install konlpy

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.4MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/c9/dc/45cdef1b4d119eb96316b3117e6d5708a08029992b2fee2c143c7a0a5cc5/colorama-0.4.3-py2.py3-none-any.whl
Collecting tweepy>=3.7.0
  Downloading https://files.pythonhosted.org/packages/bb/7c/99d51f80f3b77b107ebae2634108717362c059a41384a1810d13e2429a81/tweepy-3.9.0-py2.py3-none-any.whl
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/2d/9b/e115101a833605b3c0e6f3a2bc1f285c95aaa1d93ab808314ca1bde63eed/JPype1-0.7.5-cp36-cp36m-manylinux2010_x86_64.whl (3.6MB)
[K     |████████████████████████████████| 3.6MB 41.0MB/s 
[?25hCollecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237

In [31]:
# 코모란(Komoran) 토큰화
from konlpy.tag import Komoran
komoran= Komoran()
kor_text = "인간이 컴퓨터와 대화하고 있다는 것을 깨닫지 못하고 인간과 대화를 계속할 수 있다면 컴퓨터는 지능적인 것으"
komoran_tokens = komoran.morphs(kor_text)
print(komoran_tokens)


#한나눔(Hannanum) 토큰화
from konlpy.tag import Hannanum
hannanum= Hannanum()
kor_text = "인간이 컴퓨터와 대화하고 있다는 것을 깨닫지 못하고 인간과 대화를 계속할 수 있다면 컴퓨터는 지능적인 것으"
hannanum_tokens = hannanum.morphs(kor_text)
print(hannanum_tokens)


#Okt 토큰화
from konlpy.tag import Okt
okt= Okt()
kor_text = "인간이 컴퓨터와 대화하고 있다는 것을 깨닫지 못하고 인간과 대화를 계속할 수 있다면 컴퓨터는 지능적인 것으"
okt_tokens = okt.morphs(kor_text)
print(okt_tokens)


#Kkma 토큰화
from konlpy.tag import Kkma
kkma= Kkma()
kor_text = "인간이 컴퓨터와 대화하고 있다는 것을 깨닫지 못하고 인간과 대화를 계속할 수 있다면 컴퓨터는 지능적인 것으"
kkma_tokens = kkma.morphs(kor_text)
print(kkma_tokens)

['인간', '이', '컴퓨터', '와', '대화', '하', '고', '있', '다는', '것', '을', '깨닫', '지', '못하', '고', '인간', '과', '대화', '를', '계속', '하', 'ㄹ', '수', '있', '다면', '컴퓨터', '는', '지능', '적', '이', 'ㄴ', '것', '으']
['인간', '이', '컴퓨터', '와', '대화', '하고', '있', '다는', '것', '을', '깨닫', '지', '못하', '고', '인간', '과', '대화', '를', '계속', '하', 'ㄹ', '수', '있', '다면', '컴퓨터', '는', '지능적', '이', 'ㄴ', '것으']
['인간', '이', '컴퓨터', '와', '대화', '하고', '있다는', '것', '을', '깨닫지', '못', '하고', '인간', '과', '대화', '를', '계속', '할', '수', '있다면', '컴퓨터', '는', '지능', '적', '인', '것', '으']
['인간', '이', '컴퓨터', '와', '대화', '하', '고', '있', '다는', '것', '을', '깨닫', '지', '못하', '고', '인간', '과', '대화', '를', '계속', '하', 'ㄹ', '수', '있', '다면', '컴퓨터', '는', '지능', '적', '이', 'ㄴ', '것', '으']


#### 2.3 한글 품사 부착 (PoS Tagging)
- PoS Tag 목록  
https://docs.google.com/spreadsheets/u/1/d/1OGAjUvalBuXoZvZ_-  
9tEfYD2gQe7hTGsgUpiiBSXI8/edit#gid=0

In [33]:
# 코모란(Komoran) 품사 태깅
komoranTag = []
for token in komoran_tokens:
  komoranTag += komoran.pos(token)
print(komoranTag)


# 한나눔(Hannanum) 품사 태깅
hannanumTag = []
for token in hannanum_tokens:
  hannanumTag += hannanum.pos(token)
print(hannanumTag)


#Okt 품사 태깅
oktTag = []
for token in okt_tokens:
  oktTag += okt.pos(token)
print(oktTag)


#Kkma 품사 태깅
kkmaTag = []
for token in kkma_tokens:
  kkmaTag += kkma.pos(token)
print(kkmaTag)

[('인간', 'NNG'), ('이', 'MM'), ('컴퓨터', 'NNG'), ('오', 'VV'), ('아', 'EC'), ('대화', 'NNG'), ('하', 'NNG'), ('고', 'MM'), ('있', 'VV'), ('달', 'VV'), ('는', 'ETM'), ('것', 'NNB'), ('을', 'NNG'), ('깨닫', 'VV'), ('지', 'NNB'), ('못', 'MAG'), ('하', 'MAG'), ('고', 'MM'), ('인간', 'NNG'), ('과', 'NNG'), ('대화', 'NNG'), ('를', 'JKO'), ('계속', 'MAG'), ('하', 'NNG'), ('ㄹ', 'NA'), ('수', 'NNB'), ('있', 'VV'), ('다면', 'NNG'), ('컴퓨터', 'NNG'), ('늘', 'VV'), ('ㄴ', 'ETM'), ('지능', 'NNP'), ('적', 'NNB'), ('이', 'MM'), ('ㄴ', 'JX'), ('것', 'NNB'), ('으', 'NNG')]
[('인간', 'N'), ('이', 'M'), ('컴퓨터', 'N'), ('와', 'I'), ('대화', 'N'), ('하', 'P'), ('고', 'E'), ('있', 'N'), ('다', 'M'), ('는', 'J'), ('것', 'N'), ('을', 'N'), ('깨닫', 'N'), ('지', 'N'), ('못하', 'P'), ('어', 'E'), ('고', 'M'), ('인간', 'N'), ('과', 'N'), ('대화', 'N'), ('를', 'N'), ('계속', 'M'), ('하', 'I'), ('ㄹ', 'N'), ('수', 'N'), ('있', 'N'), ('다면', 'N'), ('컴퓨터', 'N'), ('늘', 'P'), ('ㄴ', 'E'), ('지능적', 'N'), ('이', 'M'), ('ㄴ', 'N'), ('것으', 'N')]
[('인간', 'Noun'), ('이', 'Noun'), ('컴퓨터', 'Noun'), ('와', 'Ve

### 2.4 불용어(Stopword) 처리
- 분석에 불필요한 품사를 제거하고, 불필요한 단어(불용어)를 제거한다

In [34]:
#불용어 처리
stopPos = ['Suffix','Punctuation','Josa','Foreign','Alpha','Number']

#최빈어 조회. 최빈어를 조회하여 불용어 제거 대상을 선정
from collections import Counter
Counter(oktTag).most_common()

[(('인간', 'Noun'), 2),
 (('컴퓨터', 'Noun'), 2),
 (('대화', 'Noun'), 2),
 (('하고', 'Verb'), 2),
 (('것', 'Noun'), 2),
 (('이', 'Noun'), 1),
 (('와', 'Verb'), 1),
 (('있다는', 'Adjective'), 1),
 (('을', 'Josa'), 1),
 (('깨닫지', 'Verb'), 1),
 (('못', 'Noun'), 1),
 (('과', 'Noun'), 1),
 (('를', 'Noun'), 1),
 (('계속', 'Noun'), 1),
 (('할', 'Verb'), 1),
 (('수', 'Noun'), 1),
 (('있다면', 'Adjective'), 1),
 (('는', 'Verb'), 1),
 (('지능', 'Noun'), 1),
 (('적', 'Noun'), 1),
 (('인', 'Noun'), 1),
 (('으', 'Adverb'), 1)]

In [38]:
stopWord = ['의','이','로','두고','들','를','은','과','수','했다','것','있는','한다','하는','그','있다']
word = []
for tag in oktTag:
  if tag[1] not in stopPos:
    if tag[0] not in stopWord:
      word.append(tag[0])

print(word)

['인간', '컴퓨터', '와', '대화', '하고', '있다는', '깨닫지', '못', '하고', '인간', '대화', '계속', '할', '있다면', '컴퓨터', '는', '지능', '적', '인', '으']


### 2 N-gram

In [42]:
import nltk
from nltk import bigrams, word_tokenize
from nltk.util import ngrams
nltk.download('punkt')

sentence = "I am a boy."
tokens = word_tokenize(sentence)

bigram = bigrams(tokens)
trigram = ngrams(tokens, 3)

for t in bigram:
  print(t)

for t in trigram:
  print(t)

import nltk
nltk.download('movie_reviews')
nltk.download('punkt')

from nltk.corpus import movie_reviews
sentences = []
for tokens in movie_reviews.sents():
  bigram = ngrams(tokens, 2, pad_left=True, pad_right=True, left_pad_symbol="SS") #right_p
  sentences += [t for t in bigram]

sentences[:20]

movie_reviews.sents()

sentences[-5:]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
('I', 'am')
('am', 'a')
('a', 'boy')
('boy', '.')
('I', 'am', 'a')
('am', 'a', 'boy')
('a', 'boy', '.')
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[('whatever', 'it'), ('it', 'may'), ('may', 'be'), ('be', '.'), ('.', None)]