* 자연어 처리 : 음성인식, 요약, 번역, 감성분석, 분류, 질의응답, 챗봇 등등
* 환경 구성 : 아나콘다 (머신러닝, 시각화, 데이터분석 등) + 텐서플로우, 젠심, 파이토치, knolpy 등

1) 텍스트 전처리(토큰화, 정제, 어간추출, 불용어제거, 정수인코딩, 패딩)
2) 텍스트의 수치표현(BoW, DTM/TDM, TF-IDF)
3) 유사도(문서 / 단어 / 문장)
4) 머신/딥 러닝 모델 생성

In [1]:
pip install konlpy

Note: you may need to restart the kernel to use updated packages.


In [2]:
from konlpy.tag import Okt

In [3]:
okt = Okt()

In [4]:
 print(okt.morphs(u'단독입찰보다 복수입찰의 경우'))

['단독', '입찰', '보다', '복수', '입찰', '의', '경우']


In [5]:
print(okt.nouns(u'유일하게 항공기 체계 종합개발 경험을 갖고 있는 KAI는'))

['항공기', '체계', '종합', '개발', '경험']


In [6]:
print(okt.pos(u'이것도 되나욬ㅋㅋ'))

[('이', 'Determiner'), ('것', 'Noun'), ('도', 'Josa'), ('되나욬', 'Noun'), ('ㅋㅋ', 'KoreanParticle')]


In [7]:
from konlpy.tag import *

In [10]:
okt = Okt()
han = Hannanum()
kkma = Kkma()

In [None]:
'아버지가방에들어가신다'

In [11]:
okt.pos('아버지가방에들어가신다')

[('아버지', 'Noun'), ('가방', 'Noun'), ('에', 'Josa'), ('들어가신다', 'Verb')]

In [12]:
han.pos('아버지가방에들어가신다')

[('아버지가방에들어가', 'N'), ('이', 'J'), ('시ㄴ다', 'E')]

In [13]:
kkma.pos('아버지가방에들어가신다')

[('아버지', 'NNG'),
 ('가방', 'NNG'),
 ('에', 'JKM'),
 ('들어가', 'VV'),
 ('시', 'EPH'),
 ('ㄴ다', 'EFN')]

In [16]:
okt.pos('정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다.')

[('정부', 'Noun'),
 ('가', 'Josa'),
 ('발표', 'Noun'),
 ('하는', 'Verb'),
 ('물가상승률', 'Noun'),
 ('과', 'Josa'),
 ('소비자', 'Noun'),
 ('가', 'Josa'),
 ('느끼는', 'Verb'),
 ('물가상승률', 'Noun'),
 ('은', 'Josa'),
 ('다르다', 'Adjective'),
 ('.', 'Punctuation')]

In [17]:
han.pos('정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다.')

[('정부', 'N'),
 ('가', 'J'),
 ('발표', 'N'),
 ('하', 'X'),
 ('는', 'E'),
 ('물가상승률', 'N'),
 ('과', 'J'),
 ('소비자', 'N'),
 ('가', 'J'),
 ('느끼', 'P'),
 ('는', 'E'),
 ('물가상승률', 'N'),
 ('은', 'J'),
 ('다르', 'P'),
 ('다', 'E'),
 ('.', 'S')]

In [18]:
kkma.pos('정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다.')

[('정부', 'NNG'),
 ('가', 'JKS'),
 ('발표', 'NNG'),
 ('하', 'XSV'),
 ('는', 'ETD'),
 ('물가', 'NNG'),
 ('상승률', 'NNG'),
 ('과', 'JC'),
 ('소비자', 'NNG'),
 ('가', 'JKS'),
 ('느끼', 'VV'),
 ('는', 'ETD'),
 ('물가', 'NNG'),
 ('상승률', 'NNG'),
 ('은', 'JX'),
 ('다르', 'VA'),
 ('다', 'EFN'),
 ('.', 'SF')]

In [149]:
from math import log 
docs = [
  '먹고 싶은 사과',
  '먹고 싶은 바나나',
  '길고 노란 바나나 바나나',
  '저는 과일이 좋아요'
]
docs

['먹고 싶은 사과', '먹고 싶은 바나나', '길고 노란 바나나 바나나', '저는 과일이 좋아요']

In [145]:
vocab = list(w for doc in docs for w in doc.split())
vocab

['먹고',
 '싶은',
 '사과',
 '먹고',
 '싶은',
 '바나나',
 '길고',
 '노란',
 '바나나',
 '바나나',
 '저는',
 '과일이',
 '좋아요']

In [28]:
# 코퍼스 (말뭉치) : 전체 문서에 등장하는 단어 집합

In [31]:
import pandas as pd

In [40]:
N = len(docs) 

def tf(t, d):
    return d.count(t)
def idf(t):
    df =0
    for doc in docs:
        df += t in doc
    return log(N/(df+1))

def tfidf(t, d):
    return tf(t,d)* idf(t)

In [43]:
result = []

# 각 문서에 대해서 아래 연산을 반복
for i in range(N):
    result.append([])
    d = docs[i]
    for j in range(len(vocab)):
        t = vocab[j]
        result[-1].append(tf(t, d))

tf_ = pd.DataFrame(result, columns = vocab)

In [45]:
tf_

Unnamed: 0,과일이,길고,노란,먹고,먹고.1,바나나,바나나.1,바나나.2,사과,싶은,싶은.1,저는,좋아요
0,0,0,0,1,1,0,0,0,1,1,1,0,0
1,0,0,0,1,1,1,1,1,0,1,1,0,0
2,0,1,1,0,0,2,2,2,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,1,1


In [46]:
from sklearn.feature_extraction.text import CountVectorizer

In [47]:
vec = CountVectorizer()
vec.fit_transform(docs).toarray()

array([[0, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 2, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 1]], dtype=int64)

In [48]:
vec.vocabulary_

{'먹고': 3,
 '싶은': 6,
 '사과': 5,
 '바나나': 4,
 '길고': 1,
 '노란': 2,
 '저는': 7,
 '과일이': 0,
 '좋아요': 8}

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
result = []
for j in range(len(vocab)):
    t = vocab[j]
    result.append(idf(t))

idf_ = pd.DataFrame(result, index=vocab, columns=["IDF"])
idf_

Unnamed: 0,IDF
과일이,0.693147
길고,0.693147
노란,0.693147
먹고,0.287682
먹고,0.287682
바나나,0.287682
바나나,0.287682
바나나,0.287682
사과,0.693147
싶은,0.287682


In [52]:
result = []
for i in range(N):
    result.append([])
    d = docs[i]
    for j in range(len(vocab)):
        t = vocab[j]
        result[-1].append(tfidf(t,d))

tfidf_ = pd.DataFrame(result, columns = vocab)
tfidf_

Unnamed: 0,과일이,길고,노란,먹고,먹고.1,바나나,바나나.1,바나나.2,사과,싶은,싶은.1,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.287682,0.0,0.0,0.0,0.693147,0.287682,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.287682,0.287682,0.287682,0.0,0.287682,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.0,0.575364,0.575364,0.575364,0.0,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


1) 각 문서(영화)에서 중요 단어 추출
2) 코사인 유사도 기반 가장 유사한 문서(영화) 검색
> 데이터 전처리 nltk

In [57]:
pip install tensorflow

Collecting tensorflow
  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/80/6f/57d36f6507e432d7fc1956b2e9e8530c5c2d2bfcd8821bcbfae271cd6688/tensorflow-2.14.0-cp311-cp311-win_amd64.whl.metadata
  Downloading tensorflow-2.14.0-cp311-cp311-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.14.0 (from tensorflow)
  Obtaining dependency information for tensorflow-intel==2.14.0 from https://files.pythonhosted.org/packages/ad/6e/1bfe367855dd87467564f7bf9fa14f3b17889988e79598bc37bf18f5ffb6/tensorflow_intel-2.14.0-cp311-cp311-win_amd64.whl.metadata
  Downloading tensorflow_intel-2.14.0-cp311-cp311-win_amd64.whl.metadata (4.8 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.14.0->tensorflow)
  Obtaining dependency information for absl-py>=1.0.0 from https://files.pythonhosted.org/packages/01/e4/dc0a1dcc4e74e08d7abedab278c795eef54a224363bb18f5692f416d834f/absl_py-2.0.0-py3-none-any.whl.metadata
  Downloading absl_py-2.0.0-py3-none-any

In [59]:
pip install gensim

Collecting FuzzyTM>=0.4.0 (from gensim)
  Downloading FuzzyTM-2.0.5-py3-none-any.whl (29 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim)
  Downloading pyFUME-0.2.25-py3-none-any.whl (67 kB)
     ---------------------------------------- 0.0/67.1 kB ? eta -:--:--
     ---------------------------------------- 67.1/67.1 kB 1.8 MB/s eta 0:00:00
Collecting simpful (from pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading simpful-2.11.0-py3-none-any.whl (32 kB)
Collecting fst-pso (from pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading fst-pso-1.8.1.tar.gz (18 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting miniful (from fst-pso->pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading miniful-0.0.6.tar.gz (2.8 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: fst-pso, miniful
  Building wheel for fst-pso (setup.py): started
  Building wheel for fs

In [60]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [61]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence

* token(토큰) : 자연어 처리작업을 수행하는 기본 단위,
    * 일반적으로 단어(or 문장, 문단, 문자) 나누는 단위에 따라 생각
* tokenization(토큰화) : 주어진 코퍼스를 토큰 단위로 나누는 작업
    * 자연어 -> 토큰화 -> 세부 작업

In [64]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [65]:
print('단어 토큰화1 :',word_tokenize("Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."))

단어 토큰화1 : ['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [66]:
print('단어 토큰화2 :',WordPunctTokenizer().tokenize("Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."))

단어 토큰화2 : ['Don', "'", 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr', '.', 'Jone', "'", 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [67]:
print('단어 토큰화3 :',text_to_word_sequence("Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."))

단어 토큰화3 : ["don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'mr', "jone's", 'orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']


In [68]:
from nltk.tokenize import sent_tokenize
# 문장 단위 토큰화
text = "His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally, the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to make sure no one was near."
print('문장 토큰화1 :',sent_tokenize(text))

문장 토큰화1 : ['His barber kept his word.', 'But keeping such a huge secret to himself was driving him crazy.', 'Finally, the barber went up a mountain and almost to the edge of a cliff.', 'He dug a hole in the midst of some reeds.', 'He looked about, to make sure no one was near.']


In [69]:
text = "I am actively looking for Ph.D. students. and you are a Ph.D student."
print('문장 토큰화2 :',sent_tokenize(text))

문장 토큰화2 : ['I am actively looking for Ph.D. students.', 'and you are a Ph.D student.']


In [70]:
from konlpy.tag import Okt
from konlpy.tag import Kkma

okt = Okt()
kkma = Kkma()

print('OKT 형태소 분석 :',okt.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
print('OKT 품사 태깅 :',okt.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
print('OKT 명사 추출 :',okt.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))

OKT 형태소 분석 : ['열심히', '코딩', '한', '당신', ',', '연휴', '에는', '여행', '을', '가봐요']
OKT 품사 태깅 : [('열심히', 'Adverb'), ('코딩', 'Noun'), ('한', 'Josa'), ('당신', 'Noun'), (',', 'Punctuation'), ('연휴', 'Noun'), ('에는', 'Josa'), ('여행', 'Noun'), ('을', 'Josa'), ('가봐요', 'Verb')]
OKT 명사 추출 : ['코딩', '당신', '연휴', '여행']


In [71]:
print('꼬꼬마 형태소 분석 :',kkma.morphs("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
print('꼬꼬마 품사 태깅 :',kkma.pos("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))
print('꼬꼬마 명사 추출 :',kkma.nouns("열심히 코딩한 당신, 연휴에는 여행을 가봐요"))

꼬꼬마 형태소 분석 : ['열심히', '코딩', '하', 'ㄴ', '당신', ',', '연휴', '에', '는', '여행', '을', '가보', '아요']
꼬꼬마 품사 태깅 : [('열심히', 'MAG'), ('코딩', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD'), ('당신', 'NP'), (',', 'SP'), ('연휴', 'NNG'), ('에', 'JKM'), ('는', 'JX'), ('여행', 'NNG'), ('을', 'JKO'), ('가보', 'VV'), ('아요', 'EFN')]
꼬꼬마 명사 추출 : ['코딩', '당신', '연휴', '여행']


In [72]:
# 정제, 대소문자통합, 불융어 제거(등장 빈도수가 낮은 단어)

In [73]:
import re
text = "I was wondering if anyone out there could enlighten me on this car."

# 길이가 1~2인 단어들을 정규 표현식을 이용하여 삭제
shortword = re.compile(r'\W*\b\w{1,2}\b') # \W문자숫자 아닌건 \w문자숫자 \b공백문자
print(shrtword.sub('', text))

 was wondering anyone out there could enlighten this car.


In [None]:
# 형태소 = 어간(stem, 단어 중요의미) + 접사(부가의미)
# dogs => dog + s

In [77]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...


True

In [78]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# are, is, am => be(표제어)

words = ['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']

print('표제어 추출 전 :',words)
print('표제어 추출 후 :',[lemmatizer.lemmatize(word) for word in words])

표제어 추출 전 : ['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']
표제어 추출 후 : ['policy', 'doing', 'organization', 'have', 'going', 'love', 'life', 'fly', 'dy', 'watched', 'ha', 'starting']


In [81]:
lemmatizer.lemmatize('is','v')
lemmatizer.lemmatize('are','v')
lemmatizer.lemmatize('watched','v')

'watch'

In [105]:
# 어간(Stemming)
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords

stemmer = PorterStemmer()
lancaster = LancasterStemmer()

In [106]:
words = ['formalize', 'allowance', 'electricical']
print('어간 추출 후 :',[stemmer.stem(word) for word in words])

어간 추출 후 : ['formal', 'allow', 'electric']


In [107]:
import nltk
nltk.download('LancasterStemmer')

[nltk_data] Error loading LancasterStemmer: Package 'LancasterStemmer'
[nltk_data]     not found in index


False

In [108]:
print('어간 추출 후 :',[lancaster.stem(word) for word in words])

어간 추출 후 : ['form', 'allow', 'elect']


In [109]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [110]:
stop_words = set(stopwords.words('english'))
print(stop_words)

{'having', 'they', 'then', "weren't", 'who', 'isn', 'needn', 'are', 'into', 'itself', 'to', 'have', "it's", 'can', 'be', 'his', 'doesn', 'we', 'each', 'were', 'your', 'an', 'ourselves', 'themselves', 'had', "that'll", 'same', 'o', "wouldn't", 'only', 'which', 'too', 'wouldn', 'for', 'by', 'not', 'when', 'what', "doesn't", 'being', 'mightn', "wasn't", 'hasn', "isn't", 'my', 'mustn', 'how', 'shouldn', 'does', 'up', 'no', "haven't", 'where', 'there', 'he', "needn't", 'further', 'hadn', 'again', 'both', "you'd", 'through', 'in', 'them', "mightn't", 've', 'aren', "aren't", "you'll", 'if', 'under', 'nor', 'after', 'as', 'whom', 'him', "don't", 'y', 'own', 'but', 'now', 'don', "she's", 'about', 'few', 'm', 'and', "didn't", 'the', 'it', 'weren', 'all', 'of', 'himself', 'during', 'such', "hadn't", 'so', 'out', 're', 'between', 'am', 'wasn', 'do', 'that', 'some', 's', 'at', 'than', 'hers', 'or', 'yours', 'why', 'been', "should've", 'ma', 'any', 'while', "shan't", 'before', "shouldn't", 'haven', 

In [None]:
# 'overview' 열에서 텍스트를 토큰화하고 불용어 제거
def process_text(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return filtered_tokens

# 'overview' 열에 대해 텍스트 처리 적용
movies['tokenized_overview'] = movies['overview'].apply(process_text)


In [111]:
example = "Family is not an important thing. It's everything."
stop_words = set(stopwords.words('english')) 

word_tokens = word_tokenize(example)

result = []
for word in word_tokens: 
    if word not in stop_words: 
        result.append(word) 

print('불용어 제거 전 :',word_tokens) 
print('불용어 제거 후 :',result)

불용어 제거 전 : ['Family', 'is', 'not', 'an', 'important', 'thing', '.', 'It', "'s", 'everything', '.']
불용어 제거 후 : ['Family', 'important', 'thing', '.', 'It', "'s", 'everything', '.']


In [102]:
example = "고기를 아무렇게나 구우려고 하면 안 돼. 고기라고 다 같은 게 아니거든. 예컨대 삼겹살을 구울 때는 중요한 게 있지."
stop_words = "를 아무렇게나 구 우려 고 안 돼 같은 게 구울 때 는"

stop_words = set(stop_words.split(' '))
word_tokens = okt.morphs(example)

result = [word for word in word_tokens if not word in stop_words]

print('불용어 제거 전 :',word_tokens) 
print('불용어 제거 후 :',result)

불용어 제거 전 : ['고기', '를', '아무렇게나', '구', '우려', '고', '하면', '안', '돼', '.', '고기', '라고', '다', '같은', '게', '아니거든', '.', '예컨대', '삼겹살', '을', '구울', '때', '는', '중요한', '게', '있지', '.']
불용어 제거 후 : ['고기', '하면', '.', '고기', '라고', '다', '아니거든', '.', '예컨대', '삼겹살', '을', '중요한', '있지', '.']


In [112]:
from nltk.corpus import wordnet

In [115]:
synsets = wordnet.synsets("plan")

In [116]:
synsets

[Synset('plan.n.01'),
 Synset('design.n.02'),
 Synset('plan.n.03'),
 Synset('plan.v.01'),
 Synset('plan.v.02'),
 Synset('plan.v.03'),
 Synset('design.v.04')]

In [121]:
plan = wordnet.synset('plan.n.01')
plan.definition()

'a series of steps to be carried out or goals to be accomplished'

In [122]:
wordnet.synset('plan.v.01').definition()

'have the will and intention to carry out some action'

In [130]:
# boy man
wordnet.synsets('boy')
wordnet.synsets('man')

[Synset('man.n.01'),
 Synset('serviceman.n.01'),
 Synset('man.n.03'),
 Synset('homo.n.02'),
 Synset('man.n.05'),
 Synset('man.n.06'),
 Synset('valet.n.01'),
 Synset('man.n.08'),
 Synset('man.n.09'),
 Synset('man.n.10'),
 Synset('world.n.08'),
 Synset('man.v.01'),
 Synset('man.v.02')]

In [137]:
wordnet.synsets('boy')[1].path_similarity(wordnet.synsets('man')[0])


0.5

In [138]:
boy = wordnet.synsets('boy')[1]
man = wordnet.synsets('man')[0]
boy.path_similarity(man)

0.5

In [143]:
wordnet.synsets('computer')
# car 단어에 5개의 동의어 그룹이 있음
computer = wordnet.synset('computer.n.01')
computer.definition()

'a machine for performing calculations automatically'

In [144]:
computer.lemma_names()

['computer',
 'computing_machine',
 'computing_device',
 'data_processor',
 'electronic_computer',
 'information_processing_system']