In [1]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["You know I want your love. because I love you"]
vector = CountVectorizer() # 인스턴스 생성

vector.fit(corpus) # 단어 사전 생성
print(f"vocabulary 단어 : {vector.vocabulary_}")

result = vector.transform(corpus).toarray() # 단어별 빈도수
print(f"문장 단어 빈도수 : {corpus} => {result}")

vocabulary 단어 : {'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}
문장 단어 빈도수 : ['You know I want your love. because I love you'] => [[1 1 2 1 2 1]]


In [10]:
import pandas as pd
from math import log

docs = docs = [ '먹고 싶은 사과', '먹고 싶은 바나나', '길고 노란 바나나 바나나', '저는 과일이 좋아요'  ]

vocab = list(set(w for doc in docs for w in doc.split()))
vocab.sort()

print(f"vocab => {vocab}")

# 총 문서의 수 
N = len(docs)

# 문서에 나타난 단어 빈도 수
def tf(t, d):
 return d.count(t)

# 총 문서에서 단어가 나타난 빈도
def idf(t):
 df=0
 for doc in docs :   df += t in doc
 return log( N/(df+1) )

# TF-IDF 계산 
def tfidf(t, d):
 return tf(t,d) & idf(t)

result =[]
# 각 문서에 대해서 아래 연산 반복
for i in range(N):
 result.append([])
 d = docs[i]
 for j in range(len(vocab)):
  t = vocab[j]
  result[-1].append(tf(t, d))
     
tf_ = pd.DataFrame(result, columns=vocab)
tf_


vocab => ['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']


Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [8]:
for i in range(5):
 print("a")

a
a
a
a
a


In [11]:
# 토큰화
from nltk.tokenize import word_tokenize

text = "Happy, new year! Don't stop."

result = word_tokenize(text)
print(result)

['Happy', ',', 'new', 'year', '!', 'Do', "n't", 'stop', '.']


In [16]:
# 구두점 분리 토큰화
from nltk.tokenize import WordPunctTokenizer

text = "Happy, new year! Don't stop."

wp_tokenizer = WordPunctTokenizer()
result = wp_tokenizer.tokenize(text)
print(result)

['Happy', ',', 'new', 'year', '!', 'Don', "'", 't', 'stop', '.']


In [18]:
# 문장 단위 토큰화
from nltk import sent_tokenize
text = 'The Matrix is everywhere its all around us, here even in this room. \
You can see it out your window or on your television. \
You feel it when you go to work, or go to church or pay your taxes.'
result = sent_tokenize(text)
print(result)


['The Matrix is everywhere its all around us, here even in this room.', 'You can see it out your window or on your television.', 'You feel it when you go to work, or go to church or pay your taxes.']


In [19]:
import re
text = "I was wondering if anyone out there could enlighten me on this car."
# 길이가 1~2인 단어들 정규 표현식 이용하여 삭제
shortword = re.compile(r'\W*\b\w{1,2}\b')
print(shortword.sub('', text))

 was wondering anyone out there could enlighten this car.


In [25]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words_list = stopwords.words('english')

print("불용어 개수 :", len(stop_words_list))
print("불용어 10개 출력 :", stop_words_list[:10])

# stop words 제거
example = "Family is not an important thing. It's everything"
stop_words = set(stopwords.words('english')) # 굳이 set을 왜?

word_tokens = word_tokenize(example)

result = []
for word in word_tokens:
 if word not in stop_words_list:result.append(word)
 
print(f"불용어 제거 전 : {word_tokens}, 불용어 제거 후 : {result}")

불용어 개수 : 179
불용어 10개 출력 : ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
불용어 제거 전 : ['Family', 'is', 'not', 'an', 'important', 'thing', '.', 'It', "'s", 'everything'], 불용어 제거 후 : ['Family', 'important', 'thing', '.', 'It', "'s", 'everything']


In [26]:
# 어간 추출
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.tokenize import word_tokenize

porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()

words =  [ 'policy', 'the going', 'am', 'doing', 'organization', 'having', 'going', 'love',
 'lives', 'fly', 'dies', 'watched', 'has', 'starting']

print(f"어간 추출 전 : {words}")

어간 추출 전 : ['policy', 'the going', 'am', 'doing', 'organization', 'having', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']


In [28]:
from nltk.stem import WordNetLemmatizer

words =  ['policy', 'the going', 'am', 'doing', 'organization', 'having', 'going', 'love', 'lives', 'fly', 
'dies', 'watched', 'has', 'starting']

lemmatizer = WordNetLemmatizer()
result = [lemmatizer.lemmatize(word) for word in words]

print(f"표제어 추출 전 : {words}")
print(f"표제어 추출 후 : {result}")

표제어 추출 전 : ['policy', 'the going', 'am', 'doing', 'organization', 'having', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']
표제어 추출 후 : ['policy', 'the going', 'am', 'doing', 'organization', 'having', 'going', 'love', 'life', 'fly', 'dy', 'watched', 'ha', 'starting']


In [30]:
# 정규표현식
from nltk.tokenize import RegexpTokenizer

text = "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop"

tokenizer1 = RegexpTokenizer("[\w]+")
tokenizer2 = RegexpTokenizer("\s+", gaps=True)

print(tokenizer1.tokenize(text))
print(tokenizer2.tokenize(text))

['Don', 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', 'Mr', 'Jone', 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']
["Don't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name,', 'Mr.', "Jone's", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop']


# 자연어 전처리 적용

In [31]:
import pandas as pd

In [40]:
df = pd.read_csv("imdb.tsv", delimiter="\t")
df

Unnamed: 0.1,Unnamed: 0,review
0,0,"Watching Time Chasers, it obvious that it was ..."
1,1,I saw this film about 20 years ago and remembe...
2,2,"Minor Spoilers In New York, Joan Barnard (Elvi..."
3,3,I went to see this film with a great deal of e...
4,4,"Yes, I agree with everyone on this site this m..."
5,5,"Jennifer Ehle was sparkling in \""Pride and Pre..."
6,6,Amy Poehler is a terrific comedian on Saturday...
7,7,A plane carrying employees of a large biotech ...
8,8,"A well made, gritty science fiction movie, it ..."
9,9,Incredibly dumb and utterly predictable story ...


## 대소문자 통합

In [41]:
df["review"] = df["review"].str.lower()
df["review"][0]

'watching time chasers, it obvious that it was made by a bunch of friends. maybe they were sitting around one day in film school and said, \\"hey, let\'s pool our money together and make a really bad movie!\\" or something like that. what ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc. all corners were cut, except the one that would have prevented this film\'s release. life\'s like that.'

## 단어 토큰화

In [42]:
df["word_tokens"] = df["review"].apply(word_tokenize) 

In [43]:
print(df["word_tokens"][0])

['watching', 'time', 'chasers', ',', 'it', 'obvious', 'that', 'it', 'was', 'made', 'by', 'a', 'bunch', 'of', 'friends', '.', 'maybe', 'they', 'were', 'sitting', 'around', 'one', 'day', 'in', 'film', 'school', 'and', 'said', ',', '\\', "''", 'hey', ',', 'let', "'s", 'pool', 'our', 'money', 'together', 'and', 'make', 'a', 'really', 'bad', 'movie', '!', '\\', "''", 'or', 'something', 'like', 'that', '.', 'what', 'ever', 'they', 'said', ',', 'they', 'still', 'ended', 'up', 'making', 'a', 'really', 'bad', 'movie', '--', 'dull', 'story', ',', 'bad', 'script', ',', 'lame', 'acting', ',', 'poor', 'cinematography', ',', 'bottom', 'of', 'the', 'barrel', 'stock', 'music', ',', 'etc', '.', 'all', 'corners', 'were', 'cut', ',', 'except', 'the', 'one', 'that', 'would', 'have', 'prevented', 'this', 'film', "'s", 'release', '.', 'life', "'s", 'like', 'that', '.']


## 데이터 정제

In [45]:
%load_ext autoreload
%autoreload 2

from NLP_preprocess import clean_by_freq, clean_by_len, clean_by_stopwords

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [48]:
stopwords_set = set(stopwords.words('english'))
type(stopwords.words('english'))

list

In [50]:
df["cleaned_tokens"] = df["word_tokens"].apply(lambda x : clean_by_freq(x, 1))
df["cleaned_tokens"] = df["cleaned_tokens"].apply(lambda x :clean_by_len(x, 2))
df["cleaned_tokens"] = df["cleaned_tokens"].apply(lambda x :clean_by_stopwords(x, stopwords_set))
# apply는 적용할 함수 이름 하나만 넣을 수 있어가지고,,, 내가 만든 함수 쓰려면 lambda를 써야함 

In [60]:
def sum_(x):
 return sum(x)

test_df = pd.DataFrame([[1, 2, 3],[4, 5, 6],[7, 5, 9]])
test_df.apply(sum_)

0    12
1    12
2    18
dtype: int64