In [13]:
# import library
import pandas as pd
import numpy as np
import re
import nltk
import gensim

#nltk.download('stopwords') # 불용어 다운로드
#nltk.download('punkt') # tokenizer 다운로드

In [6]:
# data 확인

df = pd.read_csv('../spam.csv')
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# v2 column만 가져오기
sentences = df.loc[:,['v2']].copy()
sentences

Unnamed: 0,v2
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives aro..."
...,...
5567,This is the 2nd time we have tried 2 contact u...
5568,Will Ì_ b going to esplanade fr home?
5569,"Pity, * was in mood for that. So...any other s..."
5570,The guy did some bitching but I acted like i'd...


In [8]:
# 특수 문자 제거 / 정규 표현식 이용
sentences['v2'] = sentences['v2'].apply(lambda x: re.sub(r"[^a-zA-Z\s]","",x.strip())) #앞 뒤 공백 제거 후 알파벳만 남기기
sentences

Unnamed: 0,v2
0,Go until jurong point crazy Available only in ...
1,Ok lar Joking wif u oni
2,Free entry in a wkly comp to win FA Cup final...
3,U dun say so early hor U c already then say
4,Nah I dont think he goes to usf he lives aroun...
...,...
5567,This is the nd time we have tried contact u U...
5568,Will b going to esplanade fr home
5569,Pity was in mood for that Soany other suggest...
5570,The guy did some bitching but I acted like id ...


In [9]:
# tokenizing
sentences['tokens'] = sentences['v2'].apply(lambda x: nltk.tokenize.word_tokenize(x))
sentences

Unnamed: 0,v2,tokens
0,Go until jurong point crazy Available only in ...,"[Go, until, jurong, point, crazy, Available, o..."
1,Ok lar Joking wif u oni,"[Ok, lar, Joking, wif, u, oni]"
2,Free entry in a wkly comp to win FA Cup final...,"[Free, entry, in, a, wkly, comp, to, win, FA, ..."
3,U dun say so early hor U c already then say,"[U, dun, say, so, early, hor, U, c, already, t..."
4,Nah I dont think he goes to usf he lives aroun...,"[Nah, I, dont, think, he, goes, to, usf, he, l..."
...,...,...
5567,This is the nd time we have tried contact u U...,"[This, is, the, nd, time, we, have, tried, con..."
5568,Will b going to esplanade fr home,"[Will, b, going, to, esplanade, fr, home]"
5569,Pity was in mood for that Soany other suggest...,"[Pity, was, in, mood, for, that, Soany, other,..."
5570,The guy did some bitching but I acted like id ...,"[The, guy, did, some, bitching, but, I, acted,..."


In [12]:
# 불용어 처리 with nltk stopwords
stopwords = set(nltk.corpus.stopwords.words('english'))

sentences['del_stopwords'] = sentences['tokens'].apply(lambda x: [word for word in x if word not in stopwords])
sentences

Unnamed: 0,v2,tokens,del_stopwords
0,Go until jurong point crazy Available only in ...,"[Go, until, jurong, point, crazy, Available, o...","[Go, jurong, point, crazy, Available, bugis, n..."
1,Ok lar Joking wif u oni,"[Ok, lar, Joking, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]"
2,Free entry in a wkly comp to win FA Cup final...,"[Free, entry, in, a, wkly, comp, to, win, FA, ...","[Free, entry, wkly, comp, win, FA, Cup, final,..."
3,U dun say so early hor U c already then say,"[U, dun, say, so, early, hor, U, c, already, t...","[U, dun, say, early, hor, U, c, already, say]"
4,Nah I dont think he goes to usf he lives aroun...,"[Nah, I, dont, think, he, goes, to, usf, he, l...","[Nah, I, dont, think, goes, usf, lives, around..."
...,...,...,...
5567,This is the nd time we have tried contact u U...,"[This, is, the, nd, time, we, have, tried, con...","[This, nd, time, tried, contact, u, U, Pound, ..."
5568,Will b going to esplanade fr home,"[Will, b, going, to, esplanade, fr, home]","[Will, b, going, esplanade, fr, home]"
5569,Pity was in mood for that Soany other suggest...,"[Pity, was, in, mood, for, that, Soany, other,...","[Pity, mood, Soany, suggestions]"
5570,The guy did some bitching but I acted like id ...,"[The, guy, did, some, bitching, but, I, acted,...","[The, guy, bitching, I, acted, like, id, inter..."


In [15]:
# gensim을 통해 word2vec model 가져오기

w2v = gensim.models.Word2Vec(sentences['del_stopwords'],vector_size=100, window=5, min_count=1, sg=0) # 각 token을 100차원 임베딩, window는 5, sg=0 -> CBOW

In [25]:
df.loc[df['v1'] == 'spam'] # spam만 보기

Unnamed: 0,v1,v2
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
5537,spam,Want explicit SEX in 30 secs? Ring 02073162414...
5540,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,spam,Had your contract mobile 11 Mnths? Latest Moto...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [30]:
# 유사도 보기
w2v.wv.most_similar('hello')

[('Claim', 0.9443881511688232),
 ('apply', 0.9427038431167603),
 ('da', 0.9426981210708618),
 ('close', 0.9426915049552917),
 ('else', 0.94260174036026),
 ('hrs', 0.9423786997795105),
 ('computer', 0.9422488212585449),
 ('birthday', 0.942230761051178),
 ('Nokia', 0.9421374201774597),
 ('soon', 0.9419139623641968)]