## Data Loading

In [None]:
import pandas as pd
df = pd.read_csv('./data/edu_news.csv')

In [None]:
df = df[df['category']==1]

## Clean up sentences

In [None]:
import re

def preprocessing(sentence):
    sentence =re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z]', ' ', sentence)
    return sentence

In [None]:
df['content_cleaned'] = df['content'].apply(preprocessing)
content = df['content_cleaned'].tolist()

In [None]:
df.describe()

## CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

counter = CountVectorizer()
c = counter.fit_transform(content)

## Kakao Stemmer

In [None]:
from khaiii import KhaiiiApi

In [None]:
kakao_stemmer = KhaiiiApi()

In [None]:
def noun_tokenize(sentence):
    result = kakao_stemmer.analyze(sentence)
    word_tokens = [l.lex for mor in result for l in mor.morphs if l.tag == 'NNG']
    ret = ' '.join(word_tokens)
    return ret

In [None]:
def tokenize(sentence):
    result = kakao_stemmer.analyze(sentence)
    word_tokens = [l.lex for mor in result for l in mor.morphs]
    ret = ' '.join(word_tokens)
    return ret

In [None]:
df['tokenized_noun'] = df['content_cleaned'].apply(noun_tokenize)

In [None]:
df['tokenized_noun'].head()

In [None]:
df['tokenized'] = df['content_cleaned'].apply(tokenize)
df['tokenized'].head()

In [None]:
tokenized_sentence = df['tokenized_noun'].tolist()

In [None]:
len(tokenized_sentence)

In [None]:
df.to_csv('./data/news_tokenized.csv', index=False)

## Starts from Here

In [None]:
import pandas as pd
df = pd.read_csv('./data/news_tokenized.csv')

In [None]:
tokenized_sentence = df['tokenized'].tolist()

## Import Pytorch

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import random
import numpy as np

## Flatten

In [None]:
flattened = ""
for sentence in tokenized_sentence:
    flattened+=(sentence)
    
flat = flattened.split()

In [None]:
flat[:10]

In [None]:
from collections import Counter
word_counter = Counter(flat)

In [None]:
word_list = word_counter.most_common()
word_dict = dict(word_list)

## Generate word2idx, idx2word

In [None]:
word2idx = dict()
idx2word = dict()

In [None]:
for idx, value in enumerate(word_dict.keys()):
    word2idx[value] = idx
    idx2word[idx] = value
    
last_idx = len(word2idx)
word2idx['<UNK>'] = last_idx
idx2word[last_idx] = '<UNK>'

In [None]:
word2idx['학생'], idx2word[3]

In [None]:
len(word2idx), len(idx2word)

## One Hot Encoding Look Up Table

In [None]:
one_hot_lookup = torch.eye(len(word2idx))

In [None]:
one_hot_lookup[word2idx['학교']]

## Word to idx

In [None]:
flat_idx = [word2idx[x] for x in flat]

In [None]:
flat_idx[:5]

In [None]:
flat_one_hot = [one_hot_lookup[x] for x in flat_idx]

## Create Batch

In [None]:
def create_batch(data, idx=0, batch_size=20, skip_gram=2)
    x = []
    target = []
    max_length = len(data)
    for i in range(batch_size):
        if (idx + i +) >= max_length:
            break
        x.append(data[i+idx])
        target.append()

In [None]:
import random

def generate_input(dataset, num_skips):
    random.shuffle(dataset)  # 문장 단위로 셔플한다.

    # 일차원 array로 만든다. (window를 돌리기 위해!)
    flatten = []
    for list_ in dataset:
        flatten += list_

    # (나는, 그녀를 보았다.) => (i:그녀를, l:나는), (i:그녀를, l:보았다)
    data = []
    label = []
    for idx in range(num_skips, len(flatten)-num_skips):
        data.append(flatten[idx])
        data.append(flatten[idx])
        label.append([flatten[idx-1]])
        label.append([flatten[idx+1]])
    return data, label

## Skip gram dataset build

In [None]:
def create_skipgram_dataset(text):
    import random
    data = []
    for i in range(2, len(text) - 2):
        data.append((text[i], text[i-2], 1))
        data.append((text[i], text[i-1], 1))
        data.append((text[i], text[i+1], 1))
        data.append((text[i], text[i+2], 1))
        # negative sampling
        for _ in range(4):
            if random.random() < 0.5 or i >= len(text) - 3:
                rand_id = random.randint(0, i-1)
            else:
                rand_id = random.randint(i+3, len(text)-1)
            data.append((text[i], text[rand_id], 0))
    return data

In [None]:
sample_size = 1500

In [None]:
skipgram_train = create_skipgram_dataset(flat[:sample_size])

## Define SkipGram Model

In [None]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embd_size):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embd_size)
    
    def forward(self, focus, context):
        embed_focus = self.embeddings(focus).view((1, -1))
        embed_ctx = self.embeddings(context).view((1, -1))
        score = torch.mm(embed_focus, torch.t(embed_ctx))
        log_probs = F.logsigmoid(score)
    
        return log_probs

In [None]:
len(skipgram_train)

In [None]:
vocab_size = len(word2idx)
embd_size = 2

model = SkipGram(vocab_size, embd_size)

use_cuda = torch.cuda.is_available()
if use_cuda:
    model = model.cuda()
    print("We are using GPU")

In [None]:
learning_rate = 0.5
n_epoch = 30

In [None]:
loss_fn = nn.MSELoss()

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

model.train()

for epoch in range(n_epoch):
    total_loss = .0
    idx = 0
    for in_w, out_w, target in skipgram_train:
        idx+=1
        in_w_var = Variable(torch.LongTensor([word2idx[in_w]]))
        out_w_var = Variable(torch.LongTensor([word2idx[out_w]]))
        target = Variable(torch.FloatTensor([target]))
        if use_cuda:
            in_w_var = in_w_var.cuda()
            out_w_var = out_w_var.cuda()
            target = target.cuda()

        model.zero_grad()
        log_probs = model(in_w_var, out_w_var)
        loss = loss_fn(log_probs[0], target)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print("epoch: {}, loss: {:.5f}".format(epoch, total_loss))

In [None]:
len(skipgram_train)

In [None]:
print('====Test SkipGram===')
model.eval()

correct_ct = 0

for in_w, out_w, target in skipgram_train:
    in_w_var = Variable(torch.LongTensor([word2idx[in_w]]))
    out_w_var = Variable(torch.LongTensor([word2idx[out_w]]))
    
    if use_cuda:
        in_w_var = in_w_var.cuda()
        out_w_var = out_w_var.cuda()
    
    log_probs = model(in_w_var, out_w_var)
    _, predicted = torch.max(log_probs, 1)
    predicted = predicted[0]
    if predicted == target:
        correct_ct += 1

print('Accuracy: {:.1f}% ({:d}/{:d})'.format(correct_ct/len(skipgram_train)*100, correct_ct, len(skipgram_train)))

In [None]:
words = pd.DataFrame()

## CS244 구현

In [1]:
import pandas as pd
df = pd.read_csv('./data/news_tokenized.csv')
# tokenized_sentence = df['tokenized'].tolist()

In [2]:
tokenized_sentence = df['tokenized_noun'].tolist()

In [3]:
len(tokenized_sentence)

4339

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import random
import numpy as np

In [5]:
torch.cuda.empty_cache()

In [6]:
flattened = ""
for sentence in tokenized_sentence:
    flattened+=(sentence)
corpus = flattened.split()

In [7]:
len(corpus)

957037

In [10]:
word_limit = 750000
corpus = corpus[:word_limit]
len(corpus)

750000

In [11]:
from collections import Counter
word_counter = Counter(corpus)
word_list = word_counter.most_common()
word_dict = dict(word_list)

## more than 500 words

In [None]:
w = [word for word in word_list if word[1] > 200]

In [None]:
word_dict = dict(w)
len(word_dict)

## Generate word2idx

In [12]:
word2idx = dict()
idx2word = dict()

In [13]:
for idx, value in enumerate(word_dict.keys()):
    word2idx[value] = idx
    idx2word[idx] = value
    
last_idx = len(word2idx)
word2idx['<UNK>'] = last_idx
idx2word[last_idx] = '<UNK>'

In [14]:
len(word2idx), len(idx2word)

(27365, 27365)

In [15]:
use_cuda = torch.cuda.is_available()
if use_cuda:
    print("We are using GPU")

We are using GPU


In [16]:
def create_skipgram_dataset_neg_sampl(text):
    import random
    data = []
    for i in range(2, len(text) - 2):
        data.append((text[i], text[i-2], 1))
        data.append((text[i], text[i-1], 1))
        data.append((text[i], text[i+1], 1))
        data.append((text[i], text[i+2], 1))
        # negative sampling
        for _ in range(4):
            if random.random() < 0.5 or i >= len(text) - 3:
                rand_id = random.randint(0, i-1)
            else:
                rand_id = random.randint(i+3, len(text)-1)
            data.append((text[i], text[rand_id], 0))
    return data

def create_skipgram_dataset(text):
    import random
    data = []
    for i in range(2, len(text) - 2):
        data.append((text[i], text[i-2]))
        data.append((text[i], text[i-1]))
        data.append((text[i], text[i+1]))
        data.append((text[i], text[i+2]))
    return data

In [17]:
len(corpus)

750000

In [18]:
vocab = list(set(corpus))
vocab.append('<UNK>')

In [19]:
len(vocab)

27365

In [20]:
train_data = create_skipgram_dataset(vocab)

In [21]:
len(train_data)

109444

In [22]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

In [23]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [24]:
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor

In [25]:
len(word2idx)

27365

## Convert string to integer

In [26]:
X_p = []
y_p = []
for tr in train_data:
    X_p.append(prepare_word(tr[0], word2idx).view(1, -1))
    y_p.append(prepare_word(tr[1], word2idx).view(1, -1))

In [27]:
train_data[0]

('축도', '중산')

In [28]:
train_data = list(zip(X_p, y_p))

In [29]:
len(train_data)

109444

In [30]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, projection_dim):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)

        self.embedding_v.weight.data.uniform_(-1, 1) # init
        self.embedding_u.weight.data.uniform_(0, 0) # init
        #self.out = nn.Linear(projection_dim,vocab_size)
    def forward(self, center_words,target_words, outer_words):
        center_embeds = self.embedding_v(center_words) # B x 1 x D
        target_embeds = self.embedding_u(target_words) # B x 1 x D
        outer_embeds = self.embedding_u(outer_words) # B x V x D
        
        scores = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2) # Bx1xD * BxDx1 => Bx1
        norm_scores = outer_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2) # BxVxD * BxDx1 => BxV
        
        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        
        return nll # negative log likelihood
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds 

In [35]:
EMBEDDING_SIZE = 2
BATCH_SIZE =128
EPOCH = 30

In [36]:
losses = []
model = Skipgram(len(word2idx), EMBEDDING_SIZE)
if use_cuda:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [37]:
for epoch in range(EPOCH):
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        
        inputs, targets = zip(*batch)
        
        inputs = torch.cat(inputs) # B x 1
        targets = torch.cat(targets) # B x 1
        vocabs = prepare_sequence(list(vocab), word2idx).expand(inputs.size(0), len(vocab))  # B x V
        model.zero_grad()

        loss = model(inputs, targets, vocabs)
        
        loss.backward()
        optimizer.step()
   
        losses.append(loss.item())
        if i % 100 == 0:
            print("batch : %d, mean_loss : %.6f" % (i, np.mean(losses)))

    if epoch % 1 == 0:
        print("Epoch : %d, mean_loss : %.6f" % (epoch, np.mean(losses)))
        losses = []
        torch.save(model.state_dict(), './model/skipgram-embed2-batch128-epoch30')

batch : 0, mean_loss : 10.217020
batch : 100, mean_loss : 10.217395
batch : 200, mean_loss : 10.218100
batch : 300, mean_loss : 10.218887
batch : 400, mean_loss : 10.219243
batch : 500, mean_loss : 10.219417
batch : 600, mean_loss : 10.219728
batch : 700, mean_loss : 10.219438
batch : 800, mean_loss : 10.218013
Epoch : 0, mean_loss : 10.217572
batch : 0, mean_loss : 10.070097
batch : 100, mean_loss : 10.070762
batch : 200, mean_loss : 10.066503
batch : 300, mean_loss : 10.061027
batch : 400, mean_loss : 10.056170
batch : 500, mean_loss : 10.052102
batch : 600, mean_loss : 10.047091
batch : 700, mean_loss : 10.041313
batch : 800, mean_loss : 10.035914
Epoch : 1, mean_loss : 10.032278
batch : 0, mean_loss : 9.718893
batch : 100, mean_loss : 9.744351
batch : 200, mean_loss : 9.740856
batch : 300, mean_loss : 9.742909
batch : 400, mean_loss : 9.742336
batch : 500, mean_loss : 9.739427
batch : 600, mean_loss : 9.739768
batch : 700, mean_loss : 9.736789
batch : 800, mean_loss : 9.734621
Epoc

batch : 300, mean_loss : 8.572153
batch : 400, mean_loss : 8.593785
batch : 500, mean_loss : 8.611284
batch : 600, mean_loss : 8.625329
batch : 700, mean_loss : 8.635472
batch : 800, mean_loss : 8.645738
Epoch : 24, mean_loss : 8.650666
batch : 0, mean_loss : 8.489092
batch : 100, mean_loss : 8.487107
batch : 200, mean_loss : 8.530520
batch : 300, mean_loss : 8.559452
batch : 400, mean_loss : 8.583995
batch : 500, mean_loss : 8.600405
batch : 600, mean_loss : 8.615820
batch : 700, mean_loss : 8.626559
batch : 800, mean_loss : 8.637272
Epoch : 25, mean_loss : 8.641920
batch : 0, mean_loss : 8.559797
batch : 100, mean_loss : 8.481048
batch : 200, mean_loss : 8.518001
batch : 300, mean_loss : 8.549936
batch : 400, mean_loss : 8.572837
batch : 500, mean_loss : 8.590425
batch : 600, mean_loss : 8.604553
batch : 700, mean_loss : 8.616244
batch : 800, mean_loss : 8.626697
Epoch : 26, mean_loss : 8.631489
batch : 0, mean_loss : 8.409271
batch : 100, mean_loss : 8.462296
batch : 200, mean_loss 

In [None]:
torch.cuda.empty_cache()

In [34]:
model.load_state_dict(torch.load('./model/skipgram-001'))

RuntimeError: Error(s) in loading state_dict for Skipgram:
	size mismatch for embedding_v.weight: copying a param with shape torch.Size([27365, 200]) from checkpoint, the shape in current model is torch.Size([27365, 2]).
	size mismatch for embedding_u.weight: copying a param with shape torch.Size([27365, 200]) from checkpoint, the shape in current model is torch.Size([27365, 2]).

In [None]:
def convert_2d_graph(vocab):
    word = []
    vect = []
    for v in vocab:
        if use_cuda:
            vector = model.prediction(prepare_word(v, word2idx))
            word.append(v)
            vect.append(*vector.data.cpu().numpy())
    return word, vect

In [None]:
def word_similarity(target, vocab):
    if use_cuda:
        target_V = model.prediction(prepare_word(target, word2idx))
    else:
        target_V = model.prediction(prepare_word(target, word2idx))
    similarities = []
    for i in range(len(vocab)):
        if vocab[i] == target: continue
        
        if use_cuda:
            vector = model.prediction(prepare_word(list(vocab)[i], word2idx))
        else:
            vector = model.prediction(prepare_word(list(vocab)[i], word2idx))
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0] 
        similarities.append([vocab[i], cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10] # sort by similarity

In [None]:
test = random.choice(list(vocab))
print(test)

In [None]:
word_similarity("유치원", vocab)

In [None]:
common_word = []
for i in word_counter.most_common(300):
    common_word.append(i[0])

In [None]:
word, vect = convert_2d_graph(common_word)

In [None]:
word[0], vect[0]

## Save DataFrame to CSV

In [None]:
points = pd.DataFrame(vect, index=word, columns=['x', 'y'])

In [None]:
points.head()

In [None]:
# points.to_csv('points.csv')

## Visualize w/ Matplotlib

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager as fm
from matplotlib import rc
%matplotlib inline

In [None]:
font_list = fm.findSystemFonts(fontpaths=None, fontext='ttf')
nanum_list = [f for f in font_list if 'Nanum' in f]
a = fm.FontProperties(fname=nanum_list[0])
a.get_name()

In [None]:
rc('font', family=a.get_name())

In [None]:
fig = plt.figure()
fig.set_size_inches(40, 20)
ax = fig.add_subplot(1, 1, 1)
ax.scatter(points['x'], points['y'])

for word, pos in points.iterrows():
    ax.annotate(word, pos, fontsize=30)
plt.show