In [1]:
# Googleドライブのマウント
from google.colab import drive
drive.mount('/content/drive')

import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/IP2Vec')

%cd /content/drive/My Drive/Colab Notebooks/IP2Vec

import torch
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

Mounted at /content/drive
/content/drive/My Drive/Colab Notebooks/IP2Vec


In [52]:
import pandas as pd

data_path = "modified_botnet_first1000rows.csv"
df = pd.read_csv(data_path)
print(df)

            Source    Destination Protocol  Source Port  Destination Port
0    60.190.223.75  147.32.84.165      TCP       2012.0            1048.0
1    60.190.223.75  147.32.84.165      TCP       2012.0            1048.0
2    147.32.84.165  60.190.223.75      TCP       1048.0            2012.0
3    147.32.84.165  60.190.223.75      TCP       1048.0            2012.0
4    60.190.223.75  147.32.84.165      TCP       2012.0            1048.0
..             ...            ...      ...          ...               ...
995  113.70.243.53  147.32.84.208      RDP       3906.0            3389.0
996  147.32.84.208  113.70.243.53      RDP       3389.0            3906.0
997  147.32.84.208  113.70.243.53      TCP       3389.0            3906.0
998  147.32.84.208  113.70.243.53      RDP       3389.0            3906.0
999  147.32.84.208  113.70.243.53      TCP       3389.0            3906.0

[1000 rows x 5 columns]


In [47]:
from tqdm import tqdm_notebook as tqdm
def _w2v(data):
    w2v ={}
    v2w = {}
    fla_d = data.flatten()
    for i in tqdm(fla_d):
        if i not in w2v:
            w2v[i] = len(w2v)
            v2w[len(w2v)-1] = i

    return w2v,v2w

def _corpus(data,w2v):
    corpus = [[w2v[w] for w in ww]  for ww in tqdm(data)]
    return corpus

def _frequency(data):
    freq = {}
    fla_d = data.flatten()
    for w in tqdm(fla_d):
        if w not in freq:
            freq[w] = 0
        freq[w] += 1
    return freq
def _data_loader(corpus,batch_size):

    def func(x):
        return [[x[0],x[1]],[x[0],x[2]],[x[0],x[3]],[x[2],x[1]],[x[3],x[1]]]

    def flatten(nested_list):
        return [e for inner_list in nested_list for e in inner_list]


    l = [func(x) for x in tqdm(corpus)]
    del corpus
    return l
    #l = pd.DataFrame(flatten(l)).to_numpy()
    #batch = [l[batch_size*(i-1):batch_size*i] for i in tqdm(range(1,int(len(l)/batch_size)))]
    #return batch

In [48]:
import torch as th
from torch.autograd import Variable as V
from torch import nn,optim
import numpy as np
import random

class Skipgram(nn.Module):
    def __init__(self,vocab_size,emb_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.u_embedding = nn.Embedding(vocab_size,emb_dim)
        self.v_embedding = nn.Embedding(vocab_size,emb_dim)
        self.log_sigmoid = nn.LogSigmoid()

        init_range= 0.5/emb_dim
        self.u_embedding.weight.data.uniform_(-init_range,init_range)
        self.v_embedding.weight.data.uniform_(-0,0)

    def forward(self, target, context,neg):
        v_embedd = self.u_embedding(target)
        u_embedd = self.v_embedding(context)
        positive = self.log_sigmoid(th.sum(u_embedd * v_embedd, dim =1)).squeeze()

        u_hat = self.v_embedding(neg)
        #negative_ = th.bmm(u_hat, v_embedd.unsqueeze(2)).squeeze(2)
        negative_ = (v_embedd.unsqueeze(1) * u_hat).sum(2)
        negative = self.log_sigmoid(-th.sum(negative_,dim=1)).squeeze()

        loss = positive + negative
        return -loss.mean()


In [54]:
import torch as th
from torch.autograd import Variable as V
from torch import nn,optim
from tqdm import tqdm_notebook as tqdm
import numpy as np
import random
# from model import Skipgram
class Trainer:
    def __init__(self,w2v,v2w,freq,emb_dim):
        self.v2w = v2w
        self.w2v = w2v
        self.unigram_table = self.noise(w2v,freq)
        self.vocab_size = len(w2v)
        self.model = Skipgram(self.vocab_size,emb_dim).to(device)
        self.optim = optim.Adam(self.model.parameters())

    def noise(self,w2v, freq):
        unigram_table = []
        total_word = sum([c for c in freq.values()])
        for w,v in w2v.items():
            unigram_table.extend([v]*int(((freq[w]/total_word)**0.75)/0.001))
        return unigram_table

    def negative_sampling(self,batch_size,neg_num,batch_target):
        neg = np.zeros((neg_num))
       # print("neg:", neg)
       # for i in range(batch_size):
        for i in range(len(batch_target)):
            sample = random.sample(self.unigram_table, neg_num)
           # print("sample:", sample)
            while batch_target[i] in sample:
                sample = random.sample(self.unigram_table, neg_num)
            neg = np.vstack([neg,sample])
        return neg[1:batch_size+1]

    def fit(self,data,max_epoch,batch_size,neg_num):
        run_losses = []
        for epoch in range(max_epoch):
            run_loss = 0

           # print(data)
            for batch in tqdm(data):

                batch = np.array(batch)  # batchをlistからnumpyのndarrayに変換
               # print("batch:", batch)
                context,target = batch[:,1],batch[:,0]
               # print("context =", context,"target =" , target)
                self.optim.zero_grad()
                batch_neg = self.negative_sampling(batch_size,neg_num,target)
                context = V(th.LongTensor(context)).to(device)
                target = V(th.LongTensor(target)).to(device)
                batch_neg = V(th.LongTensor(batch_neg.astype(int))).to(device)

                loss = self.model(target, context, batch_neg)
                loss.backward()
                self.optim.step()
                run_loss += loss.cpu().item()
            run_losses.append(run_loss/len(data))
            print("epoch:", epoch,"run_loss:", run_loss)
        return run_losses
    def most_similar(self,word,top):
        W = self.model.state_dict()["u_embedding.weight"]
        idx = w2v[word]
        similar_score = {}
        for i,vec in enumerate(W):
            if i != idx:
                d = vec.dot(W[idx])
                similar_score[self.v2w[i]] = d
        similar_score = sorted(similar_score.items(), key=lambda x: -x[1])[:top]
        for k,v in similar_score:
            print(k,":",round(v.item(),2))


In [55]:
import numpy as np
import pandas as pd
#import preprocess as p
#import trainer as t

batch_size = 1024
path = "modified_botnet_first1000rows.csv"
X = pd.read_csv(path)
d = X.to_numpy()
w2v,v2w = _w2v(d)
corpus = pd.DataFrame(_corpus(d, w2v)).to_numpy()
print(corpus)
freq  = _frequency(d)
print(freq)
train = _data_loader(corpus, batch_size)
print(train)

model = Trainer(w2v,v2w,freq,emb_dim=32)
model.fit(data = train,max_epoch=50,batch_size=256,neg_num=10)
# th.save(model.state_dict(),'ip2vec.pth')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(fla_d):


  0%|          | 0/5000 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  corpus = [[w2v[w] for w in ww]  for ww in tqdm(data)]


  0%|          | 0/1000 [00:00<?, ?it/s]

[[  0   1   2   3   4]
 [  0   1   2   3   4]
 [  1   0   2   4   3]
 ...
 [  5   6   2   8 190]
 [  5   6  22   8 190]
 [  5   6   2   8 190]]


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for w in tqdm(fla_d):


  0%|          | 0/5000 [00:00<?, ?it/s]

{'60.190.223.75': 139, '147.32.84.165': 697, 'TCP': 794, 2012.0: 43, 1048.0: 43, '147.32.84.208': 273, '113.70.243.53': 273, 'TPKT': 23, 3389.0: 273, 3719.0: 94, '147.32.80.9': 29, 'DNS': 29, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, '195.88.191.59': 167, 1049.0: 69, 80.0: 251, 'HTTP': 25, 'RDP': 85, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, '94.63.150.63': 70, 1052.0: 25, nan: 1, nan: 1, 1053.0: 13, 888.0: 47, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, 1054.0: 59, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, '122.224.6.164': 212, 1055.0: 212, 82.0: 212, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, 1056.0: 15, 'T.125': 12, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, 1057.0: 17, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, '174.123.157.154': 14, 1058.0: 14, 1059.0: 17, nan: 1, nan: 1, nan: 1, nan: 1, 1060.0: 15, nan: 1, nan: 1, 1061.0: 49, 2011.0: 49, 3813.0: 155, 1062.0: 39, 'COTP': 2, 1063.0:

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  l = [func(x) for x in tqdm(corpus)]


  0%|          | 0/1000 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm(data):


[[[0, 1], [0, 2], [0, 3], [2, 1], [3, 1]], [[0, 1], [0, 2], [0, 3], [2, 1], [3, 1]], [[1, 0], [1, 2], [1, 4], [2, 0], [4, 0]], [[1, 0], [1, 2], [1, 4], [2, 0], [4, 0]], [[0, 1], [0, 2], [0, 3], [2, 1], [3, 1]], [[1, 0], [1, 2], [1, 4], [2, 0], [4, 0]], [[1, 0], [1, 2], [1, 4], [2, 0], [4, 0]], [[0, 1], [0, 2], [0, 3], [2, 1], [3, 1]], [[1, 0], [1, 2], [1, 4], [2, 0], [4, 0]], [[1, 0], [1, 2], [1, 4], [2, 0], [4, 0]], [[0, 1], [0, 2], [0, 3], [2, 1], [3, 1]], [[1, 0], [1, 2], [1, 4], [2, 0], [4, 0]], [[1, 0], [1, 2], [1, 4], [2, 0], [4, 0]], [[0, 1], [0, 2], [0, 3], [2, 1], [3, 1]], [[1, 0], [1, 2], [1, 4], [2, 0], [4, 0]], [[1, 0], [1, 2], [1, 4], [2, 0], [4, 0]], [[0, 1], [0, 2], [0, 3], [2, 1], [3, 1]], [[1, 0], [1, 2], [1, 4], [2, 0], [4, 0]], [[1, 0], [1, 2], [1, 4], [2, 0], [4, 0]], [[0, 1], [0, 2], [0, 3], [2, 1], [3, 1]], [[0, 1], [0, 2], [0, 3], [2, 1], [3, 1]], [[1, 0], [1, 2], [1, 4], [2, 0], [4, 0]], [[1, 0], [1, 2], [1, 4], [2, 0], [4, 0]], [[0, 1], [0, 2], [0, 3], [2, 1], 

  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 0 run_loss: 927.3220138251781


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 1 run_loss: 649.3576189354062


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 2 run_loss: 486.4044941365719


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 3 run_loss: 391.15602896362543


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 4 run_loss: 330.74248000234365


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 5 run_loss: 256.2831135056913


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 6 run_loss: 224.53547748737037


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 7 run_loss: 208.00342533923686


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 8 run_loss: 188.26547495182604


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 9 run_loss: 155.51147493440658


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 10 run_loss: 136.14351683482528


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 11 run_loss: 117.22303722705692


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 12 run_loss: 130.81913209334016


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 13 run_loss: 101.46453778631985


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 14 run_loss: 99.24037615163252


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 15 run_loss: 85.54785285051912


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 16 run_loss: 108.4745910083875


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 17 run_loss: 89.65756185771897


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 18 run_loss: 87.72055403352715


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 19 run_loss: 72.52859999565408


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 20 run_loss: 72.10370241291821


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 21 run_loss: 63.327834212454036


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 22 run_loss: 72.6513014046941


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 23 run_loss: 67.82679251616355


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 24 run_loss: 64.18500764481723


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 25 run_loss: 64.21088593325112


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 26 run_loss: 57.19660785119049


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 27 run_loss: 76.19530211039819


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 28 run_loss: 67.63576403586194


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 29 run_loss: 74.94901683693752


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 30 run_loss: 76.74216881068423


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 31 run_loss: 45.2878502542153


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 32 run_loss: 54.19892245612573


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 33 run_loss: 74.56863969878759


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 34 run_loss: 62.37481857032981


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 35 run_loss: 62.62609808816342


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 36 run_loss: 38.21927848402993


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 37 run_loss: 67.26693036875804


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 38 run_loss: 37.35880191833712


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 39 run_loss: 45.01036167901475


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 40 run_loss: 49.78524820477469


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 41 run_loss: 48.25753061065916


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 42 run_loss: 53.78293635020964


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 43 run_loss: 63.68459897686262


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 44 run_loss: 53.09680818277411


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 45 run_loss: 44.54031064832816


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 46 run_loss: 61.18371679529082


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 47 run_loss: 41.79349774023285


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 48 run_loss: 56.41108950364287


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch: 49 run_loss: 46.69090449390933


[0.9273220138251781,
 0.6493576189354062,
 0.4864044941365719,
 0.39115602896362545,
 0.33074248000234363,
 0.2562831135056913,
 0.22453547748737038,
 0.20800342533923685,
 0.18826547495182602,
 0.1555114749344066,
 0.13614351683482528,
 0.11722303722705692,
 0.13081913209334015,
 0.10146453778631985,
 0.09924037615163252,
 0.08554785285051912,
 0.10847459100838751,
 0.08965756185771898,
 0.08772055403352715,
 0.07252859999565407,
 0.0721037024129182,
 0.06332783421245404,
 0.0726513014046941,
 0.06782679251616355,
 0.06418500764481723,
 0.06421088593325112,
 0.05719660785119049,
 0.07619530211039818,
 0.06763576403586194,
 0.07494901683693751,
 0.07674216881068423,
 0.0452878502542153,
 0.05419892245612573,
 0.07456863969878759,
 0.06237481857032981,
 0.06262609808816341,
 0.03821927848402993,
 0.06726693036875804,
 0.03735880191833712,
 0.04501036167901475,
 0.04978524820477469,
 0.04825753061065916,
 0.053782936350209636,
 0.06368459897686261,
 0.05309680818277411,
 0.04454031064832

In [53]:
model.most_similar("60.190.223.75",10)

195.88.191.59 : 30.88
122.224.6.164 : 30.67
80.0 : 27.23
94.63.150.63 : 23.72
82.0 : 23.66
888.0 : 20.09
2011.0 : 18.22
147.32.84.208 : 17.41
2012.0 : 17.29
147.32.84.165 : 15.2
