In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
def get_row(i, url, label):
    """Format one ``index,text,label`` CSV record, terminated by a newline.

    Surrounding whitespace is stripped from ``url``. Double quotes inside it
    are doubled, and the field is wrapped in double quotes whenever it
    contains a double quote or a comma. ``label`` is written unchanged.
    """
    field = url.strip().replace('"', '""')
    needs_quoting = '"' in field or ',' in field
    if needs_quoting:
        field = '"' + field + '"'
    return '%d,%s,%s\n' % (i, field, label)

## URL Only

In [3]:
# Row counts for the URL-only dataset. They name the output file and offset
# the phishing indices, so they are expected to equal the line counts of the
# raw files -- TODO confirm against the raw data.
N_URL_L = 2_832_720  # legitimate URLs
N_URL_P = 2_171_070  # phishing URLs

In [4]:
# Build the URL-only dataset: every legitimate URL followed by every phishing
# URL, one CSV row per input line.
# Each source is (path, label, index of its first row).
url_sources = [
    ('data/ebubekirbbr/raw/url_legitimates.txt', 'legitimate', 0),
    ('data/ebubekirbbr/raw/url_phishings.txt', 'phishing', N_URL_L),
]

with open('data/ebubekirbbr/url_%d.csv' % (N_URL_L + N_URL_P), 'w') as f:
    f.write('index,text,label\n')

    for path, label, start in url_sources:
        # Stream each input inside its own `with` block: the handle is closed
        # deterministically and the file is never loaded whole into memory.
        with open(path) as lines:
            for i, url in enumerate(lines, start=start):
                f.write(get_row(i, url, label))

## Domain Only

In [5]:
# Row counts for the domain-only dataset. They name the output file and offset
# the phishing indices, so they are expected to equal the line counts of the
# raw files -- TODO confirm against the raw data.
N_DOMAIN_L = 49_228   # legitimate domains
N_DOMAIN_P = 149_808  # phishing domains

In [6]:
# Build the domain-only dataset: every legitimate domain followed by every
# phishing domain, one CSV row per input line.
# Each source is (path, label, index of its first row).
domain_sources = [
    ('data/ebubekirbbr/raw/domain_legitimates.txt', 'legitimate', 0),
    ('data/ebubekirbbr/raw/domain_phishings.txt', 'phishing', N_DOMAIN_L),
]

with open('data/ebubekirbbr/domain_%d.csv' % (N_DOMAIN_L + N_DOMAIN_P), 'w') as f:
    f.write('index,text,label\n')

    for path, label, start in domain_sources:
        # Stream each input inside its own `with` block: the handle is closed
        # deterministically and the file is never loaded whole into memory.
        with open(path) as lines:
            for i, url in enumerate(lines, start=start):
                f.write(get_row(i, url, label))

## URL(402,000) + Domain(98,000)

In [7]:
# Per-class sample sizes: the same number of rows is taken from the
# legitimate and the phishing files.
N_URL = 201_000    # URLs taken per class
N_DOMAIN = 49_000  # domains taken per class
N_URL_DOMAIN = N_URL + N_DOMAIN  # total rows per class

In [8]:
# Take the first N_URL / N_DOMAIN lines of each raw file. Legitimate sources
# come first so the order lines up with the labels (all legitimate rows, then
# all phishing rows).
head_sources = [
    ('data/ebubekirbbr/raw/url_legitimates.txt', N_URL),
    ('data/ebubekirbbr/raw/domain_legitimates.txt', N_DOMAIN),
    ('data/ebubekirbbr/raw/url_phishings.txt', N_URL),
    ('data/ebubekirbbr/raw/domain_phishings.txt', N_DOMAIN),
]

url_lines = []
for path, limit in head_sources:
    # `with` closes the handle; zip(range(limit), ...) stops after `limit`
    # lines, so the rest of the file is never read into memory.
    with open(path) as lines:
        url_lines.extend(line.strip() for _, line in zip(range(limit), lines))

urls = pd.Series(url_lines, dtype=str)
urls.tail()

499995                 http://application-jagex.totalh.com/
499996          http://servicesrunescapelogin.megabyet.net/
499997    http://us.blizzard.com.login.en.forum-password...
499998                                 http://b2bu3.t35.com
499999    http://www.services-runescape-free-level-135-0...
dtype: object

In [9]:
# One label per row: N_URL_DOMAIN legitimate entries followed by
# N_URL_DOMAIN phishing entries, stored as a categorical Series.
labels = pd.Series(
    [name for name in ('legitimate', 'phishing') for _ in range(N_URL_DOMAIN)],
    dtype='category',
)
labels.tail()

499995    phishing
499996    phishing
499997    phishing
499998    phishing
499999    phishing
dtype: category
Categories (2, object): [legitimate, phishing]

In [10]:
# Combine the URL strings and their labels into a two-column frame.
df = pd.DataFrame({'text': urls, 'label': labels})
df.tail()

Unnamed: 0,text,label
0,https://www.factset.com/services/portfolio-dat...,legitimate
1,https://www.factset.com/news/2018/9/18/factset...,legitimate
2,http://www.factset.com/insight/2016/08/resolve...,legitimate
3,https://www.factset.com/hubfs/resources%20sect...,legitimate
4,http://www.factset.com/careers/join-our-team/e...,legitimate
...,...,...
499995,http://application-jagex.totalh.com/,phishing
499996,http://servicesrunescapelogin.megabyet.net/,phishing
499997,http://us.blizzard.com.login.en.forum-password...,phishing
499998,http://b2bu3.t35.com,phishing


In [11]:
# Write next to the other generated datasets: the "Split Train Test" section
# reads this file from data/ebubekirbbr/. The index column is labelled so the
# header matches the other generated CSVs (index,text,label) instead of being
# read back as "Unnamed: 0".
df.to_csv(
    'data/ebubekirbbr/url_%d_domain_%d.csv' % (N_URL * 2, N_DOMAIN * 2),
    index_label='index',
)

## URL(802,000) + Domain(198,000)

In [12]:
# Sample sizes for the 1,000,000-row mix.
N_URL = 401_000       # URLs taken per class (legitimate and phishing)
N_DOMAIN_L = 49_000   # legitimate domains taken
N_DOMAIN_P = 149_000  # phishing domains taken

In [13]:
# Build a mixed dataset from the first rows of each raw file.
# Each source is (path, label, number of lines to take, index of first row).
mixed_sources = [
    ('data/ebubekirbbr/raw/url_legitimates.txt', 'legitimate', N_URL, 0),
    ('data/ebubekirbbr/raw/domain_legitimates.txt', 'legitimate', N_DOMAIN_L, N_URL),
    ('data/ebubekirbbr/raw/url_phishings.txt', 'phishing', N_URL, N_URL + N_DOMAIN_L),
    ('data/ebubekirbbr/raw/domain_phishings.txt', 'phishing', N_DOMAIN_P, N_URL * 2 + N_DOMAIN_L),
]

with open('data/ebubekirbbr/url_%d_domain_%d.csv' % (N_URL * 2, N_DOMAIN_L + N_DOMAIN_P), 'w') as f:
    f.write('index,text,label\n')

    for path, label, limit, start in mixed_sources:
        # `with` closes each input; zipping with the index range stops after
        # `limit` lines, so the remainder of the file is never read.
        with open(path) as lines:
            for i, url in zip(range(start, start + limit), lines):
                f.write(get_row(i, url, label))

## URL(1,802,000) + Domain(198,000)(L49,000 + P149,000)

In [14]:
# Sample sizes for the 2,000,000-row mix.
N_URL = 901_000       # URLs taken per class (legitimate and phishing)
N_DOMAIN_L = 49_000   # legitimate domains taken
N_DOMAIN_P = 149_000  # phishing domains taken

In [15]:
# Build a mixed dataset from the first rows of each raw file.
# Each source is (path, label, number of lines to take, index of first row).
mixed_sources = [
    ('data/ebubekirbbr/raw/url_legitimates.txt', 'legitimate', N_URL, 0),
    ('data/ebubekirbbr/raw/domain_legitimates.txt', 'legitimate', N_DOMAIN_L, N_URL),
    ('data/ebubekirbbr/raw/url_phishings.txt', 'phishing', N_URL, N_URL + N_DOMAIN_L),
    ('data/ebubekirbbr/raw/domain_phishings.txt', 'phishing', N_DOMAIN_P, N_URL * 2 + N_DOMAIN_L),
]

with open('data/ebubekirbbr/url_%d_domain_%d.csv' % (N_URL * 2, N_DOMAIN_L + N_DOMAIN_P), 'w') as f:
    f.write('index,text,label\n')

    for path, label, limit, start in mixed_sources:
        # `with` closes each input; zipping with the index range stops after
        # `limit` lines, so the remainder of the file is never read.
        with open(path) as lines:
            for i, url in zip(range(start, start + limit), lines):
                f.write(get_row(i, url, label))

## URL(5,003,790)(2,832,720+2,171,070) + Domain(199,036)(49,228+149,808)

In [1]:
# Row counts for the full URL + domain dataset. They name the output file and
# offset the row indices, so they are expected to equal the line counts of the
# raw files -- TODO confirm against the raw data.
N_URL_L = 2_832_720   # legitimate URLs
N_URL_P = 2_171_070   # phishing URLs
N_DOMAIN_L = 49_228   # legitimate domains
N_DOMAIN_P = 149_808  # phishing domains

In [4]:
# Build the full dataset from every line of all four raw files.
# Each source is (path, label, index of its first row).
full_sources = [
    ('data/ebubekirbbr/raw/url_legitimates.txt', 'legitimate', 0),
    ('data/ebubekirbbr/raw/domain_legitimates.txt', 'legitimate', N_URL_L),
    ('data/ebubekirbbr/raw/url_phishings.txt', 'phishing', N_URL_L + N_DOMAIN_L),
    ('data/ebubekirbbr/raw/domain_phishings.txt', 'phishing', N_URL_L + N_DOMAIN_L + N_URL_P),
]

with open('data/ebubekirbbr/url_%d_domain_%d.csv' % (N_URL_L+N_URL_P, N_DOMAIN_L+N_DOMAIN_P), 'w') as f:
    f.write('index,text,label\n')

    for path, label, start in full_sources:
        # Stream each input inside its own `with` block: the handle is closed
        # deterministically and the file is never loaded whole into memory.
        with open(path) as lines:
            for i, url in enumerate(lines, start=start):
                f.write(get_row(i, url, label))

## Split Train Test

### URL Only

In [7]:
# URL-only dataset: hold out 10% of the rows for validation (fixed seed).
url_rows = pd.read_csv(
    'data/ebubekirbbr/url_5003790.csv',
    dtype={'text': str, 'label': 'category'},
)
print(len(url_rows))
X_train, X_test = train_test_split(url_rows, test_size=0.1, random_state=119)
del url_rows  # only the two splits are needed from here on

# Each output file name carries the row count of its split.
train_path = 'data/ebubekirbbr/splited/train_url_%d.csv' % len(X_train)
val_path = 'data/ebubekirbbr/splited/val_url_%d.csv' % len(X_test)
X_train.to_csv(train_path, index=False)
X_test.to_csv(val_path, index=False)

5003790


### Domain Only

In [8]:
# Domain-only dataset: hold out 20% of the rows for validation (fixed seed).
domain_rows = pd.read_csv(
    'data/ebubekirbbr/domain_199036.csv',
    dtype={'text': str, 'label': 'category'},
)
print(len(domain_rows))
X_train, X_test = train_test_split(domain_rows, test_size=0.2, random_state=119)
del domain_rows  # only the two splits are needed from here on

# Each output file name carries the row count of its split.
train_path = 'data/ebubekirbbr/splited/train_domain_%d.csv' % len(X_train)
val_path = 'data/ebubekirbbr/splited/val_domain_%d.csv' % len(X_test)
X_train.to_csv(train_path, index=False)
X_test.to_csv(val_path, index=False)

199036


### URL + Domain

In [4]:
# 500,000-row URL + domain mix: hold out 10% for validation (fixed seed).
mixed_rows = pd.read_csv(
    'data/ebubekirbbr/url_402000_domain_98000.csv',
    dtype={'text': str, 'label': 'category'},
)
print(len(mixed_rows))
X_train, X_test = train_test_split(mixed_rows, test_size=0.1, random_state=119)
del mixed_rows  # only the two splits are needed from here on

# Each output file name carries the row count of its split.
train_path = 'data/ebubekirbbr/splited/train_%d.csv' % len(X_train)
val_path = 'data/ebubekirbbr/splited/val_%d.csv' % len(X_test)
X_train.to_csv(train_path, index=False)
X_test.to_csv(val_path, index=False)

500000


In [5]:
# 1,000,000-row URL + domain mix: hold out 10% for validation (fixed seed).
mixed_rows = pd.read_csv(
    'data/ebubekirbbr/url_802000_domain_198000.csv',
    dtype={'text': str, 'label': 'category'},
)
print(len(mixed_rows))
X_train, X_test = train_test_split(mixed_rows, test_size=0.1, random_state=119)
del mixed_rows  # only the two splits are needed from here on

# Each output file name carries the row count of its split.
train_path = 'data/ebubekirbbr/splited/train_%d.csv' % len(X_train)
val_path = 'data/ebubekirbbr/splited/val_%d.csv' % len(X_test)
X_train.to_csv(train_path, index=False)
X_test.to_csv(val_path, index=False)

1000000


In [6]:
# 2,000,000-row URL + domain mix: hold out 10% for validation (fixed seed).
mixed_rows = pd.read_csv(
    'data/ebubekirbbr/url_1802000_domain_198000.csv',
    dtype={'text': str, 'label': 'category'},
)
print(len(mixed_rows))
X_train, X_test = train_test_split(mixed_rows, test_size=0.1, random_state=119)
del mixed_rows  # only the two splits are needed from here on

# Each output file name carries the row count of its split.
train_path = 'data/ebubekirbbr/splited/train_%d.csv' % len(X_train)
val_path = 'data/ebubekirbbr/splited/val_%d.csv' % len(X_test)
X_train.to_csv(train_path, index=False)
X_test.to_csv(val_path, index=False)

2000000


In [5]:
# Full URL + domain dataset: hold out 10% for validation (fixed seed).
mixed_rows = pd.read_csv(
    'data/ebubekirbbr/url_5003790_domain_199036.csv',
    dtype={'text': str, 'label': 'category'},
)
print(len(mixed_rows))
X_train, X_test = train_test_split(mixed_rows, test_size=0.1, random_state=119)
del mixed_rows  # only the two splits are needed from here on

# Each output file name carries the row count of its split.
train_path = 'data/ebubekirbbr/splited/train_%d.csv' % len(X_train)
val_path = 'data/ebubekirbbr/splited/val_%d.csv' % len(X_test)
X_train.to_csv(train_path, index=False)
X_test.to_csv(val_path, index=False)

5202826
