Concatenate the parts of the table stored as .parquet files in ./data/data_parts/ into a single table, and save it as a .parquet file in the ./data/ folder.

In [1]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import gc

data_parts = []

# Walk the part files and keep only rows whose HTML is at least 5000
# characters long. This first pass is limited to 100 files.
for n_seen, file in enumerate(tqdm(glob('./data/data_parts/*.parquet'))):
    # Cap on files *visited*, not on frames successfully appended: a later
    # cell resumes from glob(...)[100:], so a failed read must not push this
    # pass beyond the first 100 paths (those rows would then be loaded twice).
    if n_seen >= 100:
        print('Processed 100 files, stopping')
        break
    try:
        part = pd.read_parquet(file)
        part = part[part['html'].str.len() >= 5000]
        data_parts.append(part)
        del part
        gc.collect()
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        print(f'Error reading file {file}: {e}')

# Preview the first filtered part.
data_parts[0].head()

 15%|█▌        | 100/652 [00:11<01:03,  8.63it/s]

Processed 100 files, stopping





Unnamed: 0,url,type,html,error
2,http://www.bopsecrets.org/rexroth/cr/1.htm,benign,"\n\n<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4....",
6,http://www.espn.go.com/nba/player/_/id/3457/br...,benign,\n <!doctype html>\n <html lang=...,
9,http://www.allmusic.com/album/crazy-from-the-h...,benign,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n ...",
12,http://www.myspace.com/video/vid/30602581,benign,<!DOCTYPE html>\n<!-- videodetail -->\n<!-- a0...,
19,http://www.vnic.co/khach-hang.html,defacement,"\r\n\r\n\r\n\r\n<!DOCTYPE html PUBLIC ""-//W3C/...",


In [2]:
# Size (rows, columns) of the first filtered part.
data_parts[0].shape

(262, 4)

In [3]:
# Stack the per-file frames into a single table, report its size,
# persist it, and preview the first rows.
data = pd.concat(data_parts, axis=0)
print(data.shape)
data.to_parquet(path='./data/100_html_data.parquet')
data.head(n=5)

(20798, 4)


Unnamed: 0,url,type,html,error
2,http://www.bopsecrets.org/rexroth/cr/1.htm,benign,"\n\n<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4....",
6,http://www.espn.go.com/nba/player/_/id/3457/br...,benign,\n <!doctype html>\n <html lang=...,
9,http://www.allmusic.com/album/crazy-from-the-h...,benign,"<!DOCTYPE html>\n<html lang=""en"">\n<head>\n ...",
12,http://www.myspace.com/video/vid/30602581,benign,<!DOCTYPE html>\n<!-- videodetail -->\n<!-- a0...,
19,http://www.vnic.co/khach-hang.html,defacement,"\r\n\r\n\r\n\r\n<!DOCTYPE html PUBLIC ""-//W3C/...",


In [4]:
# The list of per-file frames has already been concatenated into `data`;
# drop the list and ask the garbage collector to reclaim what it can.
del data_parts
gc.collect()

80

In [6]:
# Tighten the length filter: keep only rows with at least 10000 characters of HTML.
html_length = data['html'].str.len()
data = data.loc[html_length >= 10000]
data.shape

(19936, 4)

In [7]:
# Binary target: 0 for 'benign', 1 for every other value of `type`.
# `data` is the result of boolean filtering, so build a new frame with
# `assign` instead of setting a column on it in place (which can trigger
# SettingWithCopyWarning). The vectorized comparison also avoids calling a
# Python lambda once per row.
data = data.assign(label=(data['type'] != 'benign').astype('int64'))
data['label'].value_counts()

label
0    17387
1     2549
Name: count, dtype: int64

In [9]:
# Same class counts, expressed as fractions of the total.
data['label'].value_counts(normalize=True)

label
0    0.872141
1    0.127859
Name: proportion, dtype: float64

In [8]:
# Persist the current (length-filtered) table.
data.to_parquet('./data/100_html_data_filtered.parquet')

Currently, the proportion of benign to malicious samples is 0.87:0.13. We will balance the dataset by first undersampling the benign samples down to 10,000, and then extracting roughly 7,500 additional malicious samples from the remaining data parts that have not been loaded yet.

In [12]:
# Undersample the benign class to 10000 rows. The fixed seed makes the
# draw, and therefore the saved file, identical on every re-run.
benign_data = data[data['label'] == 0].sample(n=10000, random_state=42)
benign_data.to_parquet('./data/10000_benign_data.parquet')
benign_data.shape

(10000, 5)

In [13]:
# Every row whose label is 1.
is_malicious = data['label'].eq(1)
malicious_data = data.loc[is_malicious]
malicious_data.shape

(2549, 5)

In [18]:
# Save the malicious subset; it is read back later in the notebook.
# NOTE: no sampling happens here. Every row of `malicious_data` is written,
# even though the file name says 2500.
malicious_data.to_parquet('./data/2500_malicious_data.parquet')

In [14]:
# Drop the large frames from the first pass and trigger a garbage collection.
del benign_data
del data
gc.collect()

27

Now we will load the remaining data parts and extract about 7,500 additional malicious samples (10,000 minus the roughly 2,500 already collected) to balance the dataset.

In [1]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import gc

# Reload the malicious rows written to disk earlier in the notebook.
malicious_data = pd.read_parquet('./data/2500_malicious_data.parquet')
malicious_data.shape

(2549, 5)

In [3]:
data_parts = []

# Skip the first 100 paths and scan everything after them.
# NOTE(review): this relies on glob() returning the files in the same order
# as in the first loading cell; confirm, or sort both listings.
remaining_files = glob('./data/data_parts/*.parquet')[100:]

for file in tqdm(remaining_files):
    try:
        part = pd.read_parquet(file)
        # Keep rows that have at least 10000 characters of HTML and are not benign.
        long_enough = part['html'].str.len() >= 10000
        not_benign = part['type'] != 'benign'
        data_parts.append(part[long_enough & not_benign])
        del part
        gc.collect()
    except Exception as exc:
        # Best effort: report the unreadable file and continue with the rest.
        print(f'Error reading file {file}: {exc}')

100%|██████████| 552/552 [01:33<00:00,  5.92it/s]


In [4]:
# Combine the filtered rows collected from all remaining parts.
data = pd.concat(data_parts)
data.shape

(12380, 4)

In [5]:
# The per-file frames are now combined in `data`; drop the list.
del data_parts

In [6]:
# Draw just enough extra rows to bring the malicious set up to 10000.
# The fixed seed keeps the selection reproducible across re-runs.
data = data.sample(n=10000 - malicious_data.shape[0], random_state=42)
data.shape

(7451, 4)

In [7]:
# `malicious_data` carries a `label` column but the freshly sampled rows in
# `data` do not. Concatenating them as-is leaves NaN labels on the new rows
# and turns the column into float. Label the new rows first, using the same
# rule as elsewhere in the notebook (0 for 'benign', 1 otherwise).
malicious_data = pd.concat(
    [malicious_data, data.assign(label=(data['type'] != 'benign').astype('int64'))]
)
malicious_data.shape

(10000, 5)

In [8]:
# Preview the combined malicious set.
malicious_data.head()

Unnamed: 0,url,type,html,error,label
19,http://www.vnic.co/khach-hang.html,defacement,"\r\n\r\n\r\n\r\n<!DOCTYPE html PUBLIC ""-//W3C/...",,1.0
40,http://www.docs.google.com/spreadsheet/viewfor...,phishing,"<!DOCTYPE html><html lang=""ru"" class=""HB1eCd-U...",,1.0
72,http://www.retajconsultancy.com,phishing,"<!DOCTYPE html><html lang=""en""><head><link hre...",,1.0
162,http://www.familienbund.org/index.php?option=c...,defacement,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML+RDFa ...",,1.0
166,http://www.academiedumeuble.ca/index.php?optio...,defacement,"<!DOCTYPE html>\n<html lang=""fr-FR"">\n<head>\n...",,1.0


In [9]:
# Persist the combined malicious set.
malicious_data.to_parquet('./data/10000_malicious_data.parquet')

In [10]:
# Load both class-specific files written earlier in the notebook.
benign_data = pd.read_parquet('./data/10000_benign_data.parquet')
malicious_data = pd.read_parquet('./data/10000_malicious_data.parquet')

In [11]:
# Stack the two classes into one table (benign rows first, then malicious).
data = pd.concat([benign_data, malicious_data])
data.shape

(20000, 5)

In [12]:
# Recompute the binary target from `type` for every row (0 for 'benign',
# 1 otherwise) so the column is a consistent integer 0/1. A vectorized
# comparison replaces the per-row Python lambda.
data['label'] = (data['type'] != 'benign').astype('int64')
data['label'].value_counts()

label
0    10000
1    10000
Name: count, dtype: int64

In [13]:
# Class balance expressed as fractions of the total.
data['label'].value_counts(normalize=True)

label
0    0.5
1    0.5
Name: proportion, dtype: float64

In [14]:
# Save the final balanced dataset.
data.to_parquet('./data/20000_html_data_balanced.parquet')