## Combining Dataset 
- Combine various ad dataset, add class variable and combine it to single dataset

In [1]:
import pandas as pd

### Combine Various Ad Datasets

In [2]:
ad_sites = pd.read_csv("../raw-dataset/adservers.txt", skiprows=10, sep=" ", usecols=[1], names=["domain"])

print (ad_sites.size)

ad_sites.head()

42292


Unnamed: 0,domain
0,0001-cab8-4c8c-43de.reporo.net
1,002-slq-470.mktoresp.com
2,004-btr-463.mktoresp.com
3,005.free-counters.co.uk
4,006.free-counters.co.uk


In [3]:
simple_ad = pd.read_csv("../raw-dataset/simple_ad.txt", skiprows=3, names=["domain"])

print (simple_ad.size)

simple_ad.head()

2701


Unnamed: 0,domain
0,101com.com
1,101order.com
2,123found.com
3,140proof.com
4,180hits.de


In [4]:
admiral = pd.read_csv("../raw-dataset/Admiral.txt", names=["domain"])

print (admiral.size)

admiral.head()

636


Unnamed: 0,domain
0,184.48.190.35.bc.googleusercontent.com
1,202.90.190.35.bc.googleusercontent.com
2,246.39.190.35.bc.googleusercontent.com
3,2znp09oa.com
4,42.219.186.35.bc.googleusercontent.com


In [5]:
prigent_ads = pd.read_csv("../raw-dataset/Prigent-Ads.txt", names=["domain"])

print (prigent_ads.size)

prigent_ads.head()

3670


Unnamed: 0,domain
0,0nlinemeds.com
1,1-2-money.com
2,101com.com
3,101order.com
4,103092804.com


In [6]:
frames = [ad_sites, prigent_ads, simple_ad]

ad_domains = pd.concat(frames)

print(ad_domains.size)

ad_domains.head()

48663


Unnamed: 0,domain
0,0001-cab8-4c8c-43de.reporo.net
1,002-slq-470.mktoresp.com
2,004-btr-463.mktoresp.com
3,005.free-counters.co.uk
4,006.free-counters.co.uk


In [7]:
ad_domains = ad_domains.drop_duplicates()

In [8]:
ad_domains

Unnamed: 0,domain
0,0001-cab8-4c8c-43de.reporo.net
1,002-slq-470.mktoresp.com
2,004-btr-463.mktoresp.com
3,005.free-counters.co.uk
4,006.free-counters.co.uk
...,...
2685,yume.com
2690,zde-affinity.edgecaching.net
2692,zeepmedia.com
2698,zintext.com


In [9]:
ad_domains.isnull().values.any()

False

## Normal Sites from Alexa Top 1M

In [10]:
normal_sites = pd.read_csv("../raw-dataset/top-1m.csv", usecols=[1], names=["domain"])

print(normal_sites.size)

normal_sites.head()

581685


Unnamed: 0,domain
0,google.com
1,youtube.com
2,baidu.com
3,zhihu.com
4,taobao.com


In [11]:
normal_sites.drop_duplicates()

Unnamed: 0,domain
0,google.com
1,youtube.com
2,baidu.com
3,zhihu.com
4,taobao.com
...,...
581680,xdesirecamsx.com
581681,yourcodercamp.com
581682,yourhealthfromhome.com
581683,yunella.com


In [12]:
normal_sites.isnull().values.any()

False

### Randomly select n records from Alexa Top 1M

In [16]:
normal_sites = normal_sites.sample(n=46229)

In [17]:
normal_sites

Unnamed: 0,domain
307713,crowdyfan.com
377353,dreamaways.com
352212,worldtravelserver.com
240236,baarty.com
399659,ladamotors63.ru
...,...
570966,casinfo-maroc.com
115801,jboss.org
128730,rinknet.com
3538,unrealengine.com


## Add class variable

In [18]:
normal_sites['class'] = 0
ad_domains['class'] = 1

In [19]:
normal_sites.head()

Unnamed: 0,domain,class
307713,crowdyfan.com,0
377353,dreamaways.com,0
352212,worldtravelserver.com,0
240236,baarty.com,0
399659,ladamotors63.ru,0


### Merge Datasets

In [20]:
frames = [normal_sites, ad_domains]

merged_ds = pd.concat(frames)

print(merged_ds.size)

merged_ds.head()

184916


Unnamed: 0,domain,class
307713,crowdyfan.com,0
377353,dreamaways.com,0
352212,worldtravelserver.com,0
240236,baarty.com,0
399659,ladamotors63.ru,0


### Check if all domain names are correct

In [21]:
# Fix invalid domain names

replace = {
    "performer.api.naiadsystems.comm": "performer.api.naiadsystems.com",
    "fhits.xy": "fhits.xyz",
    "www.fhits.xy": "www.fhits.xyz",
    "cdn1.fhits.xy": "cdn1.fhits.xyz",
    "accounts.pkr.com.invalid": "accounts.pkr.com",
    "bravenet.com.invalid": "bravenet.com",
    "seeq.com.invalid": "seeq.com"
}

for item in replace:
    merged_ds = merged_ds.replace(to_replace=item, value =replace.get(item))
    
from tld import get_tld

merged_ds["domain"].apply(lambda x: get_tld(x, fix_protocol=True))

307713    com
377353    com
352212    com
240236    com
399659     ru
         ... 
2685      com
2690      net
2692      com
2698      com
2700      com
Name: domain, Length: 92458, dtype: object

In [22]:
merged_ds.to_csv("../raw-dataset/merged.csv")