## Using pydomains to Get Content Category for All the Unique Domains in comScore 2004

In [1]:
# Silence Python warnings before pydomains is imported; the import pulls in the
# TensorFlow backend (see the "Using TensorFlow backend." message below), which
# is otherwise noisy.
import warnings
warnings.filterwarnings('ignore')

import os
import pandas as pd

# Point pydomains at the data/model files hosted in its GitHub repository.
# These are set BEFORE importing pydomains so the override is already in place
# whether the package reads the variables at import time or at call time.
# NOTE(review): which of the two pydomains actually does is not visible here.
os.environ['PYDOMAINS_DATA_URL'] = 'https://raw.githubusercontent.com/themains/pydomains/master/pydomains/data/'
os.environ['PYDOMAINS_MODELS_URL'] = 'https://raw.githubusercontent.com/themains/pydomains/master/pydomains/models/'

# Explicit imports instead of `from pydomains import *`, so it is clear where
# each function used in the cells below comes from.
from pydomains import (
    dmoz_cat,
    phish_cat,
    pred_malware,
    pred_phish,
    pred_shalla,
    pred_toulouse,
    shalla_cat,
)

Using TensorFlow backend.


In [2]:
## Read the unique comScore domains for YEAR into a one-column DataFrame

YEAR = 2004

unique_domains_file = '/opt/data/comscore/unique_domains/cs%04d_unique_domains' % YEAR
df = pd.read_csv(unique_domains_file)
df.columns = ['domain_names']
# Discard rows with a missing domain name; the surviving rows keep their
# original index labels (the index is not reset here).
df = df.dropna()
df

Unnamed: 0,domain_names
0,2o7.net
1,aol.com
2,googlesyndication.com-o02
3,adultfriendfinder.com
4,adzones.com
...,...
1011141,auctionschools.com
1011142,rockgarden.com
1011143,aaarim.com
1011144,67.97.218.140


### Get the Content Category from DMOZ

In [3]:
# Annotate every domain with its DMOZ category. Per the output below, this
# loads dmoz_2016.csv.bz2 and appends the columns dmoz_2016_domain and
# dmoz_2016_cat (blank where the table shows no category).
# NOTE(review): latest=True appears to trigger a download from the server
# rather than reuse of a local copy -- confirm against the pydomains docs.
df = dmoz_cat(df, latest=True)
df

Downloading DMOZ data from the server (dmoz_2016.csv.bz2)...


100%|█████████▉| 93632.0/93670.55078125 [00:00<00:00, 147993.01KB/s]


Loading DMOZ data file...


Unnamed: 0,domain_names,dmoz_2016_domain,dmoz_2016_cat
0,2o7.net,2o7.net,
1,aol.com,aol.com,Top/World/Français/Regional/America/Canada/Bus...
2,googlesyndication.com-o02,googlesyndication.com-o02,
3,adultfriendfinder.com,adultfriendfinder.com,
4,adzones.com,adzones.com,
...,...,...,...
1011140,auctionschools.com,auctionschools.com,Top/Business/Business_Services/Auctions/Educat...
1011141,rockgarden.com,rockgarden.com,Top/Regional/North_America/United_States/Conne...
1011142,aaarim.com,aaarim.com,
1011143,67.97.218.140,67.97.218.140,


### Predict Content Category Using the Shallalist Model

In [4]:
# Predict a Shallalist category for every domain with the pretrained model
# (shalla_cat_lstm_others_2017.h5 plus its vocab and names files, per the log
# below). Appends pred_shalla_2017_domain, pred_shalla_2017_lab and one
# pred_shalla_2017_prob_<category> column per category.
# NOTE(review): latest=True appears to trigger a download from the server
# rather than reuse of a local copy -- confirm against the pydomains docs.
df = pred_shalla(df, latest=True)
df

 98%|█████████▊| 1792.0/1828.75 [00:00<00:00, 29726.95KB/s]

Downloading Shalla model data from the server (shalla_cat_lstm_others_2017.h5)...



 95%|█████████▍| 64.0/67.587890625 [00:00<00:00, 21416.58KB/s]


Downloading Shalla vocab data from the server (shalla_cat_vocab_others_2017.csv)...
Downloading Shalla names data from the server (shalla_cat_names_others_2017.csv)...


100%|█████████▉| 64.0/64.2265625 [00:00<00:00, 24472.19KB/s]


Loading Shalla model, vocab and names data file...


Unnamed: 0,domain_names,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,pred_shalla_2017_prob_education/schools,...,pred_shalla_2017_prob_recreation/restaurants,pred_shalla_2017_prob_recreation/sports,pred_shalla_2017_prob_recreation/travel,pred_shalla_2017_prob_redirector,pred_shalla_2017_prob_religion,pred_shalla_2017_prob_science/astronomy,pred_shalla_2017_prob_sex/lingerie,pred_shalla_2017_prob_shopping,pred_shalla_2017_prob_webmail,pred_shalla_2017_prob_webradio
0,2o7.net,2o7.net,,2o7.net,redirector,0.005120,2.620840e-03,0.032742,5.842830e-03,6.718438e-04,...,3.910163e-05,0.013426,0.008528,0.390709,0.000868,2.925687e-05,0.000015,0.033155,2.909468e-02,0.005894
1,aol.com,aol.com,Top/World/Français/Regional/America/Canada/Bus...,aol.com,shopping,0.008347,8.850237e-05,0.006242,1.609948e-03,8.963734e-04,...,9.629981e-04,0.080132,0.079595,0.033303,0.000802,4.554839e-05,0.000189,0.280469,5.471601e-03,0.000950
2,googlesyndication.com-o02,googlesyndication.com-o02,,googlesyndication.com-o02,porn,0.008434,5.467070e-05,0.001602,5.728951e-04,3.153301e-05,...,3.256642e-05,0.029456,0.033208,0.009262,0.001790,9.587610e-06,0.001255,0.012417,8.544709e-04,0.002441
3,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000159,7.556627e-07,0.000020,7.004297e-07,3.953016e-08,...,1.478807e-07,0.000059,0.000032,0.000089,0.000003,5.140512e-09,0.000199,0.000343,4.979767e-07,0.000001
4,adzones.com,adzones.com,,adzones.com,shopping,0.219990,7.539586e-05,0.009045,2.974079e-03,1.506220e-05,...,5.534928e-05,0.035193,0.030765,0.023270,0.000140,8.002444e-07,0.000027,0.258313,1.975485e-03,0.000357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1011140,auctionschools.com,auctionschools.com,Top/Business/Business_Services/Auctions/Educat...,auctionschools.com,shopping,0.001712,2.034779e-06,0.000807,2.467469e-05,3.583400e-04,...,1.461827e-03,0.096761,0.034968,0.004357,0.000490,1.460210e-05,0.000194,0.579539,6.142632e-04,0.000046
1011141,rockgarden.com,rockgarden.com,Top/Regional/North_America/United_States/Conne...,rockgarden.com,recreation/travel,0.003623,4.214202e-05,0.004189,1.051223e-04,4.884837e-04,...,5.067485e-03,0.096913,0.346311,0.014839,0.001353,5.144980e-04,0.000193,0.095473,5.937475e-03,0.000693
1011142,aaarim.com,aaarim.com,,aaarim.com,shopping,0.003600,7.306864e-06,0.001822,7.809890e-04,1.928807e-03,...,2.138689e-03,0.164956,0.093780,0.008003,0.000551,5.108167e-05,0.000430,0.309122,2.177618e-03,0.000128
1011143,67.97.218.140,67.97.218.140,,67.97.218.140,porn,0.020306,1.298387e-01,0.000876,6.101307e-07,4.719260e-04,...,1.281024e-05,0.000528,0.001146,0.064894,0.000040,6.944788e-06,0.000039,0.000979,4.851447e-05,0.016282


### Get the Content Category from PhishTank

In [5]:
# Annotate every domain from the PhishTank data file (phish_2017.csv.bz2, per
# the log below). Appends phish_2017_domain and phish_2017_cat; the latter is
# blank in all rows displayed here.
# NOTE(review): latest=True appears to trigger a download from the server
# rather than reuse of a local copy -- confirm against the pydomains docs.
df = phish_cat(df, latest=True)
df

  0%|          | 0.0/926.7646484375 [00:00<?, ?KB/s]

Downloading PhishTank data from the server (phish_2017.csv.bz2)...


 97%|█████████▋| 896.0/926.7646484375 [00:00<00:00, 21403.04KB/s]


Loading PhishTank data file...


Unnamed: 0,domain_names,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,pred_shalla_2017_prob_education/schools,...,pred_shalla_2017_prob_recreation/travel,pred_shalla_2017_prob_redirector,pred_shalla_2017_prob_religion,pred_shalla_2017_prob_science/astronomy,pred_shalla_2017_prob_sex/lingerie,pred_shalla_2017_prob_shopping,pred_shalla_2017_prob_webmail,pred_shalla_2017_prob_webradio,phish_2017_domain,phish_2017_cat
0,2o7.net,2o7.net,,2o7.net,redirector,0.005120,2.620840e-03,0.032742,5.842830e-03,6.718438e-04,...,0.008528,0.390709,0.000868,2.925687e-05,0.000015,0.033155,2.909468e-02,0.005894,2o7.net,
1,aol.com,aol.com,Top/World/Français/Regional/America/Canada/Bus...,aol.com,shopping,0.008347,8.850237e-05,0.006242,1.609948e-03,8.963734e-04,...,0.079595,0.033303,0.000802,4.554839e-05,0.000189,0.280469,5.471601e-03,0.000950,aol.com,
2,googlesyndication.com-o02,googlesyndication.com-o02,,googlesyndication.com-o02,porn,0.008434,5.467070e-05,0.001602,5.728951e-04,3.153301e-05,...,0.033208,0.009262,0.001790,9.587610e-06,0.001255,0.012417,8.544709e-04,0.002441,googlesyndication.com-o02,
3,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000159,7.556627e-07,0.000020,7.004297e-07,3.953016e-08,...,0.000032,0.000089,0.000003,5.140512e-09,0.000199,0.000343,4.979767e-07,0.000001,adultfriendfinder.com,
4,adzones.com,adzones.com,,adzones.com,shopping,0.219990,7.539586e-05,0.009045,2.974079e-03,1.506220e-05,...,0.030765,0.023270,0.000140,8.002444e-07,0.000027,0.258313,1.975485e-03,0.000357,adzones.com,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1011140,auctionschools.com,auctionschools.com,Top/Business/Business_Services/Auctions/Educat...,auctionschools.com,shopping,0.001712,2.034779e-06,0.000807,2.467469e-05,3.583400e-04,...,0.034968,0.004357,0.000490,1.460210e-05,0.000194,0.579539,6.142632e-04,0.000046,auctionschools.com,
1011141,rockgarden.com,rockgarden.com,Top/Regional/North_America/United_States/Conne...,rockgarden.com,recreation/travel,0.003623,4.214202e-05,0.004189,1.051223e-04,4.884837e-04,...,0.346311,0.014839,0.001353,5.144980e-04,0.000193,0.095473,5.937475e-03,0.000693,rockgarden.com,
1011142,aaarim.com,aaarim.com,,aaarim.com,shopping,0.003600,7.306864e-06,0.001822,7.809890e-04,1.928807e-03,...,0.093780,0.008003,0.000551,5.108167e-05,0.000430,0.309122,2.177618e-03,0.000128,aaarim.com,
1011143,67.97.218.140,67.97.218.140,,67.97.218.140,porn,0.020306,1.298387e-01,0.000876,6.101307e-07,4.719260e-04,...,0.001146,0.064894,0.000040,6.944788e-06,0.000039,0.000979,4.851447e-05,0.016282,67.97.218.140,


### Get the Content Category from Shallalist

In [6]:
# Annotate every domain from the Shallalist data file (shalla_2017.csv.bz2,
# per the log below). Appends shalla_2017_domain and shalla_2017_cat; a domain
# can carry several categories joined with '|' (e.g. "news|isp").
# NOTE(review): latest=True appears to trigger a download from the server
# rather than reuse of a local copy -- confirm against the pydomains docs.
df = shalla_cat(df, latest=True)
df

Downloading Shallalist data from the server (shalla_2017.csv.bz2)...


100%|█████████▉| 10432.0/10457.0244140625 [00:00<00:00, 92995.38KB/s]


Loading Shallalist data file...


Unnamed: 0,domain_names,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,pred_shalla_2017_prob_education/schools,...,pred_shalla_2017_prob_religion,pred_shalla_2017_prob_science/astronomy,pred_shalla_2017_prob_sex/lingerie,pred_shalla_2017_prob_shopping,pred_shalla_2017_prob_webmail,pred_shalla_2017_prob_webradio,phish_2017_domain,phish_2017_cat,shalla_2017_domain,shalla_2017_cat
0,2o7.net,2o7.net,,2o7.net,redirector,0.005120,2.620840e-03,0.032742,5.842830e-03,6.718438e-04,...,0.000868,2.925687e-05,0.000015,0.033155,2.909468e-02,0.005894,2o7.net,,2o7.net,tracker
1,aol.com,aol.com,Top/World/Français/Regional/America/Canada/Bus...,aol.com,shopping,0.008347,8.850237e-05,0.006242,1.609948e-03,8.963734e-04,...,0.000802,4.554839e-05,0.000189,0.280469,5.471601e-03,0.000950,aol.com,,aol.com,news|isp
2,googlesyndication.com-o02,googlesyndication.com-o02,,googlesyndication.com-o02,porn,0.008434,5.467070e-05,0.001602,5.728951e-04,3.153301e-05,...,0.001790,9.587610e-06,0.001255,0.012417,8.544709e-04,0.002441,googlesyndication.com-o02,,googlesyndication.com-o02,
3,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000159,7.556627e-07,0.000020,7.004297e-07,3.953016e-08,...,0.000003,5.140512e-09,0.000199,0.000343,4.979767e-07,0.000001,adultfriendfinder.com,,adultfriendfinder.com,porn|dating
4,adzones.com,adzones.com,,adzones.com,shopping,0.219990,7.539586e-05,0.009045,2.974079e-03,1.506220e-05,...,0.000140,8.002444e-07,0.000027,0.258313,1.975485e-03,0.000357,adzones.com,,adzones.com,adv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1011140,auctionschools.com,auctionschools.com,Top/Business/Business_Services/Auctions/Educat...,auctionschools.com,shopping,0.001712,2.034779e-06,0.000807,2.467469e-05,3.583400e-04,...,0.000490,1.460210e-05,0.000194,0.579539,6.142632e-04,0.000046,auctionschools.com,,auctionschools.com,
1011141,rockgarden.com,rockgarden.com,Top/Regional/North_America/United_States/Conne...,rockgarden.com,recreation/travel,0.003623,4.214202e-05,0.004189,1.051223e-04,4.884837e-04,...,0.001353,5.144980e-04,0.000193,0.095473,5.937475e-03,0.000693,rockgarden.com,,rockgarden.com,
1011142,aaarim.com,aaarim.com,,aaarim.com,shopping,0.003600,7.306864e-06,0.001822,7.809890e-04,1.928807e-03,...,0.000551,5.108167e-05,0.000430,0.309122,2.177618e-03,0.000128,aaarim.com,,aaarim.com,
1011143,67.97.218.140,67.97.218.140,,67.97.218.140,porn,0.020306,1.298387e-01,0.000876,6.101307e-07,4.719260e-04,...,0.000040,6.944788e-06,0.000039,0.000979,4.851447e-05,0.016282,67.97.218.140,,67.97.218.140,


### Predict Content Category Using the 2016 PhishTank Model

In [7]:
# Score every domain with the 2016 PhishTank model (phish_cat_lstm_2016.h5 and
# its vocab file, per the log below). Appends pred_phish_2016_domain,
# pred_phish_2016_lab (0/1 in the displayed rows) and pred_phish_2016_prob.
# NOTE(review): latest=True appears to trigger a download from the server
# rather than reuse of a local copy -- confirm against the pydomains docs.
df = pred_phish(df, year=2016, latest=True)
df

  0%|          | 0.0/1605.625 [00:00<?, ?KB/s]

Downloading Phishtank model data from the server (phish_cat_lstm_2016.h5)...


100%|█████████▉| 1600.0/1605.625 [00:00<00:00, 57229.36KB/s]
 96%|█████████▌| 64.0/66.5673828125 [00:00<00:00, 20829.94KB/s]


Downloading Phishtank vocab data from the server (phish_cat_vocab_2016.csv)...
Loading the Phishtank model and vocab data file...


Unnamed: 0,domain_names,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,pred_shalla_2017_prob_education/schools,...,pred_shalla_2017_prob_shopping,pred_shalla_2017_prob_webmail,pred_shalla_2017_prob_webradio,phish_2017_domain,phish_2017_cat,shalla_2017_domain,shalla_2017_cat,pred_phish_2016_domain,pred_phish_2016_lab,pred_phish_2016_prob
0,2o7.net,2o7.net,,2o7.net,redirector,0.005120,2.620840e-03,0.032742,5.842830e-03,6.718438e-04,...,0.033155,2.909468e-02,0.005894,2o7.net,,2o7.net,tracker,2o7.net,0,0.001055
1,aol.com,aol.com,Top/World/Français/Regional/America/Canada/Bus...,aol.com,shopping,0.008347,8.850237e-05,0.006242,1.609948e-03,8.963734e-04,...,0.280469,5.471601e-03,0.000950,aol.com,,aol.com,news|isp,aol.com,0,0.025806
2,googlesyndication.com-o02,googlesyndication.com-o02,,googlesyndication.com-o02,porn,0.008434,5.467070e-05,0.001602,5.728951e-04,3.153301e-05,...,0.012417,8.544709e-04,0.002441,googlesyndication.com-o02,,googlesyndication.com-o02,,googlesyndication.com-o02,1,0.858359
3,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000159,7.556627e-07,0.000020,7.004297e-07,3.953016e-08,...,0.000343,4.979767e-07,0.000001,adultfriendfinder.com,,adultfriendfinder.com,porn|dating,adultfriendfinder.com,0,0.470908
4,adzones.com,adzones.com,,adzones.com,shopping,0.219990,7.539586e-05,0.009045,2.974079e-03,1.506220e-05,...,0.258313,1.975485e-03,0.000357,adzones.com,,adzones.com,adv,adzones.com,0,0.211381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1011140,auctionschools.com,auctionschools.com,Top/Business/Business_Services/Auctions/Educat...,auctionschools.com,shopping,0.001712,2.034779e-06,0.000807,2.467469e-05,3.583400e-04,...,0.579539,6.142632e-04,0.000046,auctionschools.com,,auctionschools.com,,auctionschools.com,0,0.419289
1011141,rockgarden.com,rockgarden.com,Top/Regional/North_America/United_States/Conne...,rockgarden.com,recreation/travel,0.003623,4.214202e-05,0.004189,1.051223e-04,4.884837e-04,...,0.095473,5.937475e-03,0.000693,rockgarden.com,,rockgarden.com,,rockgarden.com,0,0.212275
1011142,aaarim.com,aaarim.com,,aaarim.com,shopping,0.003600,7.306864e-06,0.001822,7.809890e-04,1.928807e-03,...,0.309122,2.177618e-03,0.000128,aaarim.com,,aaarim.com,,aaarim.com,0,0.439096
1011143,67.97.218.140,67.97.218.140,,67.97.218.140,porn,0.020306,1.298387e-01,0.000876,6.101307e-07,4.719260e-04,...,0.000979,4.851447e-05,0.016282,67.97.218.140,,67.97.218.140,,67.97.218.140,1,0.999114


### Predict Content Category Using the 2017 PhishTank Model

In [8]:
# Score every domain with the 2017 PhishTank model (phish_cat_lstm_2017.h5 and
# its vocab file, per the log below). Appends pred_phish_2017_domain,
# pred_phish_2017_lab (0/1 in the displayed rows) and pred_phish_2017_prob,
# alongside the 2016 predictions already in the frame.
# NOTE(review): latest=True appears to trigger a download from the server
# rather than reuse of a local copy -- confirm against the pydomains docs.
df = pred_phish(df, year=2017, latest=True)
df

  0%|          | 0.0/1601.125 [00:00<?, ?KB/s]

Downloading Phishtank model data from the server (phish_cat_lstm_2017.h5)...


100%|█████████▉| 1600.0/1601.125 [00:00<00:00, 23860.42KB/s]
 96%|█████████▌| 64.0/66.5537109375 [00:00<00:00, 21981.29KB/s]


Downloading Phishtank vocab data from the server (phish_cat_vocab_2017.csv)...
Loading the Phishtank model and vocab data file...


Unnamed: 0,domain_names,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,pred_shalla_2017_prob_education/schools,...,phish_2017_domain,phish_2017_cat,shalla_2017_domain,shalla_2017_cat,pred_phish_2016_domain,pred_phish_2016_lab,pred_phish_2016_prob,pred_phish_2017_domain,pred_phish_2017_lab,pred_phish_2017_prob
0,2o7.net,2o7.net,,2o7.net,redirector,0.005120,2.620840e-03,0.032742,5.842830e-03,6.718438e-04,...,2o7.net,,2o7.net,tracker,2o7.net,0,0.001055,2o7.net,0,0.020258
1,aol.com,aol.com,Top/World/Français/Regional/America/Canada/Bus...,aol.com,shopping,0.008347,8.850237e-05,0.006242,1.609948e-03,8.963734e-04,...,aol.com,,aol.com,news|isp,aol.com,0,0.025806,aol.com,0,0.029081
2,googlesyndication.com-o02,googlesyndication.com-o02,,googlesyndication.com-o02,porn,0.008434,5.467070e-05,0.001602,5.728951e-04,3.153301e-05,...,googlesyndication.com-o02,,googlesyndication.com-o02,,googlesyndication.com-o02,1,0.858359,googlesyndication.com-o02,1,0.898066
3,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000159,7.556627e-07,0.000020,7.004297e-07,3.953016e-08,...,adultfriendfinder.com,,adultfriendfinder.com,porn|dating,adultfriendfinder.com,0,0.470908,adultfriendfinder.com,0,0.474351
4,adzones.com,adzones.com,,adzones.com,shopping,0.219990,7.539586e-05,0.009045,2.974079e-03,1.506220e-05,...,adzones.com,,adzones.com,adv,adzones.com,0,0.211381,adzones.com,0,0.106603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1011140,auctionschools.com,auctionschools.com,Top/Business/Business_Services/Auctions/Educat...,auctionschools.com,shopping,0.001712,2.034779e-06,0.000807,2.467469e-05,3.583400e-04,...,auctionschools.com,,auctionschools.com,,auctionschools.com,0,0.419289,auctionschools.com,0,0.485382
1011141,rockgarden.com,rockgarden.com,Top/Regional/North_America/United_States/Conne...,rockgarden.com,recreation/travel,0.003623,4.214202e-05,0.004189,1.051223e-04,4.884837e-04,...,rockgarden.com,,rockgarden.com,,rockgarden.com,0,0.212275,rockgarden.com,0,0.264579
1011142,aaarim.com,aaarim.com,,aaarim.com,shopping,0.003600,7.306864e-06,0.001822,7.809890e-04,1.928807e-03,...,aaarim.com,,aaarim.com,,aaarim.com,0,0.439096,aaarim.com,1,0.706647
1011143,67.97.218.140,67.97.218.140,,67.97.218.140,porn,0.020306,1.298387e-01,0.000876,6.101307e-07,4.719260e-04,...,67.97.218.140,,67.97.218.140,,67.97.218.140,1,0.999114,67.97.218.140,1,0.999489


### Predict Content Category Using the Malware Model

In [9]:
# Score every domain with the malware model (malware_cat_lstm_2017.h5 and its
# vocab file, per the log below). Appends pred_malware_2017_domain,
# pred_malware_2017_lab (0/1 in the displayed rows) and pred_malware_2017_prob.
# NOTE(review): latest=True appears to trigger a download from the server
# rather than reuse of a local copy -- confirm against the pydomains docs.
df = pred_malware(df, latest=True)
df

Downloading Malware model data from the server (malware_cat_lstm_2017.h5)...


 96%|█████████▌| 1536.0/1599.25 [00:00<00:00, 32761.17KB/s]
 96%|█████████▌| 64.0/66.529296875 [00:00<00:00, 20086.46KB/s]


Downloading Malware vocab data from the server (malware_cat_vocab_2017.csv)...
Loading Malware model and vocab data file...


Unnamed: 0,domain_names,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,pred_shalla_2017_prob_education/schools,...,shalla_2017_cat,pred_phish_2016_domain,pred_phish_2016_lab,pred_phish_2016_prob,pred_phish_2017_domain,pred_phish_2017_lab,pred_phish_2017_prob,pred_malware_2017_domain,pred_malware_2017_lab,pred_malware_2017_prob
0,2o7.net,2o7.net,,2o7.net,redirector,0.005120,2.620840e-03,0.032742,5.842830e-03,6.718438e-04,...,tracker,2o7.net,0,0.001055,2o7.net,0,0.020258,2o7.net,0,0.004661
1,aol.com,aol.com,Top/World/Français/Regional/America/Canada/Bus...,aol.com,shopping,0.008347,8.850237e-05,0.006242,1.609948e-03,8.963734e-04,...,news|isp,aol.com,0,0.025806,aol.com,0,0.029081,aol.com,0,0.015110
2,googlesyndication.com-o02,googlesyndication.com-o02,,googlesyndication.com-o02,porn,0.008434,5.467070e-05,0.001602,5.728951e-04,3.153301e-05,...,,googlesyndication.com-o02,1,0.858359,googlesyndication.com-o02,1,0.898066,googlesyndication.com-o02,1,0.856875
3,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000159,7.556627e-07,0.000020,7.004297e-07,3.953016e-08,...,porn|dating,adultfriendfinder.com,0,0.470908,adultfriendfinder.com,0,0.474351,adultfriendfinder.com,0,0.103906
4,adzones.com,adzones.com,,adzones.com,shopping,0.219990,7.539586e-05,0.009045,2.974079e-03,1.506220e-05,...,adv,adzones.com,0,0.211381,adzones.com,0,0.106603,adzones.com,0,0.073935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1011140,auctionschools.com,auctionschools.com,Top/Business/Business_Services/Auctions/Educat...,auctionschools.com,shopping,0.001712,2.034779e-06,0.000807,2.467469e-05,3.583400e-04,...,,auctionschools.com,0,0.419289,auctionschools.com,0,0.485382,auctionschools.com,0,0.312886
1011141,rockgarden.com,rockgarden.com,Top/Regional/North_America/United_States/Conne...,rockgarden.com,recreation/travel,0.003623,4.214202e-05,0.004189,1.051223e-04,4.884837e-04,...,,rockgarden.com,0,0.212275,rockgarden.com,0,0.264579,rockgarden.com,0,0.141558
1011142,aaarim.com,aaarim.com,,aaarim.com,shopping,0.003600,7.306864e-06,0.001822,7.809890e-04,1.928807e-03,...,,aaarim.com,0,0.439096,aaarim.com,1,0.706647,aaarim.com,1,0.544308
1011143,67.97.218.140,67.97.218.140,,67.97.218.140,porn,0.020306,1.298387e-01,0.000876,6.101307e-07,4.719260e-04,...,,67.97.218.140,1,0.999114,67.97.218.140,1,0.999489,67.97.218.140,1,0.994551


### Predict Content Category Using the Toulouse Model

In [10]:
# Predict a Toulouse-list category for every domain. Appends
# pred_toulouse_2017_lab and one pred_toulouse_2017_prob_<category> column per
# category (adult, bank, gambling, games, malware, others, phishing, press,
# shopping in the output below).
# Unlike the cells above, latest=True is not passed here; the log below shows
# the model, vocab and names files being read from the local cache under
# /root/.pydomains/.
df = pred_toulouse(df)
df

Using cached Toulouse model data from local (/root/.pydomains/toulouse_cat_lstm_others_2017.h5)...
Using cached Toulouse vocab data from local (/root/.pydomains/toulouse_cat_vocab_others_2017.csv)...
Using cached Toulouse names data from local (/root/.pydomains/toulouse_cat_names_others_2017.csv)...
Loading Toulouse model, vocab and names data file...


Unnamed: 0,domain_names,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,pred_shalla_2017_prob_education/schools,...,pred_toulouse_2017_lab,pred_toulouse_2017_prob_adult,pred_toulouse_2017_prob_bank,pred_toulouse_2017_prob_gambling,pred_toulouse_2017_prob_games,pred_toulouse_2017_prob_malware,pred_toulouse_2017_prob_others,pred_toulouse_2017_prob_phishing,pred_toulouse_2017_prob_press,pred_toulouse_2017_prob_shopping
0,2o7.net,2o7.net,,2o7.net,redirector,0.005120,2.620840e-03,0.032742,5.842830e-03,6.718438e-04,...,phishing,0.322767,4.563671e-03,0.000640,0.015970,3.215266e-02,0.063023,0.516860,1.681843e-03,0.042342
1,aol.com,aol.com,Top/World/Français/Regional/America/Canada/Bus...,aol.com,shopping,0.008347,8.850237e-05,0.006242,1.609948e-03,8.963734e-04,...,phishing,0.181279,2.474082e-02,0.000425,0.024904,4.963133e-02,0.109480,0.537275,6.853795e-03,0.065412
2,googlesyndication.com-o02,googlesyndication.com-o02,,googlesyndication.com-o02,porn,0.008434,5.467070e-05,0.001602,5.728951e-04,3.153301e-05,...,phishing,0.039091,3.739592e-05,0.000002,0.000146,5.406279e-03,0.009331,0.945645,3.746040e-06,0.000337
3,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000159,7.556627e-07,0.000020,7.004297e-07,3.953016e-08,...,adult,0.978048,9.402128e-07,0.000018,0.000121,7.341327e-05,0.001129,0.020390,2.107359e-05,0.000197
4,adzones.com,adzones.com,,adzones.com,shopping,0.219990,7.539586e-05,0.009045,2.974079e-03,1.506220e-05,...,adult,0.422941,3.488752e-04,0.000302,0.028832,6.782474e-03,0.187602,0.217163,6.957470e-04,0.135334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1011140,auctionschools.com,auctionschools.com,Top/Business/Business_Services/Auctions/Educat...,auctionschools.com,shopping,0.001712,2.034779e-06,0.000807,2.467469e-05,3.583400e-04,...,adult,0.431963,8.075748e-04,0.000503,0.009057,9.936339e-03,0.028745,0.140292,8.249818e-04,0.377871
1011141,rockgarden.com,rockgarden.com,Top/Regional/North_America/United_States/Conne...,rockgarden.com,recreation/travel,0.003623,4.214202e-05,0.004189,1.051223e-04,4.884837e-04,...,adult,0.587242,1.573371e-03,0.000549,0.018745,7.546314e-03,0.034288,0.087213,2.776731e-03,0.260067
1011142,aaarim.com,aaarim.com,,aaarim.com,shopping,0.003600,7.306864e-06,0.001822,7.809890e-04,1.928807e-03,...,phishing,0.276884,1.402212e-02,0.000650,0.024181,4.556582e-02,0.135822,0.306234,4.998822e-03,0.191642
1011143,67.97.218.140,67.97.218.140,,67.97.218.140,porn,0.020306,1.298387e-01,0.000876,6.101307e-07,4.719260e-04,...,adult,0.988393,5.598351e-06,0.000026,0.000031,9.452147e-07,0.003929,0.007595,1.131237e-07,0.000019


In [11]:
# Write the fully annotated frame to a bz2-compressed CSV, leaving out the index
annotated_output_file = '/opt/data/comscore/pydomains/cs%04d_unique_domains_pydomains.csv.bz2' % YEAR
df.to_csv(annotated_output_file, index=False, compression='bz2')