## An example using pydomains

In [1]:
# Silence Python warnings before importing pydomains; the import below
# triggers the "Using TensorFlow backend." message, and the filter must be
# installed first to hide TensorFlow warning messages.
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
# Import only the functions this notebook calls (instead of `import *`) so
# every name used in later cells can be traced back to its origin.
from pydomains import (
    dmoz_cat,
    phish_cat,
    pred_malware,
    pred_phish,
    pred_shalla,
    pred_toulouse,
    shalla_cat,
)


Using TensorFlow backend.


In [2]:
# Load the sample input. The displayed frame has `label` and `url` columns;
# the `url` values mix bare domains, domains with a path, and full
# http(s):// URLs.
df = pd.read_csv('input-header.csv')
df

Unnamed: 0,label,url
0,test1,topshop.com
1,test2,beyondrelief.com
2,test3,golf-tours.com/test
3,test4,thegayhotel.com
4,test5,https://zonasequravlabcp.com/bcp/
5,test6,http://privatix.xyz
6,test7,adultfriendfinder.com
7,test8,giftregistrylocator.com
8,test9,bangbrosonline.com
9,test10,scotland-info.co.uk


### Get the Content Category from DMOZ

In [3]:
# Run the DMOZ lookup on the `url` column. In the output below the result
# gains `dmoz_2016_domain` (e.g. `golf-tours.com/test` -> `golf-tours.com`)
# and `dmoz_2016_cat`, which is empty for most rows of this sample.
# Note: `df` is rebound, so later cells see the added columns.
df = dmoz_cat(df, domain_names='url')
df

Using cached DMOZ data from local (/root/.pydomains/dmoz_2016.csv.bz2)...
Loading DMOZ data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...
1,test2,beyondrelief.com,beyondrelief.com,
2,test3,golf-tours.com/test,golf-tours.com,
3,test4,thegayhotel.com,thegayhotel.com,
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,
5,test6,http://privatix.xyz,privatix.xyz,
6,test7,adultfriendfinder.com,adultfriendfinder.com,
7,test8,giftregistrylocator.com,giftregistrylocator.com,
8,test9,bangbrosonline.com,bangbrosonline.com,
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...


### Predict Content Category Using the Shallalist Model

In [4]:
# Run the Shallalist prediction model on the `url` column. The output below
# shows new `pred_shalla_2017_domain` and `pred_shalla_2017_lab` columns plus
# one `pred_shalla_2017_prob_<category>` column per category.
# NOTE(review): `_lab` is presumably the highest-probability category — confirm.
df = pred_shalla(df, domain_names='url')
df

Using cached Shalla model data from local (/root/.pydomains/shalla_cat_lstm_others_2017.h5)...
Using cached Shalla vocab data from local (/root/.pydomains/shalla_cat_vocab_others_2017.csv)...
Using cached Shalla names data from local (/root/.pydomains/shalla_cat_names_others_2017.csv)...
Loading Shalla model, vocab and names data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,...,pred_shalla_2017_prob_recreation/restaurants,pred_shalla_2017_prob_recreation/sports,pred_shalla_2017_prob_recreation/travel,pred_shalla_2017_prob_redirector,pred_shalla_2017_prob_religion,pred_shalla_2017_prob_science/astronomy,pred_shalla_2017_prob_sex/lingerie,pred_shalla_2017_prob_shopping,pred_shalla_2017_prob_webmail,pred_shalla_2017_prob_webradio
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,topshop.com,shopping,0.002063,7.398885e-06,0.000658,0.0001041324,...,3.256308e-05,0.002037,0.001086,0.00356,7.164153e-05,2.728982e-05,0.000466,0.817349,0.000177,6.256664e-05
1,test2,beyondrelief.com,beyondrelief.com,,beyondrelief.com,shopping,0.001067,1.734273e-06,0.002037,2.552577e-05,...,0.001726924,0.079913,0.095993,0.001458,0.00783806,3.155856e-05,0.000923,0.479174,0.001244,0.0001356539
2,test3,golf-tours.com/test,golf-tours.com,,golf-tours.com,recreation/sports,1e-06,7.501002e-09,3e-06,1.687661e-08,...,5.252025e-06,0.566369,0.431345,9e-06,1.708439e-06,2.534468e-06,1e-06,0.001196,6e-06,4.243939e-07
3,test4,thegayhotel.com,thegayhotel.com,,thegayhotel.com,porn,1.6e-05,6.654504e-07,5.4e-05,6.977853e-07,...,8.457365e-06,0.000197,0.319903,2e-05,3.024916e-06,1.928131e-07,0.000406,0.000124,1.4e-05,2.568717e-06
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,,zonasequravlabcp.com,recreation/sports,0.000667,9.059212e-07,0.000379,4.503229e-06,...,0.001468938,0.364249,0.250593,0.003024,0.001249581,2.011401e-05,0.000275,0.037188,0.001155,0.0001711679
5,test6,http://privatix.xyz,privatix.xyz,,privatix.xyz,porn,0.001434,0.00080705,0.000117,4.913021e-05,...,8.918409e-07,0.000113,9.3e-05,0.013645,4.655251e-07,1.950781e-07,0.00018,0.000425,4.4e-05,2.799953e-05
6,test7,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000148,1.290912e-06,6e-06,2.791606e-07,...,1.391122e-07,8.9e-05,7.7e-05,7.5e-05,4.755702e-07,4.942816e-08,7.6e-05,0.000451,1.5e-05,2.744758e-06
7,test8,giftregistrylocator.com,giftregistrylocator.com,,giftregistrylocator.com,shopping,0.000476,2.592254e-09,7e-06,8.599612e-07,...,5.07365e-07,0.000456,0.000456,0.000306,4.377169e-05,7.478947e-06,8e-06,0.983568,2.1e-05,7.626026e-08
8,test9,bangbrosonline.com,bangbrosonline.com,,bangbrosonline.com,porn,0.003574,3.734851e-05,0.000422,5.783453e-05,...,9.966338e-05,0.005543,0.003086,0.001471,7.022765e-06,8.292021e-06,9.3e-05,0.014624,0.000261,6.008491e-05
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,scotland-info.co.uk,recreation/travel,0.000246,2.537992e-07,0.000179,2.365065e-06,...,0.001624175,0.099556,0.633067,0.000227,0.0007668439,0.0001490731,6.1e-05,0.159815,5.9e-05,7.84806e-05


### Get the Content Category from PhishTank

In [5]:
# Run the PhishTank lookup on the `url` column. The output below shows new
# `phish_2017_domain` and `phish_2017_cat` columns; `phish_2017_cat` is `yes`
# for two rows of this sample and empty for the rest.
df = phish_cat(df, domain_names='url')
df

Using cached PhishTank data from local (/root/.pydomains/phish_2017.csv.bz2)...
Loading PhishTank data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,...,pred_shalla_2017_prob_recreation/travel,pred_shalla_2017_prob_redirector,pred_shalla_2017_prob_religion,pred_shalla_2017_prob_science/astronomy,pred_shalla_2017_prob_sex/lingerie,pred_shalla_2017_prob_shopping,pred_shalla_2017_prob_webmail,pred_shalla_2017_prob_webradio,phish_2017_domain,phish_2017_cat
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,topshop.com,shopping,0.002063,7.398885e-06,0.000658,0.0001041324,...,0.001086,0.00356,7.164153e-05,2.728982e-05,0.000466,0.817349,0.000177,6.256664e-05,topshop.com,
1,test2,beyondrelief.com,beyondrelief.com,,beyondrelief.com,shopping,0.001067,1.734273e-06,0.002037,2.552577e-05,...,0.095993,0.001458,0.00783806,3.155856e-05,0.000923,0.479174,0.001244,0.0001356539,beyondrelief.com,
2,test3,golf-tours.com/test,golf-tours.com,,golf-tours.com,recreation/sports,1e-06,7.501002e-09,3e-06,1.687661e-08,...,0.431345,9e-06,1.708439e-06,2.534468e-06,1e-06,0.001196,6e-06,4.243939e-07,golf-tours.com,
3,test4,thegayhotel.com,thegayhotel.com,,thegayhotel.com,porn,1.6e-05,6.654504e-07,5.4e-05,6.977853e-07,...,0.319903,2e-05,3.024916e-06,1.928131e-07,0.000406,0.000124,1.4e-05,2.568717e-06,thegayhotel.com,
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,,zonasequravlabcp.com,recreation/sports,0.000667,9.059212e-07,0.000379,4.503229e-06,...,0.250593,0.003024,0.001249581,2.011401e-05,0.000275,0.037188,0.001155,0.0001711679,zonasequravlabcp.com,yes
5,test6,http://privatix.xyz,privatix.xyz,,privatix.xyz,porn,0.001434,0.00080705,0.000117,4.913021e-05,...,9.3e-05,0.013645,4.655251e-07,1.950781e-07,0.00018,0.000425,4.4e-05,2.799953e-05,privatix.xyz,yes
6,test7,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000148,1.290912e-06,6e-06,2.791606e-07,...,7.7e-05,7.5e-05,4.755702e-07,4.942816e-08,7.6e-05,0.000451,1.5e-05,2.744758e-06,adultfriendfinder.com,
7,test8,giftregistrylocator.com,giftregistrylocator.com,,giftregistrylocator.com,shopping,0.000476,2.592254e-09,7e-06,8.599612e-07,...,0.000456,0.000306,4.377169e-05,7.478947e-06,8e-06,0.983568,2.1e-05,7.626026e-08,giftregistrylocator.com,
8,test9,bangbrosonline.com,bangbrosonline.com,,bangbrosonline.com,porn,0.003574,3.734851e-05,0.000422,5.783453e-05,...,0.003086,0.001471,7.022765e-06,8.292021e-06,9.3e-05,0.014624,0.000261,6.008491e-05,bangbrosonline.com,
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,scotland-info.co.uk,recreation/travel,0.000246,2.537992e-07,0.000179,2.365065e-06,...,0.633067,0.000227,0.0007668439,0.0001490731,6.1e-05,0.159815,5.9e-05,7.84806e-05,scotland-info.co.uk,


### Get the Content Category from Shallalist

In [6]:
# Run the Shallalist lookup on the `url` column. The output below shows new
# `shalla_2017_domain` and `shalla_2017_cat` columns; when a domain has more
# than one category they appear joined with `|` (e.g. `porn|dating`).
df = shalla_cat(df, domain_names='url')
df

Using cached Shallalist data from local (/root/.pydomains/shalla_2017.csv.bz2)...
Loading Shallalist data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,...,pred_shalla_2017_prob_religion,pred_shalla_2017_prob_science/astronomy,pred_shalla_2017_prob_sex/lingerie,pred_shalla_2017_prob_shopping,pred_shalla_2017_prob_webmail,pred_shalla_2017_prob_webradio,phish_2017_domain,phish_2017_cat,shalla_2017_domain,shalla_2017_cat
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,topshop.com,shopping,0.002063,7.398885e-06,0.000658,0.0001041324,...,7.164153e-05,2.728982e-05,0.000466,0.817349,0.000177,6.256664e-05,topshop.com,,topshop.com,
1,test2,beyondrelief.com,beyondrelief.com,,beyondrelief.com,shopping,0.001067,1.734273e-06,0.002037,2.552577e-05,...,0.00783806,3.155856e-05,0.000923,0.479174,0.001244,0.0001356539,beyondrelief.com,,beyondrelief.com,shopping|recreation/sports
2,test3,golf-tours.com/test,golf-tours.com,,golf-tours.com,recreation/sports,1e-06,7.501002e-09,3e-06,1.687661e-08,...,1.708439e-06,2.534468e-06,1e-06,0.001196,6e-06,4.243939e-07,golf-tours.com,,golf-tours.com,recreation/sports|recreation/travel
3,test4,thegayhotel.com,thegayhotel.com,,thegayhotel.com,porn,1.6e-05,6.654504e-07,5.4e-05,6.977853e-07,...,3.024916e-06,1.928131e-07,0.000406,0.000124,1.4e-05,2.568717e-06,thegayhotel.com,,thegayhotel.com,porn|recreation/travel
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,,zonasequravlabcp.com,recreation/sports,0.000667,9.059212e-07,0.000379,4.503229e-06,...,0.001249581,2.011401e-05,0.000275,0.037188,0.001155,0.0001711679,zonasequravlabcp.com,yes,zonasequravlabcp.com,
5,test6,http://privatix.xyz,privatix.xyz,,privatix.xyz,porn,0.001434,0.00080705,0.000117,4.913021e-05,...,4.655251e-07,1.950781e-07,0.00018,0.000425,4.4e-05,2.799953e-05,privatix.xyz,yes,privatix.xyz,
6,test7,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000148,1.290912e-06,6e-06,2.791606e-07,...,4.755702e-07,4.942816e-08,7.6e-05,0.000451,1.5e-05,2.744758e-06,adultfriendfinder.com,,adultfriendfinder.com,porn|dating
7,test8,giftregistrylocator.com,giftregistrylocator.com,,giftregistrylocator.com,shopping,0.000476,2.592254e-09,7e-06,8.599612e-07,...,4.377169e-05,7.478947e-06,8e-06,0.983568,2.1e-05,7.626026e-08,giftregistrylocator.com,,giftregistrylocator.com,
8,test9,bangbrosonline.com,bangbrosonline.com,,bangbrosonline.com,porn,0.003574,3.734851e-05,0.000422,5.783453e-05,...,7.022765e-06,8.292021e-06,9.3e-05,0.014624,0.000261,6.008491e-05,bangbrosonline.com,,bangbrosonline.com,porn
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,scotland-info.co.uk,recreation/travel,0.000246,2.537992e-07,0.000179,2.365065e-06,...,0.0007668439,0.0001490731,6.1e-05,0.159815,5.9e-05,7.84806e-05,scotland-info.co.uk,,scotland-info.co.uk,recreation/travel


### Predict Content Category Using the 2016 PhishTank Model

In [7]:
# Run the PhishTank prediction model selected by `year=2016`. The output
# below shows new `pred_phish_2016_domain`, `pred_phish_2016_lab` (0/1) and
# `pred_phish_2016_prob` columns.
# NOTE(review): in this sample the label is 1 exactly where prob > 0.5 —
# confirm the threshold against the pydomains docs.
df = pred_phish(df, year=2016, domain_names='url')
df

Using cached Phishtank model data from local (/root/.pydomains/phish_cat_lstm_2016.h5)...
Using the cached Phishtank vocab data from local (/root/.pydomains/phish_cat_vocab_2016.csv)...
Loading the Phishtank model and vocab data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,...,pred_shalla_2017_prob_shopping,pred_shalla_2017_prob_webmail,pred_shalla_2017_prob_webradio,phish_2017_domain,phish_2017_cat,shalla_2017_domain,shalla_2017_cat,pred_phish_2016_domain,pred_phish_2016_lab,pred_phish_2016_prob
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,topshop.com,shopping,0.002063,7.398885e-06,0.000658,0.0001041324,...,0.817349,0.000177,6.256664e-05,topshop.com,,topshop.com,,topshop.com,0,0.133253
1,test2,beyondrelief.com,beyondrelief.com,,beyondrelief.com,shopping,0.001067,1.734273e-06,0.002037,2.552577e-05,...,0.479174,0.001244,0.0001356539,beyondrelief.com,,beyondrelief.com,shopping|recreation/sports,beyondrelief.com,1,0.910623
2,test3,golf-tours.com/test,golf-tours.com,,golf-tours.com,recreation/sports,1e-06,7.501002e-09,3e-06,1.687661e-08,...,0.001196,6e-06,4.243939e-07,golf-tours.com,,golf-tours.com,recreation/sports|recreation/travel,golf-tours.com,0,0.493226
3,test4,thegayhotel.com,thegayhotel.com,,thegayhotel.com,porn,1.6e-05,6.654504e-07,5.4e-05,6.977853e-07,...,0.000124,1.4e-05,2.568717e-06,thegayhotel.com,,thegayhotel.com,porn|recreation/travel,thegayhotel.com,1,0.755859
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,,zonasequravlabcp.com,recreation/sports,0.000667,9.059212e-07,0.000379,4.503229e-06,...,0.037188,0.001155,0.0001711679,zonasequravlabcp.com,yes,zonasequravlabcp.com,,zonasequravlabcp.com,1,0.980653
5,test6,http://privatix.xyz,privatix.xyz,,privatix.xyz,porn,0.001434,0.00080705,0.000117,4.913021e-05,...,0.000425,4.4e-05,2.799953e-05,privatix.xyz,yes,privatix.xyz,,privatix.xyz,1,0.590286
6,test7,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000148,1.290912e-06,6e-06,2.791606e-07,...,0.000451,1.5e-05,2.744758e-06,adultfriendfinder.com,,adultfriendfinder.com,porn|dating,adultfriendfinder.com,0,0.460745
7,test8,giftregistrylocator.com,giftregistrylocator.com,,giftregistrylocator.com,shopping,0.000476,2.592254e-09,7e-06,8.599612e-07,...,0.983568,2.1e-05,7.626026e-08,giftregistrylocator.com,,giftregistrylocator.com,,giftregistrylocator.com,1,0.785598
8,test9,bangbrosonline.com,bangbrosonline.com,,bangbrosonline.com,porn,0.003574,3.734851e-05,0.000422,5.783453e-05,...,0.014624,0.000261,6.008491e-05,bangbrosonline.com,,bangbrosonline.com,porn,bangbrosonline.com,0,0.405968
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,scotland-info.co.uk,recreation/travel,0.000246,2.537992e-07,0.000179,2.365065e-06,...,0.159815,5.9e-05,7.84806e-05,scotland-info.co.uk,,scotland-info.co.uk,recreation/travel,scotland-info.co.uk,1,0.93354


### Predict Content Category Using the 2017 PhishTank Model

In [8]:
# Same call as the 2016 cell but with `year=2017`. The output below shows new
# `pred_phish_2017_domain`, `pred_phish_2017_lab` (0/1) and
# `pred_phish_2017_prob` columns next to the 2016 ones for comparison.
df = pred_phish(df, year=2017, domain_names='url')
df

Using cached Phishtank model data from local (/root/.pydomains/phish_cat_lstm_2017.h5)...
Using the cached Phishtank vocab data from local (/root/.pydomains/phish_cat_vocab_2017.csv)...
Loading the Phishtank model and vocab data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,...,phish_2017_domain,phish_2017_cat,shalla_2017_domain,shalla_2017_cat,pred_phish_2016_domain,pred_phish_2016_lab,pred_phish_2016_prob,pred_phish_2017_domain,pred_phish_2017_lab,pred_phish_2017_prob
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,topshop.com,shopping,0.002063,7.398885e-06,0.000658,0.0001041324,...,topshop.com,,topshop.com,,topshop.com,0,0.133253,topshop.com,0,0.146236
1,test2,beyondrelief.com,beyondrelief.com,,beyondrelief.com,shopping,0.001067,1.734273e-06,0.002037,2.552577e-05,...,beyondrelief.com,,beyondrelief.com,shopping|recreation/sports,beyondrelief.com,1,0.910623,beyondrelief.com,1,0.555731
2,test3,golf-tours.com/test,golf-tours.com,,golf-tours.com,recreation/sports,1e-06,7.501002e-09,3e-06,1.687661e-08,...,golf-tours.com,,golf-tours.com,recreation/sports|recreation/travel,golf-tours.com,0,0.493226,golf-tours.com,0,0.33051
3,test4,thegayhotel.com,thegayhotel.com,,thegayhotel.com,porn,1.6e-05,6.654504e-07,5.4e-05,6.977853e-07,...,thegayhotel.com,,thegayhotel.com,porn|recreation/travel,thegayhotel.com,1,0.755859,thegayhotel.com,1,0.567629
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,,zonasequravlabcp.com,recreation/sports,0.000667,9.059212e-07,0.000379,4.503229e-06,...,zonasequravlabcp.com,yes,zonasequravlabcp.com,,zonasequravlabcp.com,1,0.980653,zonasequravlabcp.com,1,0.999658
5,test6,http://privatix.xyz,privatix.xyz,,privatix.xyz,porn,0.001434,0.00080705,0.000117,4.913021e-05,...,privatix.xyz,yes,privatix.xyz,,privatix.xyz,1,0.590286,privatix.xyz,1,0.741686
6,test7,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000148,1.290912e-06,6e-06,2.791606e-07,...,adultfriendfinder.com,,adultfriendfinder.com,porn|dating,adultfriendfinder.com,0,0.460745,adultfriendfinder.com,0,0.397522
7,test8,giftregistrylocator.com,giftregistrylocator.com,,giftregistrylocator.com,shopping,0.000476,2.592254e-09,7e-06,8.599612e-07,...,giftregistrylocator.com,,giftregistrylocator.com,,giftregistrylocator.com,1,0.785598,giftregistrylocator.com,1,0.73852
8,test9,bangbrosonline.com,bangbrosonline.com,,bangbrosonline.com,porn,0.003574,3.734851e-05,0.000422,5.783453e-05,...,bangbrosonline.com,,bangbrosonline.com,porn,bangbrosonline.com,0,0.405968,bangbrosonline.com,0,0.385076
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,scotland-info.co.uk,recreation/travel,0.000246,2.537992e-07,0.000179,2.365065e-06,...,scotland-info.co.uk,,scotland-info.co.uk,recreation/travel,scotland-info.co.uk,1,0.93354,scotland-info.co.uk,1,0.850082


### Predict Content Category Using the Malware Model

In [9]:
# Run the malware prediction model on the `url` column. The output below
# shows new `pred_malware_2017_domain`, `pred_malware_2017_lab` (0/1) and
# `pred_malware_2017_prob` columns.
df = pred_malware(df, domain_names='url')
df

Using cached Malware model data from local (/root/.pydomains/malware_cat_lstm_2017.h5)...
Using cached Malware vocab data from local (/root/.pydomains/malware_cat_vocab_2017.csv)...
Loading Malware model and vocab data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,...,shalla_2017_cat,pred_phish_2016_domain,pred_phish_2016_lab,pred_phish_2016_prob,pred_phish_2017_domain,pred_phish_2017_lab,pred_phish_2017_prob,pred_malware_2017_domain,pred_malware_2017_lab,pred_malware_2017_prob
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,topshop.com,shopping,0.002063,7.398885e-06,0.000658,0.0001041324,...,,topshop.com,0,0.133253,topshop.com,0,0.146236,topshop.com,0,0.117508
1,test2,beyondrelief.com,beyondrelief.com,,beyondrelief.com,shopping,0.001067,1.734273e-06,0.002037,2.552577e-05,...,shopping|recreation/sports,beyondrelief.com,1,0.910623,beyondrelief.com,1,0.555731,beyondrelief.com,0,0.410566
2,test3,golf-tours.com/test,golf-tours.com,,golf-tours.com,recreation/sports,1e-06,7.501002e-09,3e-06,1.687661e-08,...,recreation/sports|recreation/travel,golf-tours.com,0,0.493226,golf-tours.com,0,0.33051,golf-tours.com,0,0.190595
3,test4,thegayhotel.com,thegayhotel.com,,thegayhotel.com,porn,1.6e-05,6.654504e-07,5.4e-05,6.977853e-07,...,porn|recreation/travel,thegayhotel.com,1,0.755859,thegayhotel.com,1,0.567629,thegayhotel.com,0,0.238392
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,,zonasequravlabcp.com,recreation/sports,0.000667,9.059212e-07,0.000379,4.503229e-06,...,,zonasequravlabcp.com,1,0.980653,zonasequravlabcp.com,1,0.999658,zonasequravlabcp.com,1,0.939674
5,test6,http://privatix.xyz,privatix.xyz,,privatix.xyz,porn,0.001434,0.00080705,0.000117,4.913021e-05,...,,privatix.xyz,1,0.590286,privatix.xyz,1,0.741686,privatix.xyz,0,0.127168
6,test7,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000148,1.290912e-06,6e-06,2.791606e-07,...,porn|dating,adultfriendfinder.com,0,0.460745,adultfriendfinder.com,0,0.397522,adultfriendfinder.com,0,0.155892
7,test8,giftregistrylocator.com,giftregistrylocator.com,,giftregistrylocator.com,shopping,0.000476,2.592254e-09,7e-06,8.599612e-07,...,,giftregistrylocator.com,1,0.785598,giftregistrylocator.com,1,0.73852,giftregistrylocator.com,1,0.653701
8,test9,bangbrosonline.com,bangbrosonline.com,,bangbrosonline.com,porn,0.003574,3.734851e-05,0.000422,5.783453e-05,...,porn,bangbrosonline.com,0,0.405968,bangbrosonline.com,0,0.385076,bangbrosonline.com,0,0.156366
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,scotland-info.co.uk,recreation/travel,0.000246,2.537992e-07,0.000179,2.365065e-06,...,recreation/travel,scotland-info.co.uk,1,0.93354,scotland-info.co.uk,1,0.850082,scotland-info.co.uk,0,0.411318


### Predict Content Category Using the Toulouse Model

In [10]:
# Run the Toulouse prediction model on the `url` column. The output below
# shows one `pred_toulouse_2017_prob_<category>` column per category; the
# `pred_toulouse_2017_lab` column selected in the final summary cell is
# presumably added here too (hidden by the `...` truncation) — confirm.
df = pred_toulouse(df, domain_names='url')
df

Using cached Toulouse model data from local (/root/.pydomains/toulouse_cat_lstm_others_2017.h5)...
Using cached Toulouse vocab data from local (/root/.pydomains/toulouse_cat_vocab_others_2017.csv)...
Using cached Toulouse names data from local (/root/.pydomains/toulouse_cat_names_others_2017.csv)...
Loading Toulouse model, vocab and names data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,...,pred_toulouse_2017_prob_audio-video,pred_toulouse_2017_prob_bank,pred_toulouse_2017_prob_gambling,pred_toulouse_2017_prob_games,pred_toulouse_2017_prob_malware,pred_toulouse_2017_prob_others,pred_toulouse_2017_prob_phishing,pred_toulouse_2017_prob_press,pred_toulouse_2017_prob_publicite,pred_toulouse_2017_prob_shopping
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,topshop.com,shopping,0.002063,7.398885e-06,0.000658,0.0001041324,...,0.003793,0.0001161209,0.000291161,0.002073,0.003976,0.014862,0.112132,0.0008404782,0.000761,0.727203
1,test2,beyondrelief.com,beyondrelief.com,,beyondrelief.com,shopping,0.001067,1.734273e-06,0.002037,2.552577e-05,...,0.016359,0.003912277,0.006484169,0.022408,0.018371,0.046011,0.172208,0.02525989,0.002821,0.164577
2,test3,golf-tours.com/test,golf-tours.com,,golf-tours.com,recreation/sports,1e-06,7.501002e-09,3e-06,1.687661e-08,...,0.008208,0.001783388,0.0008022182,0.013352,0.006392,0.021287,0.060633,0.01853484,0.00099,0.681934
3,test4,thegayhotel.com,thegayhotel.com,,thegayhotel.com,porn,1.6e-05,6.654504e-07,5.4e-05,6.977853e-07,...,0.00108,8.920376e-05,6.256416e-05,0.000713,0.000934,0.005018,0.017201,0.0002208831,0.000135,0.003094
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,,zonasequravlabcp.com,recreation/sports,0.000667,9.059212e-07,0.000379,4.503229e-06,...,0.001063,0.0006226784,0.0001073761,0.012431,0.077391,0.031691,0.416989,0.00279639,0.000284,0.391121
5,test6,http://privatix.xyz,privatix.xyz,,privatix.xyz,porn,0.001434,0.00080705,0.000117,4.913021e-05,...,0.002241,6.823017e-07,1.969112e-06,0.001021,0.004949,0.003069,0.002094,4.559168e-06,0.000252,3.8e-05
6,test7,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000148,1.290912e-06,6e-06,2.791606e-07,...,0.000211,1.742063e-07,6.485795e-08,4.4e-05,5.9e-05,0.001674,0.058497,1.133889e-07,7e-06,6.6e-05
7,test8,giftregistrylocator.com,giftregistrylocator.com,,giftregistrylocator.com,shopping,0.000476,2.592254e-09,7e-06,8.599612e-07,...,0.00057,0.0003973926,1.019526e-05,0.004112,0.016339,0.015631,0.131174,0.01115336,0.000436,0.805531
8,test9,bangbrosonline.com,bangbrosonline.com,,bangbrosonline.com,porn,0.003574,3.734851e-05,0.000422,5.783453e-05,...,0.004017,9.122134e-05,0.0001142885,0.002216,0.000422,0.017964,0.012573,0.0005098382,0.000785,0.015817
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,scotland-info.co.uk,recreation/travel,0.000246,2.537992e-07,0.000179,2.365065e-06,...,0.003745,0.0003962543,0.000497739,0.014452,0.006615,0.057622,0.111698,0.000733116,0.000168,0.547802


In [11]:
# NOTE(review): this cell is identical to the previous one (In [10]) and its
# recorded output is the same; it re-runs the Toulouse model on a frame that
# already holds the Toulouse columns. Candidate for removal.
df = pred_toulouse(df, domain_names='url')
df

Using cached Toulouse model data from local (/root/.pydomains/toulouse_cat_lstm_others_2017.h5)...
Using cached Toulouse vocab data from local (/root/.pydomains/toulouse_cat_vocab_others_2017.csv)...
Using cached Toulouse names data from local (/root/.pydomains/toulouse_cat_names_others_2017.csv)...
Loading Toulouse model, vocab and names data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,...,pred_toulouse_2017_prob_audio-video,pred_toulouse_2017_prob_bank,pred_toulouse_2017_prob_gambling,pred_toulouse_2017_prob_games,pred_toulouse_2017_prob_malware,pred_toulouse_2017_prob_others,pred_toulouse_2017_prob_phishing,pred_toulouse_2017_prob_press,pred_toulouse_2017_prob_publicite,pred_toulouse_2017_prob_shopping
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,topshop.com,shopping,0.002063,7.398885e-06,0.000658,0.0001041324,...,0.003793,0.0001161209,0.000291161,0.002073,0.003976,0.014862,0.112132,0.0008404782,0.000761,0.727203
1,test2,beyondrelief.com,beyondrelief.com,,beyondrelief.com,shopping,0.001067,1.734273e-06,0.002037,2.552577e-05,...,0.016359,0.003912277,0.006484169,0.022408,0.018371,0.046011,0.172208,0.02525989,0.002821,0.164577
2,test3,golf-tours.com/test,golf-tours.com,,golf-tours.com,recreation/sports,1e-06,7.501002e-09,3e-06,1.687661e-08,...,0.008208,0.001783388,0.0008022182,0.013352,0.006392,0.021287,0.060633,0.01853484,0.00099,0.681934
3,test4,thegayhotel.com,thegayhotel.com,,thegayhotel.com,porn,1.6e-05,6.654504e-07,5.4e-05,6.977853e-07,...,0.00108,8.920376e-05,6.256416e-05,0.000713,0.000934,0.005018,0.017201,0.0002208831,0.000135,0.003094
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,,zonasequravlabcp.com,recreation/sports,0.000667,9.059212e-07,0.000379,4.503229e-06,...,0.001063,0.0006226784,0.0001073761,0.012431,0.077391,0.031691,0.416989,0.00279639,0.000284,0.391121
5,test6,http://privatix.xyz,privatix.xyz,,privatix.xyz,porn,0.001434,0.00080705,0.000117,4.913021e-05,...,0.002241,6.823017e-07,1.969112e-06,0.001021,0.004949,0.003069,0.002094,4.559168e-06,0.000252,3.8e-05
6,test7,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000148,1.290912e-06,6e-06,2.791606e-07,...,0.000211,1.742063e-07,6.485795e-08,4.4e-05,5.9e-05,0.001674,0.058497,1.133889e-07,7e-06,6.6e-05
7,test8,giftregistrylocator.com,giftregistrylocator.com,,giftregistrylocator.com,shopping,0.000476,2.592254e-09,7e-06,8.599612e-07,...,0.00057,0.0003973926,1.019526e-05,0.004112,0.016339,0.015631,0.131174,0.01115336,0.000436,0.805531
8,test9,bangbrosonline.com,bangbrosonline.com,,bangbrosonline.com,porn,0.003574,3.734851e-05,0.000422,5.783453e-05,...,0.004017,9.122134e-05,0.0001142885,0.002216,0.000422,0.017964,0.012573,0.0005098382,0.000785,0.015817
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,scotland-info.co.uk,recreation/travel,0.000246,2.537992e-07,0.000179,2.365065e-06,...,0.003745,0.0003962543,0.000497739,0.014452,0.006615,0.057622,0.111698,0.000733116,0.000168,0.547802


In [12]:
# Side-by-side summary: the input URL, the three lookup-based categories,
# and the label predicted by each of the five models.
summary_columns = [
    'url',
    # categories found by direct lookup
    'dmoz_2016_cat',
    'shalla_2017_cat',
    'phish_2017_cat',
    # labels produced by the prediction models
    'pred_shalla_2017_lab',
    'pred_phish_2016_lab',
    'pred_phish_2017_lab',
    'pred_malware_2017_lab',
    'pred_toulouse_2017_lab',
]
df[summary_columns]

Unnamed: 0,url,dmoz_2016_cat,shalla_2017_cat,phish_2017_cat,pred_shalla_2017_lab,pred_phish_2016_lab,pred_phish_2017_lab,pred_malware_2017_lab,pred_toulouse_2017_lab
0,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,,,shopping,0,0,0,shopping
1,beyondrelief.com,,shopping|recreation/sports,,shopping,1,1,0,adult
2,golf-tours.com/test,,recreation/sports|recreation/travel,,recreation/sports,0,0,0,shopping
3,thegayhotel.com,,porn|recreation/travel,,porn,1,1,0,adult
4,https://zonasequravlabcp.com/bcp/,,,yes,recreation/sports,1,1,1,phishing
5,http://privatix.xyz,,,yes,porn,1,1,0,adult
6,adultfriendfinder.com,,porn|dating,,porn,0,0,0,adult
7,giftregistrylocator.com,,,,shopping,1,1,1,shopping
8,bangbrosonline.com,,porn,,porn,0,0,0,adult
9,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,recreation/travel,,recreation/travel,1,1,0,shopping
