## An example using pydomains

In [1]:
# to hide Tensorflow warning messages
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from pydomains import *


Using TensorFlow backend.


In [2]:
df = pd.read_csv('input-header.csv')
df

Unnamed: 0,label,url
0,test1,topshop.com
1,test2,beyondrelief.com
2,test3,golf-tours.com/test
3,test4,thegayhotel.com
4,test5,https://zonasequravlabcp.com/bcp/
5,test6,http://privatix.xyz
6,test7,adultfriendfinder.com
7,test8,giftregistrylocator.com
8,test9,bangbrosonline.com
9,test10,scotland-info.co.uk


### Get the Content Category from DMOZ

In [3]:
df = dmoz_cat(df, domain_names='url')
df

Using cached DMOZ data from local (/root/.pydomains/dmoz_2016.csv.bz2)...
Loading DMOZ data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...
1,test2,beyondrelief.com,beyondrelief.com,
2,test3,golf-tours.com/test,golf-tours.com,
3,test4,thegayhotel.com,thegayhotel.com,
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,
5,test6,http://privatix.xyz,privatix.xyz,
6,test7,adultfriendfinder.com,adultfriendfinder.com,
7,test8,giftregistrylocator.com,giftregistrylocator.com,
8,test9,bangbrosonline.com,bangbrosonline.com,
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...


### Predict Content Category Using the Shallalist Model

In [4]:
df = pred_shalla(df, domain_names='url')
df

Using cached Shalla model data from local (/root/.pydomains/shalla_cat_lstm_others_2017.h5)...
Using cached Shalla vocab data from local (/root/.pydomains/shalla_cat_vocab_others_2017.csv)...
Using cached Shalla names data from local (/root/.pydomains/shalla_cat_names_others_2017.csv)...
Loading Shalla model, vocab and names data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,...,pred_shalla_2017_prob_recreation/restaurants,pred_shalla_2017_prob_recreation/sports,pred_shalla_2017_prob_recreation/travel,pred_shalla_2017_prob_redirector,pred_shalla_2017_prob_religion,pred_shalla_2017_prob_science/astronomy,pred_shalla_2017_prob_sex/lingerie,pred_shalla_2017_prob_shopping,pred_shalla_2017_prob_webmail,pred_shalla_2017_prob_webradio
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,topshop.com,shopping,0.001026,2.059451e-06,0.003806,1.206135e-05,...,0.0004904862,0.001987,0.001928,0.00612,7.9e-05,7.045513e-05,0.000250294,0.794791,0.0003849004,2.956166e-05
1,test2,beyondrelief.com,beyondrelief.com,,beyondrelief.com,shopping,0.001183,7.219888e-06,0.001623,0.0002614287,...,0.001601024,0.112013,0.094791,0.007562,0.003159,3.263207e-05,0.0003943433,0.316539,0.001392313,0.000150173
2,test3,golf-tours.com/test,golf-tours.com,,golf-tours.com,recreation/sports,3e-06,4.937107e-08,6e-06,4.701477e-07,...,3.715636e-05,0.503158,0.494601,1.4e-05,2e-06,1.133914e-06,2.178794e-07,0.001155,3.818599e-06,4.000348e-07
3,test4,thegayhotel.com,thegayhotel.com,,thegayhotel.com,porn,4e-05,5.606255e-06,3e-06,2.719857e-08,...,1.384283e-06,0.000217,0.161528,1.9e-05,2e-06,7.682852e-09,2.824876e-05,0.000103,2.314461e-06,4.17132e-06
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,,zonasequravlabcp.com,recreation/travel,0.000367,4.334551e-05,0.00011,0.0001695504,...,0.001490313,0.096042,0.698986,0.001285,0.000272,7.290782e-05,6.6823e-05,0.030918,0.0001274286,5.110677e-05
5,test6,http://privatix.xyz,privatix.xyz,,privatix.xyz,porn,0.014903,0.001005627,0.001272,2.857855e-05,...,7.032053e-06,0.0003,0.00043,0.058998,1.2e-05,3.474888e-07,9.895959e-05,0.001085,0.0003721285,0.0003435317
6,test7,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000159,7.556627e-07,2e-05,7.004297e-07,...,1.478807e-07,5.9e-05,3.2e-05,8.9e-05,3e-06,5.140512e-09,0.0001994979,0.000343,4.979767e-07,1.416092e-06
7,test8,giftregistrylocator.com,giftregistrylocator.com,,giftregistrylocator.com,shopping,0.011014,5.074874e-07,0.000298,0.0001397032,...,0.0004225496,0.005732,0.013036,0.000809,0.120866,4.813892e-06,0.0001945478,0.681082,6.409415e-05,0.0001473038
8,test9,bangbrosonline.com,bangbrosonline.com,,bangbrosonline.com,porn,0.002722,3.04098e-06,0.000388,6.509819e-05,...,0.001147169,0.012507,0.009487,0.000503,0.00055,2.52e-05,0.0006271042,0.027426,0.0003778504,0.00032654
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,scotland-info.co.uk,recreation/travel,0.000132,2.046485e-06,3.6e-05,5.609132e-06,...,0.003264121,0.202897,0.56952,0.000689,0.000565,3.945612e-05,2.42581e-05,0.141922,6.989195e-05,3.037718e-05


### Get the Content Category from Phishtank 

In [5]:
df = phish_cat(df, domain_names='url')
df

Using cached PhishTank data from local (/root/.pydomains/phish_2017.csv.bz2)...
Loading PhishTank data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,...,pred_shalla_2017_prob_recreation/travel,pred_shalla_2017_prob_redirector,pred_shalla_2017_prob_religion,pred_shalla_2017_prob_science/astronomy,pred_shalla_2017_prob_sex/lingerie,pred_shalla_2017_prob_shopping,pred_shalla_2017_prob_webmail,pred_shalla_2017_prob_webradio,phish_2017_domain,phish_2017_cat
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,topshop.com,shopping,0.001026,2.059451e-06,0.003806,1.206135e-05,...,0.001928,0.00612,7.9e-05,7.045513e-05,0.000250294,0.794791,0.0003849004,2.956166e-05,topshop.com,
1,test2,beyondrelief.com,beyondrelief.com,,beyondrelief.com,shopping,0.001183,7.219888e-06,0.001623,0.0002614287,...,0.094791,0.007562,0.003159,3.263207e-05,0.0003943433,0.316539,0.001392313,0.000150173,beyondrelief.com,
2,test3,golf-tours.com/test,golf-tours.com,,golf-tours.com,recreation/sports,3e-06,4.937107e-08,6e-06,4.701477e-07,...,0.494601,1.4e-05,2e-06,1.133914e-06,2.178794e-07,0.001155,3.818599e-06,4.000348e-07,golf-tours.com,
3,test4,thegayhotel.com,thegayhotel.com,,thegayhotel.com,porn,4e-05,5.606255e-06,3e-06,2.719857e-08,...,0.161528,1.9e-05,2e-06,7.682852e-09,2.824876e-05,0.000103,2.314461e-06,4.17132e-06,thegayhotel.com,
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,,zonasequravlabcp.com,recreation/travel,0.000367,4.334551e-05,0.00011,0.0001695504,...,0.698986,0.001285,0.000272,7.290782e-05,6.6823e-05,0.030918,0.0001274286,5.110677e-05,zonasequravlabcp.com,yes
5,test6,http://privatix.xyz,privatix.xyz,,privatix.xyz,porn,0.014903,0.001005627,0.001272,2.857855e-05,...,0.00043,0.058998,1.2e-05,3.474888e-07,9.895959e-05,0.001085,0.0003721285,0.0003435317,privatix.xyz,yes
6,test7,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000159,7.556627e-07,2e-05,7.004297e-07,...,3.2e-05,8.9e-05,3e-06,5.140512e-09,0.0001994979,0.000343,4.979767e-07,1.416092e-06,adultfriendfinder.com,
7,test8,giftregistrylocator.com,giftregistrylocator.com,,giftregistrylocator.com,shopping,0.011014,5.074874e-07,0.000298,0.0001397032,...,0.013036,0.000809,0.120866,4.813892e-06,0.0001945478,0.681082,6.409415e-05,0.0001473038,giftregistrylocator.com,
8,test9,bangbrosonline.com,bangbrosonline.com,,bangbrosonline.com,porn,0.002722,3.04098e-06,0.000388,6.509819e-05,...,0.009487,0.000503,0.00055,2.52e-05,0.0006271042,0.027426,0.0003778504,0.00032654,bangbrosonline.com,
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,scotland-info.co.uk,recreation/travel,0.000132,2.046485e-06,3.6e-05,5.609132e-06,...,0.56952,0.000689,0.000565,3.945612e-05,2.42581e-05,0.141922,6.989195e-05,3.037718e-05,scotland-info.co.uk,


### Get the Content Category from Shallalist

In [6]:
df = shalla_cat(df, domain_names='url')
df

Using cached Shallalist data from local (/root/.pydomains/shalla_2017.csv.bz2)...
Loading Shallalist data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,...,pred_shalla_2017_prob_religion,pred_shalla_2017_prob_science/astronomy,pred_shalla_2017_prob_sex/lingerie,pred_shalla_2017_prob_shopping,pred_shalla_2017_prob_webmail,pred_shalla_2017_prob_webradio,phish_2017_domain,phish_2017_cat,shalla_2017_domain,shalla_2017_cat
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,topshop.com,shopping,0.001026,2.059451e-06,0.003806,1.206135e-05,...,7.9e-05,7.045513e-05,0.000250294,0.794791,0.0003849004,2.956166e-05,topshop.com,,topshop.com,
1,test2,beyondrelief.com,beyondrelief.com,,beyondrelief.com,shopping,0.001183,7.219888e-06,0.001623,0.0002614287,...,0.003159,3.263207e-05,0.0003943433,0.316539,0.001392313,0.000150173,beyondrelief.com,,beyondrelief.com,shopping|recreation/sports
2,test3,golf-tours.com/test,golf-tours.com,,golf-tours.com,recreation/sports,3e-06,4.937107e-08,6e-06,4.701477e-07,...,2e-06,1.133914e-06,2.178794e-07,0.001155,3.818599e-06,4.000348e-07,golf-tours.com,,golf-tours.com,recreation/sports|recreation/travel
3,test4,thegayhotel.com,thegayhotel.com,,thegayhotel.com,porn,4e-05,5.606255e-06,3e-06,2.719857e-08,...,2e-06,7.682852e-09,2.824876e-05,0.000103,2.314461e-06,4.17132e-06,thegayhotel.com,,thegayhotel.com,porn|recreation/travel
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,,zonasequravlabcp.com,recreation/travel,0.000367,4.334551e-05,0.00011,0.0001695504,...,0.000272,7.290782e-05,6.6823e-05,0.030918,0.0001274286,5.110677e-05,zonasequravlabcp.com,yes,zonasequravlabcp.com,
5,test6,http://privatix.xyz,privatix.xyz,,privatix.xyz,porn,0.014903,0.001005627,0.001272,2.857855e-05,...,1.2e-05,3.474888e-07,9.895959e-05,0.001085,0.0003721285,0.0003435317,privatix.xyz,yes,privatix.xyz,
6,test7,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000159,7.556627e-07,2e-05,7.004297e-07,...,3e-06,5.140512e-09,0.0001994979,0.000343,4.979767e-07,1.416092e-06,adultfriendfinder.com,,adultfriendfinder.com,porn|dating
7,test8,giftregistrylocator.com,giftregistrylocator.com,,giftregistrylocator.com,shopping,0.011014,5.074874e-07,0.000298,0.0001397032,...,0.120866,4.813892e-06,0.0001945478,0.681082,6.409415e-05,0.0001473038,giftregistrylocator.com,,giftregistrylocator.com,
8,test9,bangbrosonline.com,bangbrosonline.com,,bangbrosonline.com,porn,0.002722,3.04098e-06,0.000388,6.509819e-05,...,0.00055,2.52e-05,0.0006271042,0.027426,0.0003778504,0.00032654,bangbrosonline.com,,bangbrosonline.com,porn
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,scotland-info.co.uk,recreation/travel,0.000132,2.046485e-06,3.6e-05,5.609132e-06,...,0.000565,3.945612e-05,2.42581e-05,0.141922,6.989195e-05,3.037718e-05,scotland-info.co.uk,,scotland-info.co.uk,recreation/travel


### Predict Content Category Using the 2016 PhishTank Model

In [7]:
df = pred_phish(df, year=2016, domain_names='url')
df

Using cached Phishtank model data from local (/root/.pydomains/phish_cat_lstm_2016.h5)...
Using the cached Phishtank vocab data from local (/root/.pydomains/phish_cat_vocab_2016.csv)...
Loading the Phishtank model and vocab data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,...,pred_shalla_2017_prob_shopping,pred_shalla_2017_prob_webmail,pred_shalla_2017_prob_webradio,phish_2017_domain,phish_2017_cat,shalla_2017_domain,shalla_2017_cat,pred_phish_2016_domain,pred_phish_2016_lab,pred_phish_2016_prob
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,topshop.com,shopping,0.001026,2.059451e-06,0.003806,1.206135e-05,...,0.794791,0.0003849004,2.956166e-05,topshop.com,,topshop.com,,topshop.com,0,0.081151
1,test2,beyondrelief.com,beyondrelief.com,,beyondrelief.com,shopping,0.001183,7.219888e-06,0.001623,0.0002614287,...,0.316539,0.001392313,0.000150173,beyondrelief.com,,beyondrelief.com,shopping|recreation/sports,beyondrelief.com,1,0.509255
2,test3,golf-tours.com/test,golf-tours.com,,golf-tours.com,recreation/sports,3e-06,4.937107e-08,6e-06,4.701477e-07,...,0.001155,3.818599e-06,4.000348e-07,golf-tours.com,,golf-tours.com,recreation/sports|recreation/travel,golf-tours.com,0,0.323873
3,test4,thegayhotel.com,thegayhotel.com,,thegayhotel.com,porn,4e-05,5.606255e-06,3e-06,2.719857e-08,...,0.000103,2.314461e-06,4.17132e-06,thegayhotel.com,,thegayhotel.com,porn|recreation/travel,thegayhotel.com,0,0.132156
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,,zonasequravlabcp.com,recreation/travel,0.000367,4.334551e-05,0.00011,0.0001695504,...,0.030918,0.0001274286,5.110677e-05,zonasequravlabcp.com,yes,zonasequravlabcp.com,,zonasequravlabcp.com,1,0.890309
5,test6,http://privatix.xyz,privatix.xyz,,privatix.xyz,porn,0.014903,0.001005627,0.001272,2.857855e-05,...,0.001085,0.0003721285,0.0003435317,privatix.xyz,yes,privatix.xyz,,privatix.xyz,1,0.603823
6,test7,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000159,7.556627e-07,2e-05,7.004297e-07,...,0.000343,4.979767e-07,1.416092e-06,adultfriendfinder.com,,adultfriendfinder.com,porn|dating,adultfriendfinder.com,0,0.470908
7,test8,giftregistrylocator.com,giftregistrylocator.com,,giftregistrylocator.com,shopping,0.011014,5.074874e-07,0.000298,0.0001397032,...,0.681082,6.409415e-05,0.0001473038,giftregistrylocator.com,,giftregistrylocator.com,,giftregistrylocator.com,1,0.939195
8,test9,bangbrosonline.com,bangbrosonline.com,,bangbrosonline.com,porn,0.002722,3.04098e-06,0.000388,6.509819e-05,...,0.027426,0.0003778504,0.00032654,bangbrosonline.com,,bangbrosonline.com,porn,bangbrosonline.com,0,0.217134
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,scotland-info.co.uk,recreation/travel,0.000132,2.046485e-06,3.6e-05,5.609132e-06,...,0.141922,6.989195e-05,3.037718e-05,scotland-info.co.uk,,scotland-info.co.uk,recreation/travel,scotland-info.co.uk,1,0.635387


### Predict Content Category Using the 2017 PhishTank Model

In [8]:
df = pred_phish(df, year=2017, domain_names='url')
df

Using cached Phishtank model data from local (/root/.pydomains/phish_cat_lstm_2017.h5)...
Using the cached Phishtank vocab data from local (/root/.pydomains/phish_cat_vocab_2017.csv)...
Loading the Phishtank model and vocab data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,...,phish_2017_domain,phish_2017_cat,shalla_2017_domain,shalla_2017_cat,pred_phish_2016_domain,pred_phish_2016_lab,pred_phish_2016_prob,pred_phish_2017_domain,pred_phish_2017_lab,pred_phish_2017_prob
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,topshop.com,shopping,0.001026,2.059451e-06,0.003806,1.206135e-05,...,topshop.com,,topshop.com,,topshop.com,0,0.081151,topshop.com,0,0.088315
1,test2,beyondrelief.com,beyondrelief.com,,beyondrelief.com,shopping,0.001183,7.219888e-06,0.001623,0.0002614287,...,beyondrelief.com,,beyondrelief.com,shopping|recreation/sports,beyondrelief.com,1,0.509255,beyondrelief.com,0,0.436622
2,test3,golf-tours.com/test,golf-tours.com,,golf-tours.com,recreation/sports,3e-06,4.937107e-08,6e-06,4.701477e-07,...,golf-tours.com,,golf-tours.com,recreation/sports|recreation/travel,golf-tours.com,0,0.323873,golf-tours.com,0,0.369796
3,test4,thegayhotel.com,thegayhotel.com,,thegayhotel.com,porn,4e-05,5.606255e-06,3e-06,2.719857e-08,...,thegayhotel.com,,thegayhotel.com,porn|recreation/travel,thegayhotel.com,0,0.132156,thegayhotel.com,0,0.326386
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,,zonasequravlabcp.com,recreation/travel,0.000367,4.334551e-05,0.00011,0.0001695504,...,zonasequravlabcp.com,yes,zonasequravlabcp.com,,zonasequravlabcp.com,1,0.890309,zonasequravlabcp.com,1,0.998016
5,test6,http://privatix.xyz,privatix.xyz,,privatix.xyz,porn,0.014903,0.001005627,0.001272,2.857855e-05,...,privatix.xyz,yes,privatix.xyz,,privatix.xyz,1,0.603823,privatix.xyz,0,0.485223
6,test7,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000159,7.556627e-07,2e-05,7.004297e-07,...,adultfriendfinder.com,,adultfriendfinder.com,porn|dating,adultfriendfinder.com,0,0.470908,adultfriendfinder.com,0,0.474351
7,test8,giftregistrylocator.com,giftregistrylocator.com,,giftregistrylocator.com,shopping,0.011014,5.074874e-07,0.000298,0.0001397032,...,giftregistrylocator.com,,giftregistrylocator.com,,giftregistrylocator.com,1,0.939195,giftregistrylocator.com,1,0.860993
8,test9,bangbrosonline.com,bangbrosonline.com,,bangbrosonline.com,porn,0.002722,3.04098e-06,0.000388,6.509819e-05,...,bangbrosonline.com,,bangbrosonline.com,porn,bangbrosonline.com,0,0.217134,bangbrosonline.com,0,0.404384
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,scotland-info.co.uk,recreation/travel,0.000132,2.046485e-06,3.6e-05,5.609132e-06,...,scotland-info.co.uk,,scotland-info.co.uk,recreation/travel,scotland-info.co.uk,1,0.635387,scotland-info.co.uk,1,0.69798


### Predict Content Category Using the Malware Model

In [9]:
df = pred_malware(df, domain_names='url')
df

Using cached Malware model data from local (/root/.pydomains/malware_cat_lstm_2017.h5)...
Using cached Malware vocab data from local (/root/.pydomains/malware_cat_vocab_2017.csv)...
Loading Malware model and vocab data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,...,shalla_2017_cat,pred_phish_2016_domain,pred_phish_2016_lab,pred_phish_2016_prob,pred_phish_2017_domain,pred_phish_2017_lab,pred_phish_2017_prob,pred_malware_2017_domain,pred_malware_2017_lab,pred_malware_2017_prob
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,topshop.com,shopping,0.001026,2.059451e-06,0.003806,1.206135e-05,...,,topshop.com,0,0.081151,topshop.com,0,0.088315,topshop.com,0,0.161202
1,test2,beyondrelief.com,beyondrelief.com,,beyondrelief.com,shopping,0.001183,7.219888e-06,0.001623,0.0002614287,...,shopping|recreation/sports,beyondrelief.com,1,0.509255,beyondrelief.com,0,0.436622,beyondrelief.com,0,0.246524
2,test3,golf-tours.com/test,golf-tours.com,,golf-tours.com,recreation/sports,3e-06,4.937107e-08,6e-06,4.701477e-07,...,recreation/sports|recreation/travel,golf-tours.com,0,0.323873,golf-tours.com,0,0.369796,golf-tours.com,0,0.141004
3,test4,thegayhotel.com,thegayhotel.com,,thegayhotel.com,porn,4e-05,5.606255e-06,3e-06,2.719857e-08,...,porn|recreation/travel,thegayhotel.com,0,0.132156,thegayhotel.com,0,0.326386,thegayhotel.com,0,0.306572
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,,zonasequravlabcp.com,recreation/travel,0.000367,4.334551e-05,0.00011,0.0001695504,...,,zonasequravlabcp.com,1,0.890309,zonasequravlabcp.com,1,0.998016,zonasequravlabcp.com,1,0.986355
5,test6,http://privatix.xyz,privatix.xyz,,privatix.xyz,porn,0.014903,0.001005627,0.001272,2.857855e-05,...,,privatix.xyz,1,0.603823,privatix.xyz,0,0.485223,privatix.xyz,0,0.233505
6,test7,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000159,7.556627e-07,2e-05,7.004297e-07,...,porn|dating,adultfriendfinder.com,0,0.470908,adultfriendfinder.com,0,0.474351,adultfriendfinder.com,0,0.103906
7,test8,giftregistrylocator.com,giftregistrylocator.com,,giftregistrylocator.com,shopping,0.011014,5.074874e-07,0.000298,0.0001397032,...,,giftregistrylocator.com,1,0.939195,giftregistrylocator.com,1,0.860993,giftregistrylocator.com,1,0.994493
8,test9,bangbrosonline.com,bangbrosonline.com,,bangbrosonline.com,porn,0.002722,3.04098e-06,0.000388,6.509819e-05,...,porn,bangbrosonline.com,0,0.217134,bangbrosonline.com,0,0.404384,bangbrosonline.com,0,0.114192
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,scotland-info.co.uk,recreation/travel,0.000132,2.046485e-06,3.6e-05,5.609132e-06,...,recreation/travel,scotland-info.co.uk,1,0.635387,scotland-info.co.uk,1,0.69798,scotland-info.co.uk,1,0.723313


### Predict Content Category Using the Toulouse Model

In [10]:
df = pred_toulouse(df, domain_names='url')
df

Using cached Toulouse model data from local (/root/.pydomains/toulouse_cat_lstm_others_2017.h5)...
Using cached Toulouse vocab data from local (/root/.pydomains/toulouse_cat_vocab_others_2017.csv)...
Using cached Toulouse names data from local (/root/.pydomains/toulouse_cat_names_others_2017.csv)...
Loading Toulouse model, vocab and names data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,...,pred_toulouse_2017_lab,pred_toulouse_2017_prob_adult,pred_toulouse_2017_prob_bank,pred_toulouse_2017_prob_gambling,pred_toulouse_2017_prob_games,pred_toulouse_2017_prob_malware,pred_toulouse_2017_prob_others,pred_toulouse_2017_prob_phishing,pred_toulouse_2017_prob_press,pred_toulouse_2017_prob_shopping
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,topshop.com,shopping,0.001026,2.059451e-06,0.003806,1.206135e-05,...,shopping,0.271566,0.001456542,0.001363023,0.008374,0.008257,0.031519,0.160733,0.001497,0.515235
1,test2,beyondrelief.com,beyondrelief.com,,beyondrelief.com,shopping,0.001183,7.219888e-06,0.001623,0.0002614287,...,adult,0.728981,0.001523621,0.0004605205,0.009057,0.006343,0.024769,0.100508,0.002152,0.126206
2,test3,golf-tours.com/test,golf-tours.com,,golf-tours.com,recreation/sports,3e-06,4.937107e-08,6e-06,4.701477e-07,...,shopping,0.395231,0.00271949,0.0005266654,0.024734,0.007197,0.037336,0.095911,0.006155,0.43019
3,test4,thegayhotel.com,thegayhotel.com,,thegayhotel.com,porn,4e-05,5.606255e-06,3e-06,2.719857e-08,...,adult,0.975226,8.760451e-06,3.409255e-05,0.000253,0.000281,0.002514,0.019806,0.000102,0.001776
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,,zonasequravlabcp.com,recreation/travel,0.000367,4.334551e-05,0.00011,0.0001695504,...,phishing,0.076784,0.0005248904,0.0001624237,0.003851,0.031768,0.014402,0.838332,0.000622,0.033553
5,test6,http://privatix.xyz,privatix.xyz,,privatix.xyz,porn,0.014903,0.001005627,0.001272,2.857855e-05,...,adult,0.995279,8.919195e-06,2.852488e-07,5.4e-05,0.001001,0.003326,0.000294,3e-06,3.3e-05
6,test7,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000159,7.556627e-07,2e-05,7.004297e-07,...,adult,0.978048,9.402128e-07,1.829964e-05,0.000121,7.3e-05,0.001129,0.02039,2.1e-05,0.000197
7,test8,giftregistrylocator.com,giftregistrylocator.com,,giftregistrylocator.com,shopping,0.011014,5.074874e-07,0.000298,0.0001397032,...,shopping,0.049579,0.0004880234,0.0001580799,0.012057,0.008287,0.026848,0.051026,0.004657,0.8469
8,test9,bangbrosonline.com,bangbrosonline.com,,bangbrosonline.com,porn,0.002722,3.04098e-06,0.000388,6.509819e-05,...,adult,0.973677,8.173973e-05,0.0001299159,0.000951,0.000566,0.007519,0.008231,0.000531,0.008313
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,scotland-info.co.uk,recreation/travel,0.000132,2.046485e-06,3.6e-05,5.609132e-06,...,shopping,0.361648,0.0005405789,0.0001374729,0.010349,0.002892,0.030299,0.022747,0.000717,0.570668


In [11]:
df = pred_toulouse(df, domain_names='url')
df

Using cached Toulouse model data from local (/root/.pydomains/toulouse_cat_lstm_others_2017.h5)...
Using cached Toulouse vocab data from local (/root/.pydomains/toulouse_cat_vocab_others_2017.csv)...
Using cached Toulouse names data from local (/root/.pydomains/toulouse_cat_names_others_2017.csv)...
Loading Toulouse model, vocab and names data file...


Unnamed: 0,label,url,dmoz_2016_domain,dmoz_2016_cat,pred_shalla_2017_domain,pred_shalla_2017_lab,pred_shalla_2017_prob_adv,pred_shalla_2017_prob_anonvpn,pred_shalla_2017_prob_downloads,pred_shalla_2017_prob_dynamic,...,pred_toulouse_2017_prob_shopping,pred_toulouse_2017_prob_adult,pred_toulouse_2017_prob_bank,pred_toulouse_2017_prob_gambling,pred_toulouse_2017_prob_games,pred_toulouse_2017_prob_malware,pred_toulouse_2017_prob_others,pred_toulouse_2017_prob_phishing,pred_toulouse_2017_prob_press,pred_toulouse_2017_prob_shopping.1
0,test1,topshop.com,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,topshop.com,shopping,0.001026,2.059451e-06,0.003806,1.206135e-05,...,0.515235,0.271566,0.001456542,0.001363023,0.008374,0.008257,0.031519,0.160733,0.001497,0.515235
1,test2,beyondrelief.com,beyondrelief.com,,beyondrelief.com,shopping,0.001183,7.219888e-06,0.001623,0.0002614287,...,0.126206,0.728981,0.001523621,0.0004605205,0.009057,0.006343,0.024769,0.100508,0.002152,0.126206
2,test3,golf-tours.com/test,golf-tours.com,,golf-tours.com,recreation/sports,3e-06,4.937107e-08,6e-06,4.701477e-07,...,0.43019,0.395231,0.00271949,0.0005266654,0.024734,0.007197,0.037336,0.095911,0.006155,0.43019
3,test4,thegayhotel.com,thegayhotel.com,,thegayhotel.com,porn,4e-05,5.606255e-06,3e-06,2.719857e-08,...,0.001776,0.975226,8.760451e-06,3.409255e-05,0.000253,0.000281,0.002514,0.019806,0.000102,0.001776
4,test5,https://zonasequravlabcp.com/bcp/,zonasequravlabcp.com,,zonasequravlabcp.com,recreation/travel,0.000367,4.334551e-05,0.00011,0.0001695504,...,0.033553,0.076784,0.0005248904,0.0001624237,0.003851,0.031768,0.014402,0.838332,0.000622,0.033553
5,test6,http://privatix.xyz,privatix.xyz,,privatix.xyz,porn,0.014903,0.001005627,0.001272,2.857855e-05,...,3.3e-05,0.995279,8.919195e-06,2.852488e-07,5.4e-05,0.001001,0.003326,0.000294,3e-06,3.3e-05
6,test7,adultfriendfinder.com,adultfriendfinder.com,,adultfriendfinder.com,porn,0.000159,7.556627e-07,2e-05,7.004297e-07,...,0.000197,0.978048,9.402128e-07,1.829964e-05,0.000121,7.3e-05,0.001129,0.02039,2.1e-05,0.000197
7,test8,giftregistrylocator.com,giftregistrylocator.com,,giftregistrylocator.com,shopping,0.011014,5.074874e-07,0.000298,0.0001397032,...,0.8469,0.049579,0.0004880234,0.0001580799,0.012057,0.008287,0.026848,0.051026,0.004657,0.8469
8,test9,bangbrosonline.com,bangbrosonline.com,,bangbrosonline.com,porn,0.002722,3.04098e-06,0.000388,6.509819e-05,...,0.008313,0.973677,8.173973e-05,0.0001299159,0.000951,0.000566,0.007519,0.008231,0.000531,0.008313
9,test10,scotland-info.co.uk,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,scotland-info.co.uk,recreation/travel,0.000132,2.046485e-06,3.6e-05,5.609132e-06,...,0.570668,0.361648,0.0005405789,0.0001374729,0.010349,0.002892,0.030299,0.022747,0.000717,0.570668


In [12]:
df[['url', 'dmoz_2016_cat', 'shalla_2017_cat', 'phish_2017_cat', 'pred_shalla_2017_lab', 'pred_phish_2016_lab', 'pred_phish_2017_lab', 'pred_malware_2017_lab', 'pred_toulouse_2017_lab']]

Unnamed: 0,url,dmoz_2016_cat,shalla_2017_cat,phish_2017_cat,pred_shalla_2017_lab,pred_phish_2016_lab,pred_phish_2017_lab,pred_malware_2017_lab,pred_toulouse_2017_lab
0,topshop.com,Top/Regional/Europe/United_Kingdom/Business_an...,,,shopping,0,0,0,shopping
1,beyondrelief.com,,shopping|recreation/sports,,shopping,1,0,0,adult
2,golf-tours.com/test,,recreation/sports|recreation/travel,,recreation/sports,0,0,0,shopping
3,thegayhotel.com,,porn|recreation/travel,,porn,0,0,0,adult
4,https://zonasequravlabcp.com/bcp/,,,yes,recreation/travel,1,1,1,phishing
5,http://privatix.xyz,,,yes,porn,1,0,0,adult
6,adultfriendfinder.com,,porn|dating,,porn,0,0,0,adult
7,giftregistrylocator.com,,,,shopping,1,1,1,shopping
8,bangbrosonline.com,,porn,,porn,0,0,0,adult
9,scotland-info.co.uk,Top/Regional/Europe/United_Kingdom/Scotland/Tr...,recreation/travel,,recreation/travel,1,1,1,shopping
