# URL Preprocessing

We prepare a list of URLs that should be accessed during the web crawl. The URLs are extracted from the Alexa top 1 million websites collection.

## Data Preparation

In [0]:
import pandas as pd

#import Alexa top 1 million websites from csv file (semicolon-separated, no header row)
#a forward slash is used as path separator so that the relative path resolves on every
#operating system, consistent with the other relative data paths in this notebook
websites_df = pd.read_csv("data/top-1million-sites_2.csv", sep=";", header=None)

#rename columns of data frame
websites_df.columns = ["index", "url"]

#print properties of data frame (shape and first rows)
print(websites_df.shape, "\n")
print(websites_df.head())

In [0]:
#boolean mask marking every row whose url ends with ".de"
is_de_url = websites_df["url"].str.endswith(".de")

#new data frame holding only the ".de"-website rows
de_websites_df = websites_df[is_de_url]

#write the ".de"-website rows to a separate csv file (row index is not written)
de_websites_df.to_csv("data/websites_de.csv", index=False)

## Import Data Frame with selected URLs

In [0]:
import pandas as pd

#load the csv file containing the ".de"-websites into a data frame
de_websites_df = pd.read_csv("data/websites_de.csv", sep=",")

#print properties of data frame (number of observations in total, first ten rows)
n_preview_rows = 10
print(de_websites_df.shape)
print(de_websites_df.head(n_preview_rows))

(23809, 2)
   index                    url
0     23              google.de
1     83              amazon.de
2    118                ebay.de
3    312  ebay-kleinanzeigen.de
4    351                 web.de
5    427             spiegel.de
6    513            t-online.de
7    593                bild.de
8    625                chip.de
9    691              mobile.de


## Filtering Websites

### Remove Domains that are present on Filter Lists

#### Blacklist

In [0]:
#import os package for folder/file structure functionalities
import os
#import re package for regular expressions functionalities
import re

#specify root directory of the filter lists
#NOTE: this is an absolute, machine-specific path - adjust it to the local location of the lists
root_dir = 'C:/Users/timpe_000/Desktop/IT_Security/fitler_lists/blacklists_ut1'

#create a list of all entries (category subdirectories) that are in the filter list root directory
blacklist_categories = os.listdir(root_dir)

#create empty list as container for all domains from the filter lists (one sublist per category)
all_domains = []

#loop through all subdirectories in the filter list root directory and access files containing the domains to be indexed
for category in blacklist_categories:
    #build the path of the "domains" file inside the category folder
    file_dir = root_dir + '/' + category + '/domains'

    #read all lines of the domains file
    #the with-statement closes the file handle as soon as the content has been read
    with open(file_dir) as domains_file:
        domains = domains_file.readlines()

    #delete all line break characters from the file content
    cleaned_list = [re.sub(r'\n', '', x) for x in domains]

    #add the domains of this category as one sublist
    all_domains.append(cleaned_list)

In [0]:
#preview: the first five domains of the first sublist in the indexed domains list
first_category_domains = all_domains[0]
first_category_domains[:5]

['--little--princess--.tumblr.com',
 '-allporn-.tumblr.com',
 '-becca-anal-.tumblr.com',
 '-celestial-beings-.tumblr.com',
 '-cocks.tumblr.com']

#### Shalla List

In [0]:
#repeat extraction of indexed lists with filter lists from shallalist
#NOTE: this is an absolute, machine-specific path - adjust it to the local location of the lists
root_dir = 'C:/Users/timpe_000/Desktop/IT_Security/fitler_lists/shallalist/BL'

#all entries (category subdirectories) of the shallalist root directory
shallalist_categories = os.listdir(root_dir)

#container for all shallalist domains (one sublist per category)
all_domains_shallalist = []

for category in shallalist_categories:
    #build the path of the "domains" file inside the category folder
    file_dir = root_dir + '/' + category + '/domains'

    #read all lines of the domains file
    #the with-statement closes the file handle as soon as the content has been read
    with open(file_dir) as domains_file:
        domains = domains_file.readlines()

    #delete all line break characters from the file content
    cleaned_list = [re.sub(r'\n', '', x) for x in domains]

    #add the domains of this category as one sublist
    all_domains_shallalist.append(cleaned_list)

In [0]:
#preview: the first five domains of the first sublist in the shallalist domains list
first_shallalist_domains = all_domains_shallalist[0]
first_shallalist_domains[:5]

['000freexxx.com',
 '004.frnl.de',
 '01sexe.com',
 '01viral.com',
 '039068a.dialer-select.com']

#### Custom Buzzword List

In [0]:
#import custom list with words to be filtered out
#the with-statement closes the file handle as soon as the content has been read
with open(r'C:/Users/timpe_000/Desktop/IT_Security/fitler_lists/blacklist_words.txt') as custom_file:
    custom_list = custom_file.readlines()

#remove line breaks and surrounding whitespace from every entry
custom_list = [word.strip() for word in custom_list]

#drop blank entries: the empty string is a substring of every url, so a blank line
#in the file would make the substring-based buzzword filter remove all websites
custom_list = [word for word in custom_list if word]

#remove duplicates in custom list
custom_list = list(set(custom_list))

### Merge Sublists

In [0]:
#turn all lists into single domain list

#import package itertools, providing extended list functionalities
import itertools

#convert each list of per-category sublists into one flat list containing all domains of that filter source
merged_blacklist = list(itertools.chain.from_iterable(all_domains))
merged_shallalist = list(itertools.chain.from_iterable(all_domains_shallalist))

#print number of domains of the 3 different lists
for filter_list in (merged_blacklist, merged_shallalist, custom_list):
    print(len(filter_list))

2474878
1285799
507


### Keep only URLs with '.de'-Ending in Filter Lists

In [0]:
#only filter entries ending with ".de" can match the ".de"-urls,
#so all other entries are dropped to reduce the total number of filter elements

###Blacklist
#wrap the merged blacklist in a data frame with a single column named "url"
merged_blacklist = pd.DataFrame(merged_blacklist)
merged_blacklist.columns = ["url"]

#select the entries ending with ".de" and turn them back into a list
blacklist_is_de = merged_blacklist["url"].str.endswith(".de")
de_blacklist = merged_blacklist.loc[blacklist_is_de, "url"].tolist()

###Shallalist
#wrap the merged shallalist in a data frame with a single column named "url"
merged_shallalist = pd.DataFrame(merged_shallalist)
merged_shallalist.columns = ["url"]

#select the entries ending with ".de" and turn them back into a list
shallalist_is_de = merged_shallalist["url"].str.endswith(".de")
de_shallalist = merged_shallalist.loc[shallalist_is_de, "url"].tolist()

### Filter URLs for Web Crawl

#### Indexing Lists

In [0]:
#list of all ".de"-urls from the alexa top one million websites data frame
de_websites_list = de_websites_df["url"].tolist()

#urls that are contained in neither of the two filter lists
#(a set has no ordering, so it only serves for the membership test below)
allowed_urls = set(de_websites_list) - set(de_blacklist) - set(de_shallalist)

#select the allowed rows from the unfiltered data frame, which keeps the original ordering of domains
url_is_allowed = de_websites_df["url"].isin(list(allowed_urls))
filtered_de_websites_df = de_websites_df[url_is_allowed]

#filtered urls as a list, in data frame order
filtered_de_websites_list = filtered_de_websites_df["url"].tolist()

In [0]:
#show first rows and shape of the unfiltered and of the filtered data frame
for frame in (de_websites_df, filtered_de_websites_df):
    print(frame.head())
    print(frame.shape)

#show the first ten filtered urls
print(filtered_de_websites_list[:10])

   index                    url
0     23              google.de
1     83              amazon.de
2    118                ebay.de
3    312  ebay-kleinanzeigen.de
4    351                 web.de
(23809, 2)
    index                   url
14   1349  immobilienscout24.de
19   1520               bahn.de
20   1570                dhl.de
25   1993       deref-web-02.de
26   2095            linguee.de
(21092, 2)
['immobilienscout24.de', 'bahn.de', 'dhl.de', 'deref-web-02.de', 'linguee.de', 'duden.de', 'chefkoch.de', 'check24.de', 'definicion.de', 'mydealz.de']


#### Buzzword List

In [0]:
#keep only the domains that contain none of the buzzwords from the custom list as a substring
filtered_de_websites_list = [
    url for url in filtered_de_websites_list
    if not any(buzzword in url for buzzword in custom_list)
]

#print number of elements after filtering with custom list, followed by the first ten entries
print(len(filtered_de_websites_list))
print(filtered_de_websites_list[:10])

20523
['immobilienscout24.de', 'bahn.de', 'dhl.de', 'deref-web-02.de', 'linguee.de', 'duden.de', 'chefkoch.de', 'check24.de', 'definicion.de', 'mydealz.de']


In [0]:
#reduce the data frame to the urls that survived the buzzword filter,
#then rebuild the url list from the data frame so that both share the same ordering
survived_buzzword_filter = filtered_de_websites_df["url"].isin(filtered_de_websites_list)
filtered_de_websites_df = filtered_de_websites_df[survived_buzzword_filter]
filtered_de_websites_list = filtered_de_websites_df["url"].tolist()

In [0]:
#compare size and first ten entries of the unfiltered and the filtered url list
for url_list in (de_websites_list, filtered_de_websites_list):
    print(len(url_list))
    print(url_list[:10])

23809
['google.de', 'amazon.de', 'ebay.de', 'ebay-kleinanzeigen.de', 'web.de', 'spiegel.de', 't-online.de', 'bild.de', 'chip.de', 'mobile.de']
20523
['immobilienscout24.de', 'bahn.de', 'dhl.de', 'deref-web-02.de', 'linguee.de', 'duden.de', 'chefkoch.de', 'check24.de', 'definicion.de', 'mydealz.de']


### Finalize URL-List for Crawling

In [0]:
#adding "http://" to all websites
#entries that already carry the prefix are left unchanged, so running this cell
#a second time does not produce "http://http://..." urls
filtered_de_websites_list = [
    s if s.startswith('http://') else 'http://' + s
    for s in filtered_de_websites_list
]

#export final filtered list containing ".de"-websites to a csv file
#(one url per line, written without header and without row index)
filtered_de_websites_df = pd.DataFrame(filtered_de_websites_list)
filtered_de_websites_df[0].to_csv(r'data/filtered_websites_de_all.csv', index=False, header=False)