In [None]:
import pandas as pd
from tld import get_tld
import seaborn as sns
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
from urllib.parse import urlparse
import re
import tldextract

In [None]:
# Crawl result used by every cell below; later cells read its `url` and `from` columns.
T3N_LINKS_CSV = 't3n_url_spiderall_domains.csv'
t3n = pd.read_csv(T3N_LINKS_CSV)

In [None]:
# Add a `tld` column holding the top-level domain of every `url`.
# fail_silently=True makes get_tld return None instead of raising for URLs
# it cannot parse, so such rows simply carry no TLD.
t3n['tld'] = t3n.url.map(lambda url: get_tld(url, fail_silently=True))

# Frequency table of TLDs; computed once and reused for the plot order and the cell output.
tld_counts = t3n['tld'].value_counts()

# Plot only the 5 most frequent TLDs.
# Keep the Axes itself in `ax`: chaining `.set_title(...)` directly onto
# countplot() would bind the Text object that set_title returns instead.
ax = sns.countplot(x='tld', data=t3n, order=tld_counts.iloc[:5].index)
ax.set_title('Count of 5 most occuring high level domains of URLs')

fig = ax.get_figure()
fig.savefig('plots/tld_t3n.png')

# Last expression of the cell, so the full TLD frequency table is displayed.
tld_counts

In [None]:
# Number of rows per distinct `from` value; plotted below as the number of
# outgoing links per page.
outgoing_per_page = t3n['from'].value_counts()

# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here because the suggested replacements change the default y-axis.
ax = sns.distplot(outgoing_per_page)
ax.set_xlabel('#outgoing links')
ax.set_title('Distribution of outgoing links per page')

fig = ax.get_figure()
fig.savefig('plots/outgoing_links_t3n.png')

# Cell output: the per-page counts themselves.
outgoing_per_page

In [None]:
# Number of rows per distinct `url` value; plotted below as the number of
# incoming links per page.
incoming_per_url = t3n['url'].value_counts()

# Restrict the plot to URLs that occur more than 10 times.
frequently_linked = incoming_per_url[incoming_per_url > 10]

ax = sns.distplot(frequently_linked)
ax.set_xlabel('#incoming links')
ax.set_title('Distribution of incoming links per page')

fig = ax.get_figure()
fig.savefig('plots/incoming_links_t3n.png')

# Cell output: the unfiltered per-URL counts.
incoming_per_url

In [None]:
# A link target is "internal" when its host is t3n.de or a subdomain of it.
#
# The pattern is a raw string (no invalid `\.` escapes) with non-capturing
# groups, and it is applied with Series.str.match, which anchors at the start
# of the URL. Together with restricting the optional subdomain part to
# non-separator characters and requiring a host boundary after "t3n.de", this
# keeps the following from being counted as internal:
#   * t3n.de appearing only inside another site's path or query string
#   * hosts that merely start with "t3n.de", such as t3n.dev
T3N_HOST_PATTERN = r'\s*(?:[a-z][a-z0-9+.-]*:)?//(?:[^/?#\s]*\.)?t3n\.de(?=[:/?#\s]|$)'

# case=False: host names are case-insensitive.
# na=False: missing / non-string URLs count as "no match", so the mask is
# strictly boolean and can safely be negated with `~`.
is_internal = t3n['url'].str.match(T3N_HOST_PATTERN, case=False, na=False)

# .count() ignores missing URLs, so they are counted in neither bucket.
num_internal_urls = t3n.loc[is_internal, 'url'].count()
num_external_urls = t3n.loc[~is_internal, 'url'].count()

In [None]:
# Two-bar comparison of internal (t3n) vs. external link targets.
bar_positions = [0, 1]
bar_heights = [num_internal_urls, num_external_urls]
bar_labels = ['# t3n URLs', '# external URLs']

plt.bar(bar_positions, height=bar_heights, color=['red', 'blue'])
plt.xticks(bar_positions, bar_labels)
plt.title('Number of outgoing internal vs. external URLs')
plt.savefig('plots/distribution_internal_external_links_t3n.png')

In [None]:
# Only domain pairs that occur MORE than this many times become graph edges.
MIN_EDGE_WEIGHT = 10


def _domain_of(url):
    """Return the `domain` part that tldextract extracts from `url`."""
    return tldextract.extract(url).domain


# Reduce every row to a (target domain, source domain) pair and count how often
# each pair occurs; that count is the edge weight.
graph = (
    pd.DataFrame({
        'to': t3n['url'].map(_domain_of),
        'from': t3n['from'].map(_domain_of),
    })
    .groupby(['to', 'from'])
    .size()
    .reset_index(name='weight')
)

# Filter once on the whole column instead of testing each row inside a loop.
heavy_edges = graph[graph['weight'] > MIN_EDGE_WEIGHT]

# Build the directed graph a single time. The original notebook repeated this
# construction and the export verbatim in a second cell.
G = nx.DiGraph()
edge_rows = zip(
    heavy_edges['from'].tolist(),
    heavy_edges['to'].tolist(),
    # .tolist() yields plain Python ints for the weights.
    heavy_edges['weight'].tolist(),
)
for source, target, weight in edge_rows:
    G.add_edge(source, target, weight=weight)

nx.write_gexf(G, '05_pos-mat.gexf')

In [None]:
# nx.draw falls back to a randomly initialised spring layout when no positions
# are given, so the saved image would differ on every run. Computing the same
# layout explicitly with a fixed seed makes the figure reproducible.
LAYOUT_SEED = 42
node_positions = nx.spring_layout(G, seed=LAYOUT_SEED)

nx.draw(G, pos=node_positions, with_labels=True)
plt.title('Sites > 10 Outgoing t3n Links')
plt.savefig("plots/graph_t3n.png", format="PNG")