<a href="https://colab.research.google.com/github/wingmenonlinemarketing/iv_colab/blob/master/Interne_Verlinkung_optimieren.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Interne Verlinkung optimieren

Notebook zum Webinar "pragmatische Optimierung der internen Verlinkung" von johan.huelsen@wngmn.de für audisto.com

Im Webinar sind wir verschiedene Ansätze zur Optimierung der internen Verlinkung durchgegangen. Als Beispiel haben wir einen Crawl von web.dev genutzt.

# Initialisieren

## Notwendige Libraries laden

In [None]:
# Non-Standard Libraries installieren
!pip install tldextract
!pip install swifter 

import pandas as pd # Für Datenverarbeitung
import networkx as nx # Für PageRank
from google.colab import files # Für Dateiupload
import io # Für Dateiupload
import swifter # Zum Beschleunigen mancher Pandas-Funktionen
import tldextract # Zur Extraction von Domains aus URLs
import numpy as np # Für Logarithmus-Kram bei Backlinks

Collecting tldextract
[?25l  Downloading https://files.pythonhosted.org/packages/12/cf/d0ff82625e53bd245d6173ce6333d190abbfcd94e4c30e54b4e16b474216/tldextract-2.2.3-py2.py3-none-any.whl (48kB)
[K     |██████▊                         | 10kB 15.3MB/s eta 0:00:01[K     |█████████████▍                  | 20kB 1.7MB/s eta 0:00:01[K     |████████████████████            | 30kB 2.1MB/s eta 0:00:01[K     |██████████████████████████▊     | 40kB 2.5MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 1.6MB/s 
[?25hCollecting requests-file>=1.4
  Downloading https://files.pythonhosted.org/packages/77/86/cdb5e8eaed90796aa83a6d9f75cfbd37af553c47a291cd47bc410ef9bdb2/requests_file-1.5.1-py2.py3-none-any.whl
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-1.5.1 tldextract-2.2.3
Collecting swifter
  Downloading https://files.pythonhosted.org/packages/7c/07/ad5bda46a33f1d869eaf71f431615111f251adfbceb4e3a02d66ad402582/swifter-1.0.3-py3-no

## ScreamingFrog Exporte hochladen

In [None]:
# Upload files (bei größeren Datenmengen als .gz hochladen)
print('Upload: all_outlinks.csv, redirects.csv, internal_all.csv')
uploaded = files.upload()

Upload: all_outlinks.csv, redirects.csv, internal_all.csv


Saving redirects.csv to redirects.csv
Saving all_outlinks.csv to all_outlinks.csv
Saving backlinks.csv to backlinks.csv
Saving internal_all.csv to internal_all.csv


## Variablen definieren

In [None]:
# Für welche Domain sollen Links als interne Links gewertet werden?
domain = 'web.dev'

# Wie sollen Link-Positionen gewichtet werden (beim Droppen von Duplikaten werden wichtige Links (und deren Linktexte) beibehalten und weniger wichtige gelöscht)

element_weights = {
    'Header': 0.05,
    'Navigation': 0.1,
    'Footer': 0.05,
    'Content': 1,
    'Aside': 0.5
}

## Funktionen initialisieren

In [None]:
# main functions
check_internal = lambda x: '.'.join(tldextract.extract(x)[1:3]) == domain

# because we will reload the files a lot
def read_files():
  docs = read_file('internal_all.csv')
  docs = docs[docs['Address'].str.endswith('/index.json')==False] # Komische URLs im Crawl

  canonicals = docs[['Address','Canonical Link Element 1', 'Redirect URL']]
  canonicals['Final Address'] = canonicals['Canonical Link Element 1'].fillna(canonicals['Redirect URL'])
  canonicals = canonicals[canonicals['Final Address'] == canonicals['Final Address']] #Final Address given
  canonicals = canonicals[canonicals['Address'] != canonicals['Final Address']] # Final Address not Address
  canonicals = canonicals.drop(columns=['Canonical Link Element 1', 'Redirect URL'])
  
  redirects = read_file('redirects.csv')[['Address', 'Final Address']]

  links = read_file('all_outlinks.csv')
  links = normalize_links(
      links,
      redirects=redirects,
      canonicals=canonicals)

  return links, docs


def read_file(filename):
  '''read uploaded file to variable'''
  return pd.read_csv(io.BytesIO(uploaded[filename]))

def simulate_pagerank(links, personalization=None, weight=None):
  """To include Backlink values for weighting of nodes use a dictionary with a
personalization score for all URLs between 0 and 1 you could build something
like Kevins TIPR https://www.kevin-indig.com/internal-link-optimization-with-tipr/
for example: Fetch Backlinks, score them with Backlinks visibility or referrers
and use a log for normalization
  
To increase the impact of specific links (f.e. in content) one could use
sth. like: https://stackoverflow.com/questions/9136539/how-do-weighted-edges-affect-pagerank-in-networkx
Library: https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html
"""
  if weight:
    g = nx.from_pandas_edgelist(links, 'Source', 'Destination', 'weights')
  else:
    g = nx.from_pandas_edgelist(links, 'Source', 'Destination')

  # Calculate Pagerank
  pr = nx.pagerank_scipy(
      g,
      alpha=0.85,
      weight=weight, # Link Gewichtung
      personalization=personalization, #Personalisierung (Backlinks)
      nstart=personalization,#Personalisierung (Backlinks)
      dangling=personalization #Personalisierung (Backlinks)
      )
  pr = pd.DataFrame.from_dict(pr, orient='index').rename(columns={0:'pr'})
  return pr

def replace_destinations(df, replacer):
  # obey redirects
  df = pd.merge(df,replacer,left_on='Destination',right_on='Address',how='left')
  # Replace Destination with Final Address, If not present use Destination
  df['Destination'] = df['Final Address'].fillna(df['Destination'])
  # If Source is redirected or has Canonical: Drop Link
  df = df[df['Source'].isin(replacer['Address'])==False]
  # drop temp-columns
  df = df.drop(columns=['Address','Final Address'])
  return df


def normalize_links(
    df, redirects=pd.DataFrame(), canonicals=pd.DataFrame(), duplicates=True):
  """Reduce number of Links and delete data you don't need
  """

  df = df.rename(columns={'Anchor':'linkText'})

  print(len(df), 'Links')
  df = df[df['Type']=='Hyperlink'] #only <a href>
  print(len(df), '<a href>-Links, dropped other')
  df['Destination'] = df['Destination'].str.replace('#.*', '') # Sprungmarken-links normalisieren

  # Redirects und Canonicals normalisieren
  # Jaja, erst auflösen, dann ersetzen wäre sauberer
  df = replace_destinations(df, redirects) # obey redirects
  df = replace_destinations(df, canonicals) # obey canonicals

  df = df[df['Source']!=df['Destination']] # links zum gleichen Dokument löschen
  print(len(df), 'to different URLs, dropped Links to self (including #)')
  df = df[df['Link Position']==df['Link Position']] # Drop Links without Position given
  print(len(df), 'links with known Link Position')
  df['weights'] = df['Link Position'].swifter.apply(lambda x: element_weights[x])
  df = df.sort_values('weights', ascending=False).drop_duplicates(subset=['Source','Destination'])
  print(len(df), 'Unique links (dropped duplicates according to link weights)')

  df['linkText'] = df['linkText'].fillna(df['Alt Text']) # Use Alt Text if no linktext given
  df['linkText'] = df['linkText'].fillna('').astype(str) # Ensure Strings
  df['internal'] = df['Destination'].swifter.apply(check_internal) # Check if internal link

  df['Source'] = df['Source'].str.replace('/index.json', '/') # Komische URLs im Crawl (web.dev)
  df['Destination'] = df['Destination'].str.replace('/index.json', '/')# Komische URLs im Crawl (web.dev)
  
  # Only one link from Source to destination
  if duplicates==True:
    df = df.drop_duplicates(subset=['Source','Destination'])

  return df

# Basic Analysis
Basis-Analysen allein auf Basis der Exporte ohne weitere Berechnungen:

* Meist genutzte Linktexte
* Linktexte mit unterschiedlichen Zielseiten
* Dokumente mit den meisten verschiedenen Linktexten
* Dokumente mit den meisten ausgehenden externen Links
* Meistverlinkte externe Dokumente + Linktext
* Dokumente mit eingehenden Nofollow-Links

## Rohdaten laden

In [None]:
links, docs = read_files()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


69962 Links
50079 <a href>-Links, dropped other
643007 to different URLs, dropped Links to self (including #)
639313 links with known Link Position


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=639313.0, style=ProgressStyle(descript…


28905 Unique links (dropped duplicates according to link weights)


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=28905.0, style=ProgressStyle(descripti…




## Meistgenutzte Linktexte
Sind die Linktexte
* spezifisch?
* beschreibend?
* hilfreich?

In [None]:
# Most used linktexts
links[
    links['internal']==True
].groupby(
    'linkText'
).agg(
    numLinks=('Destination','count'), 
    uniqueDestinations=('Destination',pd.Series.nunique)
).sort_values(
    'numLinks', ascending=False
).head(15)

Unnamed: 0_level_0,numLinks,uniqueDestinations
linkText,Unnamed: 1_level_1,Unnamed: 2_level_1
About,631,1
web.dev,631,2
Community Guidelines,631,1
Podcasts,630,1
Live,628,1
Learn,627,1
Measure,623,1
Subscribe,566,1
Blog,513,1
,385,238


## Linktexte mit unterschiedlichen Zielen
Wird der gleiche Linktext für unterschiedliche Zielseiten verwendet.
* Viele unterschiedliche Texte: schwächen im Template / Kandidaten für Löschung
* Weniger unterschiedliche Texte: Oft Konkurrenz um den gleichen Begriff

In [None]:
# Linktexts with most different Destinations
links[
    links['internal']==True
].groupby(
    'linkText'
).agg(
    numLinks=('Destination','count'), 
    uniqueDestinations=('Destination',pd.Series.nunique)
).sort_values(
    'uniqueDestinations', ascending=False
).head(15)

Unnamed: 0_level_0,numLinks,uniqueDestinations
linkText,Unnamed: 1_level_1,Unnamed: 2_level_1
,385,238
RSS Feed,80,78
Return to article,23,17
Return to all articles,337,17
performance budgets,8,4
here,4,4
field metrics,5,4
screenshot,4,4
1,3,3
2,7,3


## Zielseiten mit hoher Linktext-Varianz / Linkziele mit unterschiedlichen Texten
Seiten die mit besonders vielen unterschiedlichen Linktexten verlinkt werden.
Unterschiedliche Ankertexte sind grundsätzlich hilfreich, sie sollten aber schon eine Einordnung ermöglichen


In [None]:
# Destinations with most different Linktexts
links[
    links['internal']==True
].groupby(
    'Destination'
).agg(
    numLinks=('Source','count'), 
    uniqueTexts=('linkText',pd.Series.nunique),
    texts = ('linkText', lambda x: ', '.join(set(x)))
).sort_values(
    'uniqueTexts', ascending=False
).head(15)

Unnamed: 0_level_0,numLinks,uniqueTexts,texts
Destination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://web.dev/authors/jakearchibald/,328,32,"Chris Anstey, Jonathon Imperiosi, Performance,..."
https://web.dev/fast/,80,29,"resource prioritization, Fast load times, use ..."
https://web.dev/user-centric-performance-metrics/,33,22,", load fast, lab data, in the lab, field metri..."
https://web.dev/reduce-javascript-payloads-with-code-splitting/,28,19,", code splitting, split, lazy load, Only send ..."
https://web.dev/progressive-web-apps/,41,17,"Introduction, progressive web app, our guidanc..."
https://web.dev/customize-install/,15,13,"Add to Home Screen, installable guide, install..."
https://web.dev/http-cache/,17,11,The HTTP cache: your first line of defense gui...
https://web.dev/add-manifest/,21,11,", icons array, manifest file, define a Progres..."
https://web.dev/lighthouse-performance/,44,11,Collection 33 resources Updated Performance au...
https://web.dev/remove-unused-code/,14,10,", removing unused code, npm makes adding code ..."


## Nicht indexierbare Linkziele
Nicht indexierbare Seiten sollten selten verlinkt werden.
Bei häufig verlinkten Seiten:
* Macht es Sinn die Seite zu indexieren?
* Macht es Sinn die Seite weniger häufig zu verlinken?


In [None]:
links[
    links['Destination'].isin(
        docs[docs['Indexability']=='Non-Indexable']['Address']
    )
].groupby(
    'Destination'
).agg(
    numLinks=('Source','count'), 
    uniqueSource=('Source',pd.Series.nunique),
    texts = ('linkText', lambda x: ', '.join(set(x)))
).sort_values(
    'numLinks', ascending=False
).head(15)

Unnamed: 0_level_0,numLinks,uniqueSource,texts
Destination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://web.dev/handbook/content-checklist/,3,3,"web.dev content checklist, content checklist"
https://web.dev/handbook/grammar/,3,3,"Lists section of the Grammar, mechanics, and u..."
https://web.dev/undefined/,3,3,Return to all articles
https://web.dev/handbook/author-profile/,2,2,"Author profile, Authors profile"
https://web.dev/handbook/self-assessment-components/,2,2,Self-assessments
https://web.dev/handbook/yaml-front-matter/,2,2,YAML front matter
https://web.dev/cls/(/metrics/,1,1,field metrics
"https://web.dev/live/\""https://web.dev/prefers-color-scheme/\""",1,1,https://web.dev/prefers-color-scheme/
"https://web.dev/live/\""https://web.dev/storage-for-the-web/",1,1,Service Workers and IndexedDB
"https://web.dev/live/\""https://web.dev/sign-in-form-best-practices/\""",1,1,https://web.dev/sign-in-form-best-practices/


## Seiten mit vielen ausgehenden externen Links
Seiten mit vielen ausgehenden Links sollten inhaltlich geprüft werden.

In [None]:
# Dokumente mit den meisten ausgehenden externen Links
links[
    links['internal'] == False
].groupby(
    'Source'
).agg(
    numLinks=('Destination','count'), 
    uniqueDestinations=('Destination',pd.Series.nunique),
).sort_values(
    'numLinks',ascending=False
).head(10)

Unnamed: 0_level_0,numLinks,uniqueDestinations
Source,Unnamed: 1_level_1,Unnamed: 2_level_1
https://web.dev/prefers-color-scheme/,96,96
https://web.dev/live/,92,92
https://web.dev/covid19/,76,76
https://web.dev/quictransport/,72,72
https://web.dev/overloaded-server/,69,69
https://web.dev/optimize-lcp/,62,62
https://web.dev/next-gen-css-2019/,62,62
https://web.dev/shape-detection/,62,62
https://web.dev/digging-into-the-privacy-sandbox/,61,61
https://web.dev/websocketstream/,61,61


## Externe Dokumente mit vielen eingehenden Links
Diese Dokumente werden häufig verlinkt. Ist das sinnvoll?

In [None]:
# Meistverlinkte externe Dokumente + Linktext
links[
    (links['internal'] == False) &
    (links['Type'] == 'Hyperlink')
].groupby(
    'Destination'
).agg(
    numLinks=('Source','count'), 
    texts = ('linkText', lambda x: ', '.join(set(x)))
).sort_values(
    'numLinks',ascending=False
).head(15)

Unnamed: 0_level_0,numLinks,texts
Destination,Unnamed: 1_level_1,Unnamed: 2_level_1
https://www.apache.org/licenses/LICENSE-2.0,632,Apache 2.0 License
https://policies.google.com/technologies/cookies,632,More details
https://developers.google.com/,632,Google Developers
https://developers.google.com/products,632,All products
https://github.com/GoogleChrome/web.dev/issues/new?assignees=&labels=bug&template=bug_report.md&title=,632,File a bug
https://developers.google.com/terms/site-policies,632,Google Developers Site Policies
https://creativecommons.org/licenses/by/4.0/,632,Creative Commons Attribution 4.0 License
https://developers.google.com/web/,632,Web Fundamentals
https://www.twitter.com/@ChromiumDev,632,Twitter
https://firebase.google.com/,632,Firebase


## Interne Links mit Nofollow
Nofollow bei internen Links nur sehr selten sinnvoll

In [None]:
# Nofollow-Linked Documents
links[
    links['Follow']==False
].groupby(
    ['Source','internal']
).agg(
    numLinks=('Destination','count'), 
    uniqueDestinations=('Destination',pd.Series.nunique)
).sort_values(
    'numLinks',ascending=False
).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,numLinks,uniqueDestinations
Source,internal,Unnamed: 2_level_1,Unnamed: 3_level_1


# PageRank Calculations
Wir versuchen uns einer realistischen PageRank-Kalkulation anzunähern.

## Basic PageRank

In [None]:
links, docs = read_files() # Read files to variables

result = pd.merge(
    docs, simulate_pagerank(
        links
    ).rename(columns={'pr': "pr"}),
    left_on='Address',right_index=True, how='outer')

result = result[['Address', 'pr']].sort_values('pr',ascending=False).reset_index().drop(columns='index') # Datenmenge reduzieren
result.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


69962 Links
50079 <a href>-Links, dropped other
643007 to different URLs, dropped Links to self (including #)
639313 links with known Link Position


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=639313.0, style=ProgressStyle(descript…


28905 Unique links (dropped duplicates according to link weights)


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=28905.0, style=ProgressStyle(descripti…




Unnamed: 0,Address,pr
0,https://web.dev/live/,0.013145
1,https://web.dev/blog/,0.009262
2,https://web.dev/podcasts/,0.009104
3,https://web.dev/,0.009025
4,https://web.dev/measure/,0.009
5,https://web.dev/community-guidelines/,0.008988
6,https://web.dev/learn/,0.008949
7,https://web.dev/about/,0.008949
8,https://github.com/googlechrome/web.dev,0.008725
9,https://developers.google.com/web/updates/,0.008725


## Wir löschen Nofollow und Links auf/von Noindex-Seiten

In [None]:
links, docs = read_files() # Read files to variables

# Wir löschen Nofollow-Links:
links = links[links['Follow']==True]

indexable = docs[docs['Indexability']=='Indexable']['Address'] # all indexable URLs
links = links[ #Only internal, only indexable
          (links['Source'].isin(indexable)) &
          (links['Destination'].isin(indexable))
        ]

pr_name = 'pr_noindex'
result_old = result # wir sichern die Berechnung aus dem vorherigen Schritt
result = pd.merge(
    docs, simulate_pagerank(
        links
    ).rename(columns={'pr': pr_name}), # Wir benennen den PR anders
    left_on='Address',right_index=True, how='outer')

result = result[['Address', pr_name]].sort_values(pr_name, ascending=False).reset_index().drop(columns='index')

pd.merge( # wir vergleichen alt und neu
    result, result_old, on='Address',how='outer'
).sort_values(pr_name, ascending=False).head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


69962 Links
50079 <a href>-Links, dropped other
643007 to different URLs, dropped Links to self (including #)
639313 links with known Link Position


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=639313.0, style=ProgressStyle(descript…


28905 Unique links (dropped duplicates according to link weights)


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=28905.0, style=ProgressStyle(descripti…




Unnamed: 0,Address,pr_noindex,pr
0,https://web.dev/podcasts/,0.032554,0.009104
1,https://web.dev/community-guidelines/,0.032554,0.008988
2,https://web.dev/about/,0.032554,0.008949
3,https://web.dev/learn/,0.032554,0.008949
4,https://web.dev/blog/,0.032554,0.009262
5,https://web.dev/,0.032554,0.009025
6,https://web.dev/live/,0.032554,0.013145
7,https://web.dev/measure/,0.032554,0.009
8,https://web.dev/newsletter/,0.029781,0.008348
9,https://web.dev/authors/jakearchibald/,0.017412,0.005127


## Wir versuchen einen Reasonable Surfer zu simulieren
(indem wir als einfach Annährung Links aus Navigation und Footer ignorieren)

In [None]:
links, docs = read_files() # Read files to variables

# Wir löschen Nofollow-Links:
links = links[links['Follow']==True]

indexable = docs[docs['Indexability']=='Indexable']['Address']
links = links[
          (links['Source'].isin(indexable)) &
          (links['Destination'].isin(indexable))
        ]

pr_name = 'pr_reasonable'
result_old = result # wir sichern die Berechnung aus dem vorherigen Schritt
result = pd.merge(
    docs, simulate_pagerank(
        links, weight='weights'
    ).rename(columns={'pr': pr_name}),
    left_on='Address',right_index=True, how='outer')

result = result[['Address', pr_name]].sort_values(pr_name, ascending=False).reset_index().drop(columns='index')

pd.merge( # wir vergleichen alt und neu
    result, result_old, on='Address',how='outer'
).sort_values(pr_name, ascending=False).head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


69962 Links
50079 <a href>-Links, dropped other
643007 to different URLs, dropped Links to self (including #)
639313 links with known Link Position


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=639313.0, style=ProgressStyle(descript…


28905 Unique links (dropped duplicates according to link weights)


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=28905.0, style=ProgressStyle(descripti…




Unnamed: 0,Address,pr_reasonable,pr_backlinks
0,https://web.dev/newsletter/,0.077724,0.073989
1,https://web.dev/authors/jakearchibald/,0.035989,0.034335
2,https://web.dev/measure/,0.009557,0.001563
3,https://web.dev/learn/,0.009167,0.002915
4,https://web.dev/blog/,0.009155,0.005656
5,https://web.dev/live/,0.009014,0.003746
6,https://web.dev/podcasts/,0.008926,0.00038
7,https://web.dev/about/,0.008828,0.000523
8,https://web.dev/,0.008828,0.002304
9,https://web.dev/community-guidelines/,0.008828,


## Wir beziehen Backlinks mit ein

Im Beispiel Backlinks von ahrefs logarithmiert als Personalisierung für die PageRank-Funktion übergeben. Andere Werde zur Bestimmung des externen Trusts ebenfalls denkbar: Klicks, Impressions, Social Signals.

Aber: Cross-Check: Sinnhaftigkeit, Ursache-/Wirkungsbeziehung

In [None]:
backlinks = files.upload() # Backlink-Export aus AHREFs hochladen

Saving backlinks.csv to backlinks (1).csv


In [None]:
backlinks = pd.read_csv(io.BytesIO(backlinks['backlinks.csv'])) # Datei einlesen
indexable = docs[docs['Indexability']=='Indexable']['Address']

df = pd.merge(
    indexable,backlinks[['Page URL', 'URL Rating (desc)']],
    left_on='Address', right_on='Page URL',
    how='left'
).set_index('Address')['URL Rating (desc)'].fillna(0)+0.001 # Backlinkziele mit indexierbaren URLs zusammenfassen
backlinks = np.log(df)/np.log(df.max()) # Logarithmisch anpassen, um starke / Schwache URLs zu relativieren
backlinks = backlinks.to_dict() # Dictionary für einfaches Handling

In [None]:
links, docs = read_files() # Read files to variables

# Wir löschen Nofollow-Links:
links = links[links['Follow']==True]

indexable = docs[docs['Indexability']=='Indexable']['Address']
links = links[
          (links['Source'].isin(indexable)) &
          (links['Destination'].isin(indexable))
        ]

pr_name = 'pr_backlinks'
result_old = result # wir sichern die Berechnung aus dem vorherigen Schritt
result = pd.merge(
    docs, simulate_pagerank(
        links,
        weight='weights',
        personalization = backlinks # Einbindung der Backlinks als Personalisierungswert
    ).rename(columns={'pr': pr_name}),
    left_on='Address',right_index=True, how='outer')

result = result[['Address', pr_name]].sort_values(pr_name, ascending=False).reset_index().drop(columns='index')

pd.merge( # wir vergleichen alt und neu
    result, result_old, on='Address',how='outer'
).sort_values(pr_name, ascending=False).head(10)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


69962 Links
50079 <a href>-Links, dropped other
643007 to different URLs, dropped Links to self (including #)
639313 links with known Link Position


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=639313.0, style=ProgressStyle(descript…


28905 Unique links (dropped duplicates according to link weights)


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=28905.0, style=ProgressStyle(descripti…




Unnamed: 0,Address,pr_backlinks,pr_reasonable
0,https://web.dev/newsletter/,0.077653,0.077724
1,https://web.dev/authors/jakearchibald/,0.036309,0.035989
2,https://web.dev/measure/,0.009656,0.009557
3,https://web.dev/learn/,0.009134,0.009167
4,https://web.dev/live/,0.009118,0.009014
5,https://web.dev/podcasts/,0.008979,0.008926
6,https://web.dev/,0.008971,0.008828
7,https://web.dev/blog/,0.008959,0.009155
8,https://web.dev/about/,0.008885,0.008828
9,https://web.dev/community-guidelines/,0.008856,0.008828


# Wir versuchen mal ein paar Aggregationen

In [None]:
pd.merge(
    pr.reset_index().rename(columns={'index':'Address'}), 
    docs[['Address','Content']],
    on='Address', how='left'
).groupby('Content').agg(
    anzahl=('Address','count'), 
    pr=('pr',sum)
).sort_values('pr')

Unnamed: 0_level_0,anzahl,pr
Content,Unnamed: 1_level_1,Unnamed: 2_level_1
image/svg+xml,6,0.001882
image/jpeg,6,0.002128
image/png,11,0.00374
application/xml,78,0.01678
text/html; charset=UTF-8,103,0.150478
text/html,524,0.824992


## Externe Ziele

In [None]:
df = pr.reset_index().rename(columns={'index':'Address'})
df['internal'] = df['pr'].astype(str).swifter.apply(lambda x: check_internal(x))
df.groupby('internal').agg(
    anzahl=('Address','count'), 
    pr=('pr',sum)
)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=728.0, style=ProgressStyle(description…




Unnamed: 0_level_0,anzahl,pr
internal,Unnamed: 1_level_1,Unnamed: 2_level_1
False,728,1.0


## Group by URL-Segment

In [None]:
def url_grouping(url): # so baut man das natürlich nicht
  if check_internal(url) == False:
    return 'external'
  if url.endswith('.xml'):
    return 'RSS'
  if url.endswith('.png'):
    return 'Image'
  if url.endswith('.svg'):
    return 'Image'
  if url.endswith('.jpg'):
    return 'Image'
  if 'handbook' in url:
    return '/handbook/'
  if 'codelab' in url:
    return 'codelab'
  if '/newsletter/archive' in url:
    return 'Newsletter Archive'
  if url in ['https://web.dev/', 'https://web.dev/live/', 'https://web.dev/blog/', 'https://web.dev/learn/','https://web.dev/measure/', 'https://web.dev/about/']:
    return 'Main Nav'
  if 'authors' in url:
    return 'authors'
  if 'tag' in url:
    return 'tag'
  if '/blog/' in url:
    return 'Blog-Pagination'
  return np.nan

df = pr.reset_index().rename(columns={'index':'url'})
df['url_group'] = df['url'].swifter.apply(lambda x: url_grouping(x))

df[
   (df['url_group']!=df['url_group'])
   ].sort_values('pr')
df['url_group'] = df['url_group'].fillna('article')
df.groupby('url_group').agg(
    anzahl=('url','count'),
    pr=('pr',sum)
).sort_values('pr',ascending=False)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=728.0, style=ProgressStyle(description…




## Group by Custom Extraction von Breadcrumb-Elementen oder Tags

In [None]:
# Upload files (bei größeren Datenmengen als .gz hochladen)
extractions = files.upload()

Saving custom_extraction_all.csv to custom_extraction_all.csv


In [None]:
extractions = pd.read_csv(io.BytesIO(extractions['custom_extraction_all.csv']))
extractions

Unnamed: 0,Address,Status Code,Status,Breadcrumb 1,Breadcrumb 2,Tag 1
0,https://web.dev/,200,OK,,,
1,https://web.dev/aria-poison-or-antidote/,200,OK,\n \n Home\n \n,\n \n All articles\n \n,accessibility
2,https://web.dev/measure/,200,OK,,,
3,https://web.dev/tags/accessibility/,200,OK,,,
4,https://web.dev/tags/capabilities/,200,OK,,,
...,...,...,...,...,...,...
671,https://web.dev/handbook/voice/,200,OK,,,
672,https://web.dev/handbook/reviews/,200,OK,,,
673,https://web.dev/handbook/author-profile/,200,OK,,,
674,https://web.dev/handbook/yaml-front-matter/,200,OK,,,


In [None]:
pd.merge(
    pr.reset_index().rename(columns={'index':'Address'}), 
    extractions[['Address','Tag 1']],
    on='Address', how='left'
).groupby('Tag 1').agg(
    anzahl=('Address','count'), 
    pr=('pr',sum)
).sort_values('pr', ascending=False)

Unnamed: 0_level_0,anzahl,pr
Tag 1,Unnamed: 1_level_1,Unnamed: 2_level_1
performance,112,0.169003
newsletter,4,0.078981
capabilities,21,0.030447
progressive-web-apps,16,0.026222
security,16,0.014443
case-study,8,0.010198
css,8,0.008539
accessibility,6,0.008205
payments,8,0.007811
angular,9,0.00709
