<a href="https://colab.research.google.com/github/wingmenonlinemarketing/iv_colab/blob/master/Interne_Verlinkung_optimieren.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Interne Verlinkung optimieren

Notebook zum Webinar "pragmatische Optimierung der internen Verlinkung" von johan.huelsen@wngmn.de für audisto.com

Im Webinar sind wir verschiedene Ansätze zur Optimierung der internen Verlinkung durchgegangen. Als Beispiel haben wir einen Crawl von web.dev genutzt.

In [None]:
# Notwendige Libraries laden

# Non-Standard Libraries installieren
!pip install tldextract
!pip install swifter 

import pandas as pd # Für Datenverarbeitung
import networkx as nx # Für PageRank
from google.colab import files # Für Dateiupload
import io # Für Dateiupload
import swifter # Zum Beschleunigen mancher Pandas-Funktionen
import tldextract # Zur Extraction von Domains aus URLs
import numpy as np # Für Logarithmus-Kram bei Backlinks

In [None]:
# ScreamingFrog Exporte hochladen
# Upload files (bei größeren Datenmengen als .gz hochladen)
print('Upload: all_outlinks.csv, redirect_and_canonical_chains.csv, internal_all.csv')
uploaded = files.upload()

Upload: all_outlinks.csv, redirect_and_canonical_chains.csv, internal_all.csv


Saving all_outlinks.csv to all_outlinks (1).csv
Saving internal_all.csv to internal_all.csv
Saving redirect_and_canonical_chains.csv to redirect_and_canonical_chains.csv


In [None]:
# Für welche Domain sollen Links als interne Links gewertet werden?
domain = 'web.dev'

In [None]:
# main functions
check_internal = lambda x: '.'.join(tldextract.extract(x)[1:3]) == domain

def read_file(filename):
  '''read uploaded file to variable'''
  return pd.read_csv(io.BytesIO(uploaded[filename]))

def simulate_pagerank(links, personalization=None):
  """To include Backlink values for weighting of nodes use a dictionary with a
personalization score for all URLs between 0 and 1 you could build something
like Kevins TIPR https://www.kevin-indig.com/internal-link-optimization-with-tipr/
for example: Fetch Backlinks, score them with Backlinks visibility or referrers
and use a log for normalization
  
To increase the impact of specific links (f.e. in content) one could use
sth. like: https://stackoverflow.com/questions/9136539/how-do-weighted-edges-affect-pagerank-in-networkx
Library: https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html
"""
  g = nx.from_pandas_edgelist(links, 'Source', 'Destination')

  # Calculate Pagerank
  pr = nx.pagerank_scipy(
      g,
      alpha=0.85,
      max_iter=5000, 
      personalization=personalization
      )
  pr = pd.DataFrame.from_dict(pr, orient='index').rename(columns={0:'pr'})
  return pr
  
def normalize_links(
    df, redirects=pd.DataFrame(), nofollow=True, duplicates=True,
    normalize_redirects=False
):
  """Reduce number of Links and delete data you don't need
  """
  # only <a hrefs>
  df = df[df['Type']=='Hyperlink']
  # only links from internal_all
  df = df[df['Source'].isin(docs['Address'])]

  # Delete Anchors from Links
  df['Destination'] = df['Destination'].str.replace('#.*','') #delete Anchors from destination-URLs
  df = df[df['Source'].str.contains('#')==False] # drop Anchors from Source (shouldn't exist anyways)
  df = df[df['Source']!=df['Destination']] # drop links to self

  # drop nofollow-Links
  if nofollow==False:
    df = df[df['Follow']==True]
  
  if normalize_redirects:
  # obey redirects and canonicals
    df = pd.merge(df,redirects,left_on='Destination',right_on='Address',how='left')
    # Replace Destination with Final Address, If not present use Destination
    df['Destination'] = df['Final Address'].fillna(df['Destination'])
    # If Source is redirected or has Canonical: Drop Link
    df = df[df['Source'].isin(redirects['Address'])==False]
    # drop temp-columns
    df = df.drop(columns=['Address','Final Address'])
  
  # Only one link from Source to destination
  if duplicates==False:
    df = df.drop_duplicates(subset=['Source','Destination'])

  return df

# Basic Analysis
Basis-Analysen allein auf Basis der Exporte ohne weitere Berechnungen:

* Meist genutzte Linktexte
* Linktexte mit unterschiedlichen Zielseiten
* Dokumente mit den meisten verschiedenen Linktexten
* Dokumente mit den meisten ausgehenden externen Links
* Meistverlinkte externe Dokumente + Linktext
* Dokumetne mit eingehenden Nofollow-Links

## Rohdaten laden

In [None]:
links = read_file('all_outlinks.csv')
docs = read_file('internal_all.csv')
redirects = read_file('redirect_and_canonical_chains.csv')[
    ['Address','Final Address']
]

links['Anchor'] = links['Anchor'].fillna(links['Alt Text']) # Use Alt Text if no linktext given
links['Anchor'] = links['Anchor'].fillna('').astype(str) # Ensure Strings
links['internal'] = links['Destination'].swifter.apply(check_internal) # Check if internal link
links = links.drop(columns=['Size (Bytes)','Alt Text','Status','Status Code']) # drop unneccessary cols
links = links.rename(columns={'Anchor':'linkText'})

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=75440.0, style=ProgressStyle(descripti…




## Remove index.jsons from web.dev-Example

In [None]:
docs = docs[docs['Address'].str.endswith('/index.json')==False]
links['Source'] = links['Source'].str.replace('/index.json', '/')
links['Destination'] = links['Destination'].str.replace('/index.json', '/')

## Meistgenutzte Linktexte
Sind die Linktexte
* spezifisch?
* beschreibend?
* hilfreich?

In [None]:
# Most used linktexts
links[
    links['internal']==True
].groupby(
    'linkText'
).agg(
    numLinks=('Destination','count'), 
    uniqueDestinations=('Destination',pd.Series.nunique)
).sort_values(
    'numLinks', ascending=False
).head(10)

Unnamed: 0_level_0,numLinks,uniqueDestinations
linkText,Unnamed: 1_level_1,Unnamed: 2_level_1
,10853,798
#,3660,483
web.dev,2532,3
Measure,1340,3
Learn,1339,2
Blog,1339,2
About,1338,1
Live,1338,1
Podcasts,669,1
Community Guidelines,669,1


## Linktexte mit unterschiedlichen Zielen
Wird der gleiche Linktext für unterschiedliche Zielseiten verwendet.
* Viele unterschiedliche Texte: schwächen im Template / Kandidaten für Löschung
* Weniger unterschiedliche Texte: Oft Konkurrenz um den gleichen Begriff

In [None]:
# Linktexts with most different Destinations
links[
    links['internal']==True
].groupby(
    'linkText'
).agg(
    numLinks=('Destination','count'), 
    uniqueDestinations=('Destination',pd.Series.nunique)
).sort_values(
    'uniqueDestinations', ascending=False
).head(10)

Unnamed: 0_level_0,numLinks,uniqueDestinations
linkText,Unnamed: 1_level_1,Unnamed: 2_level_1
,10853,798
#,3660,483
1,168,156
RSS Feed,103,101
Return to article,32,26
Return to all articles,408,17
In progress,6,6
2,20,6
Installable,8,6
performance budgets,7,5


## Zielseiten mit hoher Linktext-Varianz / Linkziele mit unterschiedlichen Texten
Seiten die mit besonders vielen unterschiedlichen Linktexten verlinkt werden.
Unterschiedliche Ankertexte sind grundsätzlich hilfreich, sie sollten aber schon eine Einordnung ermöglichen


In [None]:
# Destinations with most different Linktexts
links[
    links['internal']==True
].groupby(
    'Destination'
).agg(
    numLinks=('Source','count'), 
    uniqueTexts=('linkText',pd.Series.nunique),
    texts = ('linkText', lambda x: ', '.join(set(x)))
).sort_values(
    'uniqueTexts', ascending=False
).head(5)

NameError: ignored

## Nicht indexierbare Linkziele
Nicht indexierbare Seiten sollten selten verlinkt werden.
Bei häufig verlinkten Seiten:
* Macht es Sinn die Seite zu indexieren?
* Macht es Sinn die Seite weniger häufig zu verlinken?


In [None]:
links[
    links['Destination'].isin(
        docs[docs['Indexability']=='Non-Indexable']['Address']
    )
].groupby(
    'Destination'
).agg(
    numLinks=('Source','count'), 
    uniqueSource=('Source',pd.Series.nunique),
    texts = ('linkText', lambda x: ', '.join(set(x)))
).sort_values(
    'numLinks', ascending=False
).head(5)

Unnamed: 0_level_0,numLinks,uniqueSource,texts
Destination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://web.dev/tags/performance/,543,198,", 1, Performance"
https://web.dev/blog,326,166,"blog homepage, All articles, See all articles,..."
https://web.dev/tags/capabilities/,157,68,", 1, Capabilities"
https://web.dev/tags/progressive-web-apps/,126,61,"Progressive Web Apps, 1,"
https://web.dev/fast,121,68,Collection 57 resources Updated Fast load time...


## Seiten mit vielen ausgehenden externen Links
Seiten mit vielen ausgehenden Links sollten inhaltlich geprüft werden.

In [None]:
# Dokumente mit den meisten ausgehenden externen Links
links[
    links['internal'] == False
].groupby(
    'Source'
).agg(
    numLinks=('Destination','count'), 
    uniqueDestinations=('Destination',pd.Series.nunique),
).sort_values(
    'numLinks',ascending=False
).head(10)

Unnamed: 0_level_0,numLinks,uniqueDestinations
Source,Unnamed: 1_level_1,Unnamed: 2_level_1
https://web.dev/prefers-color-scheme/,215,194
https://web.dev/live/,182,142
https://web.dev/blog/2/,159,108
https://web.dev/blog/,155,122
https://web.dev/responsive-web-design-basics/,154,152
https://web.dev/authors/houssein/,151,77
https://web.dev/blog/4/,151,105
https://web.dev/tags/capabilities/,149,91
https://web.dev/authors/jeffposnick/,147,64
https://web.dev/tags/progressive-web-apps/,143,88


## Externe Dokumente mit vielen eingehenden Links
Diese Dokumente werden häufig verlinkt. Ist das sinnvoll?

In [None]:
# Meistverlinkte externe Dokumente + Linktext
links[
    (links['internal'] == False) &
    (links['Type'] == 'Hyperlink')
].groupby(
    'Destination'
).agg(
    numLinks=('Source','count'), 
    texts = ('linkText', lambda x: ', '.join(set(x)))
).sort_values(
    'numLinks',ascending=False
).head(10)

Unnamed: 0_level_0,numLinks,uniqueDestinations,texts
Destination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://firebaseremoteconfig.googleapis.com/v1/projects/web-dev-production-1/namespaces/fireperf:fetch?key=AIzaSyCyThSjI_ZUT1NwV9aQLtqklVcNj72gvo8,670,670,
https://firebase.google.com/,670,669,Firebase
https://developers.google.com/web/,669,669,Web Fundamentals
https://www.youtube.com/user/ChromeDevelopers,669,669,YouTube
https://developer.chrome.com/home,669,669,Chrome
https://developers.google.com/products,669,669,All products
https://github.com/GoogleChrome/web.dev/issues/new?assignees=&labels=bug&template=bug_report.md&title=,669,669,File a bug
https://developers.google.com/site-policies,669,669,Google Developers Site Policies
https://www.apache.org/licenses/LICENSE-2.0,669,669,Apache 2.0 License
https://cloud.google.com/,669,669,Google Cloud Platform


## Interne Links mit Nofollow
Nofollow bei internen Links nur sehr selten sinnvoll

In [None]:
# Nofollow-Linked Documents
links[
    links['Follow']==False
].groupby(
    ['Source','internal']
).agg(
    numLinks=('Destination','count'), 
    uniqueDestinations=('Destination',pd.Series.nunique)
).sort_values(
    'numLinks',ascending=False
).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,numLinks,uniqueDestinations
Source,internal,Unnamed: 2_level_1,Unnamed: 3_level_1


# PageRank Calculations
Wir versuchen uns einer realistischen PageRank-Kalkulation anzunähern.

In [None]:
# because we will reload the files a lot
def read_files():
  # Read files to variables
  links = read_file('all_outlinks.csv')
  docs = read_file('internal_all.csv')
  redirects = read_file('redirect_and_canonical_chains.csv')[
      ['Address','Final Address']
  ]

  docs = docs[docs['Address'].str.endswith('/index.json')==False] # Hacks for web.dev
  links['Source'] = links['Source'].str.replace('/index.json', '/') # Hacks for web.dev
  links['Destination'] = links['Destination'].str.replace('/index.json', '/') # Hacks for web.dev

  return links, docs, redirects

## Basic PageRank

In [None]:
links, docs, redirects = read_files() # Read files to variables

result = pd.merge(
    docs, simulate_pagerank(
        links
    ).rename(columns={'pr': "pr"}),
    left_on='Address',right_index=True, how='outer')

result = result[['Address', 'pr']].sort_values('pr',ascending=False).reset_index().drop(columns='index') # Datenmenge reduzieren
result.head(10)

Unnamed: 0,Address,pr
0,https://web.dev/live/,0.005344
1,https://play.google.com/store/apps/details?id=...,0.00428
2,https://play.google.com/store/apps/details?id=...,0.004274
3,https://play.google.com/store/apps/details?id=...,0.004274
4,https://play.google.com/store/apps/details?id=...,0.004274
5,https://web.dev/blog/,0.00427
6,https://support.microsoft.com/en-us/help/13776...,0.004217
7,https://web.dev/,0.003975
8,https://support.apple.com/en-us/HT207570,0.003947
9,https://support.apple.com/en-us/HT201859,0.003897


## Wir normalisieren Redirects und Canonicals

In [None]:
links, docs, redirects = read_files() # Read files to variables

# Wir manipulieren die Links:
links = normalize_links(
    links,
    redirects=redirects, # Wir übergeben der Funktion die Redirect-Tabelle und ersetzen das Linkziel durch die finale URL, falls Weiterleitungen oder abweichendes Canonical
    normalize_redirects=True # Wir wollen die Redirects normalisieren
)

pr_name = 'pr_correct_redir'
result_old = result # wir sichern die Berechnung aus dem vorherigen Schritt
result = pd.merge(
    docs, simulate_pagerank(
        links
    ).rename(columns={'pr': pr_name}), # Wir benennen den PR anders
    left_on='Address',right_index=True, how='outer')

result = result[['Address', pr_name]].sort_values(pr_name, ascending=False).reset_index().drop(columns='index')

pd.merge( # wir vergleichen alt und neu
    result, result_old, on='Address',how='outer'
).sort_values(pr_name, ascending=False).head(10)

Unnamed: 0,Address,pr_correct_redir,pr
0,https://web.dev/live/,0.011432,0.005344
1,https://web.dev/blog/,0.008281,0.00427
2,https://web.dev/,0.00815,0.003975
3,https://web.dev/learn/,0.008129,0.003632
4,https://web.dev/podcasts/,0.008058,0.003879
5,https://web.dev/about/,0.008027,0.003886
6,https://web.dev/measure/,0.007987,0.0035
7,https://web.dev/community-guidelines/,0.007925,0.003416
8,https://firebaseremoteconfig.googleapis.com/v1...,0.007713,0.003249
18,https://www.youtube.com/user/ChromeDevelopers,0.007698,0.003245


## Wir löschen Nofollow und Links auf/von Noindex-Seiten

In [None]:
links, docs, redirects = read_files() # Read files to variables

# Wir manipulieren die Links:
links = normalize_links(
    links,
    nofollow=False, # Nofollow-Links werden gedroppt
    redirects=redirects, 
    normalize_redirects=True
)
indexable = docs[docs['Indexability']=='Indexable']['Address'] # all indexable URLs
links = links[ #Only internal, only indexable
          (links['Source'].isin(indexable)) &
          (links['Destination'].isin(indexable))
        ]

pr_name = 'pr_noindex'
result_old = result # wir sichern die Berechnung aus dem vorherigen Schritt
result = pd.merge(
    docs, simulate_pagerank(
        links
    ).rename(columns={'pr': pr_name}), # Wir benennen den PR anders
    left_on='Address',right_index=True, how='outer')

result = result[['Address', pr_name]].sort_values(pr_name, ascending=False).reset_index().drop(columns='index')

pd.merge( # wir vergleichen alt und neu
    result, result_old, on='Address',how='outer'
).sort_values(pr_name, ascending=False).head(10)

Unnamed: 0,Address,pr_noindex,pr_correct_redir
0,https://web.dev/live/,0.035639,0.011432
1,https://web.dev/,0.03551,0.00815
2,https://web.dev/blog/,0.03551,0.008281
3,https://web.dev/podcasts/,0.03551,0.008058
4,https://web.dev/about/,0.03551,0.008027
5,https://web.dev/measure/,0.035425,0.007987
6,https://web.dev/learn/,0.035425,0.008129
7,https://web.dev/community-guidelines/,0.035425,0.007925
8,https://web.dev/newsletter/,0.032458,0.007391
9,https://web.dev/authors/jakearchibald/,0.006988,0.001872


## Wir versuchen einen Reasonable Surfer zu simulieren
(indem wir als einfach Annährung Links aus Navigation und Footer ignorieren)

In [None]:
links, docs, redirects = read_files() # Read files to variables
ignored_positions = ['Header','Navigation', 'Footer','Head']

# Wir manipulieren die Links:
links = normalize_links(
    links,
    nofollow=False,
    redirects=redirects, 
    normalize_redirects=True
)
indexable = docs[docs['Indexability']=='Indexable']['Address']
links = links[
          (links['Source'].isin(indexable)) &
          (links['Destination'].isin(indexable)) &
          (links['Link Position'].isin(ignored_positions)==False) # Drop Links from ignored positions 
        ]


pr_name = 'pr_reasonable'
result_old = result # wir sichern die Berechnung aus dem vorherigen Schritt
result = pd.merge(
    docs, simulate_pagerank(
        links
    ).rename(columns={'pr': pr_name}),
    left_on='Address',right_index=True, how='outer')

result = result[['Address', pr_name]].sort_values(pr_name, ascending=False).reset_index().drop(columns='index')

pd.merge( # wir vergleichen alt und neu
    result, result_old, on='Address',how='outer'
).sort_values(pr_name, ascending=False).head(10)

Unnamed: 0,Address,pr_reasonable,pr_backlinks
0,https://web.dev/,0.085322,0.084854
1,https://web.dev/newsletter/,0.076607,0.076303
2,https://web.dev/lighthouse-accessibility/,0.008651,0.008687
3,https://web.dev/fast/,0.00825,0.008805
4,https://web.dev/accessibility-scoring/,0.006209,0.006298
5,https://web.dev/tags/security/,0.0059,0.006148
6,https://web.dev/tags/case-study/,0.005852,0.005986
7,https://web.dev/blog/,0.005599,0.005653
8,https://web.dev/blog/6/,0.005464,0.005698
9,https://web.dev/blog/3/,0.005397,0.005673


## Wir ignorieren doppelte Links

In [None]:
links, docs, redirects = read_files() # Read files to variables
ignored_positions = ['Header','Navigation', 'Footer','Head']

# Wir manipulieren die Links:
links = normalize_links(
    links,
    nofollow=False,
    redirects=redirects, 
    normalize_redirects=True,
    duplicates=True
)
indexable = docs[docs['Indexability']=='Indexable']['Address']
links = links[
          (links['Source'].isin(indexable)) &
          (links['Destination'].isin(indexable)) &
          (links['Link Position'].isin(ignored_positions)==False) # Drop Links from ignored positions 
        ]


pr_name = 'pr_unique_links'
result_old = result # wir sichern die Berechnung aus dem vorherigen Schritt
result = pd.merge(
    docs, simulate_pagerank(
        links
    ).rename(columns={'pr': pr_name}),
    left_on='Address',right_index=True, how='outer')

result = result[['Address', pr_name]].sort_values(pr_name, ascending=False).reset_index().drop(columns='index')

pd.merge( # wir vergleichen alt und neu
    result, result_old, on='Address',how='outer'
).sort_values(pr_name, ascending=False).head(10)

Unnamed: 0,Address,pr_unique_links,pr_reasonable
0,https://web.dev/,0.085322,0.085322
1,https://web.dev/newsletter/,0.076607,0.076607
2,https://web.dev/lighthouse-accessibility/,0.008651,0.008651
3,https://web.dev/fast/,0.00825,0.00825
4,https://web.dev/accessibility-scoring/,0.006209,0.006209
5,https://web.dev/tags/security/,0.0059,0.0059
6,https://web.dev/tags/case-study/,0.005852,0.005852
7,https://web.dev/blog/,0.005599,0.005599
8,https://web.dev/blog/6/,0.005464,0.005464
9,https://web.dev/blog/3/,0.005397,0.005397


## Wir beziehen Backlinks mit ein

Im Beispiel Backlinks von ahrefs logarithmiert als Personalisierung für die PageRank-Funktion übergeben. Andere Werde zur Bestimmung des externen Trusts ebenfalls denkbar: Klicks, Impressions, Social Signals.

Aber: Cross-Check: Sinnhaftigkeit, Ursache-/Wirkungsbeziehung

In [None]:
backlinks = files.upload() # Backlink-Export aus AHREFs hochladen

Saving backlinks.csv to backlinks (4).csv


In [None]:
backlinks = pd.read_csv(io.BytesIO(backlinks['backlinks.csv'])) # Datei einlesen
indexable = docs[docs['Indexability']=='Indexable']['Address']

df = pd.merge(
    indexable,backlinks[['Page URL', 'URL Rating (desc)']],
    left_on='Address', right_on='Page URL',
    how='left'
).set_index('Address')['URL Rating (desc)'].fillna(0)+0.001 # Backlinkziele mit indexierbaren URLs zusammenfassen
backlinks = np.log(df)/np.log(df.max()) # Logarithmisch anpassen, um starke / Schwache URLs zu relativieren
backlinks = backlinks.to_dict() # Dictionary für einfaches Handling

In [None]:
links, docs, redirects = read_files() # Read files to variables
ignored_positions = ['Header','Navigation', 'Footer','Head']

# Wir manipulieren die Links:
links = normalize_links(
    links,
    nofollow=False,
    redirects=redirects, 
    normalize_redirects=True,
    duplicates=True
)
indexable = docs[docs['Indexability']=='Indexable']['Address']
links = links[
          (links['Source'].isin(indexable)) &
          (links['Destination'].isin(indexable)) &
          (links['Link Position'].isin(ignored_positions)==False) # Drop Links from ignored positions 
        ]

pr_name = 'pr_backlinks'
result_old = result # wir sichern die Berechnung aus dem vorherigen Schritt
result = pd.merge(
    docs, simulate_pagerank(
        links,
        personalization = backlinks # Einbindung der Backlinks als Personalisierungswert
    ).rename(columns={'pr': pr_name}),
    left_on='Address',right_index=True, how='outer')

result = result[['Address', pr_name]].sort_values(pr_name, ascending=False).reset_index().drop(columns='index')

pd.merge( # wir vergleichen alt und neu
    result, result_old, on='Address',how='outer'
).sort_values(pr_name, ascending=False).head(10)




Unnamed: 0,Address,pr_backlinks,pr_unique_links
0,https://web.dev/,0.084854,0.085322
1,https://web.dev/newsletter/,0.076303,0.076607
2,https://web.dev/fast/,0.008805,0.00825
3,https://web.dev/lighthouse-accessibility/,0.008687,0.008651
4,https://web.dev/accessibility-scoring/,0.006298,0.006209
5,https://web.dev/tags/security/,0.006148,0.0059
6,https://web.dev/tags/case-study/,0.005986,0.005852
7,https://web.dev/lighthouse-performance/,0.005798,0.005204
8,https://web.dev/blog/6/,0.005698,0.005464
9,https://web.dev/blog/3/,0.005673,0.005397
