<a href="https://colab.research.google.com/github/tylergargula/seo-redirect-testing/blob/main/SEO_Migration_Redirect_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# SEO Migration Redirect Testing
### Requirements & Considerations:
<ul>
  <li>Save a copy to use (File > Save a copy in Drive).</li>
  <li>File must be .csv format.</li>
  <li>First column must contain <i>full URLs</i> (URL paths alone will not work).</li>
  <li>URL redirects must currently be in place.</li>
  <li>If testing redirects on a staging environment, URLs must be accessible without login credentials.</li>
  <li>Run each cell individually, or run all.</li>
</ul>





## 1. Import All Python Libraries



In [None]:
from concurrent.futures import ThreadPoolExecutor
from threading import Lock, Thread, local
import time

from google.colab import files
import numpy as np
import pandas as pd
import requests
from requests.sessions import Session



## 2. Upload CSV file


In [None]:
# Open the Colab upload widget; the keys of the returned mapping are used
# below as the uploaded file names.
upload = files.upload()

# Only the first uploaded file is read.
uploaded_names = list(upload)
input_file = uploaded_names[0]
request_urls = pd.read_csv(input_file)

# Preview the first column so the URLs can be checked before testing.
column_names = list(request_urls.columns)
col_1 = column_names[0]
request_urls[col_1].head()




## 3. Enter Testing Domains
<ul><li><i>If testing redirects on a non-production environment</i>, "New/Staging Domain" should be the full testing domain.</li>
<li><i>If testing redirects on a production environment</i>, "New/Staging Domain" should be the new/updated domain.<br><i>NOTE: the legacy and new domains can be the same if the domain has not changed.</i></li></ul>


In [None]:
# The domains are swapped by plain substring replacement, so stray whitespace
# typed at the prompt would stop every URL from matching.
LEGACY_DOMAIN = input('Enter full Legacy Domain (ex: https://www.site.com/):   ').strip()
NEW_DOMAIN = input('Enter full New/Staging Domain (ex: https://www.stage.site.com/):   ').strip()

if not LEGACY_DOMAIN:
    # An empty string is "in" every URL, and str.replace('', x) would insert
    # the new domain between every character of the URL.
    raise ValueError('Legacy Domain must not be empty.')

# Keep using an 'Address' column when the CSV has one; otherwise fall back to
# the first column instead of failing with a KeyError.
url_column = 'Address' if 'Address' in request_urls.columns else request_urls.columns[0]

# Blank cells are read as NaN (a float), which would break the substring test
# below, so drop them and make sure every remaining value is a string.
request_list = request_urls[url_column].dropna().astype(str).tolist()

# Build the URLs to request: every URL containing the legacy domain, with that
# domain replaced by the new/staging domain.
new_request_list = [
    url.replace(LEGACY_DOMAIN, NEW_DOMAIN)
    for url in request_list
    if LEGACY_DOMAIN in url
]

# URLs that do not contain the legacy domain are left out of the test; report
# how many so the omission is visible.
skipped_count = len(request_list) - len(new_request_list)
if skipped_count:
    print(f'Skipped {skipped_count} URL(s) that do not contain {LEGACY_DOMAIN}')


## 4. Perform URL analysis 


In [None]:

# Column-oriented result store: one list per output column, filled by
# download_link (one entry per analysed URL).
redirection_dict = {
    column: []
    for column in (
        'Request URL',
        'Request Status Code',
        'Redirected URL',
        'Redirected Status Code',
    )
}

# Per-thread storage; get_session keeps one requests.Session per thread on it.
thread_local = local()

def get_session():
    """Return the calling thread's requests.Session, creating it on first use.

    The session is stored on ``thread_local``, so each thread gets its own.
    """
    try:
        return thread_local.session
    except AttributeError:
        # First call from this thread: no session has been stored yet.
        thread_local.session = requests.Session()
        return thread_local.session

# Serialises writes to redirection_dict: one result is four appends to four
# parallel lists, and appends from different worker threads must not interleave
# or the rows end up misaligned.
_results_lock = Lock()


def download_link(url:str, timeout:float=30):
    """Send a HEAD request to *url*, follow redirects, and record the outcome.

    One entry is appended to each list in ``redirection_dict``:
    the requested URL, the redirect history (list of intermediate responses),
    the final URL, and the final status code.

    Args:
        url: Full URL to request.
        timeout: Seconds to wait for the server before giving up, so a hung
            host cannot stall a worker thread indefinitely.

    A request that fails (connection error, timeout, too many redirects, ...)
    is still recorded: the history is empty, the final URL is the requested
    URL, and the status-code slot holds ``'Error: <ExceptionName>'``.
    """
    session = get_session()
    try:
        # A single request through the thread's pooled session; redirects must
        # be enabled explicitly because HEAD does not follow them by default.
        with session.head(url, allow_redirects=True, timeout=timeout) as response:
            history = response.history
            final_url = response.url
            final_status = response.status_code
    except requests.RequestException as error:
        history = []
        final_url = url
        final_status = f'Error: {type(error).__name__}'

    with _results_lock:
        redirection_dict['Request URL'].append(url)
        redirection_dict['Request Status Code'].append(history)
        redirection_dict['Redirected URL'].append(final_url)
        redirection_dict['Redirected Status Code'].append(final_status)

    print(f'Analyzing :: {url}')


def download_all(urls:list):
    """Run ``download_link`` for every URL in *urls* on a pool of 10 threads.

    Args:
        urls: URLs to analyse. The function iterates this argument rather
            than any module-level list.

    Returns once all submitted requests have finished (leaving the ``with``
    block waits for the pool to shut down). The results of ``map`` are not
    consumed, so an exception raised inside ``download_link`` is not re-raised
    here.
    """
    with ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(download_link, urls)

# Analyse every rewritten URL and report the wall-clock time it took.
start = time.time()
download_all(new_request_list)
end = time.time()

elapsed = end - start
total_links = len(new_request_list)
print(f'download {total_links} links in {elapsed} seconds')



## 5. Format Data 



In [None]:

# One row per analysed URL, built from the collected result lists.
redirect_df = pd.DataFrame.from_dict(redirection_dict)

# The 'Request Status Code' column holds each response's redirect history.
# Its text form presumably looks like "[<Response [301]>]", so characters
# 12-14 are the status code of the first hop (empty when there was none).
history_text = redirect_df['Request Status Code'].astype(str)
redirect_df['Request Status Code'] = history_text.str[12:15]

# Rows whose final URL equals the requested URL did not redirect.
not_redirected = redirect_df['Request URL'] == redirect_df['Redirected URL']

# For those rows the request's own status is the final status code.
redirect_df['Request Status Code'] = np.where(
    not_redirected,
    redirect_df['Redirected Status Code'],
    redirect_df['Request Status Code'],
)
redirect_df['Redirected URL'] = np.where(
    not_redirected,
    'N/A',
    redirect_df['Redirected URL'],
)

# Must run after the 'N/A' substitution above, which it keys on.
redirect_df['Redirected Status Code'] = np.where(
    redirect_df['Redirected URL'] == 'N/A',
    'URL did not redirect',
    redirect_df['Redirected Status Code'],
)
redirect_df.head()


## 6. Export to Excel

In [None]:
from datetime import date

# Name the workbook after today's date, write it, then hand it to the browser
# for download.
today = date.today()
output_name = f'redirect_analysis_{today}.xlsx'
redirect_df.to_excel(output_name, index=False)
files.download(output_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>