<a href="https://colab.research.google.com/github/ysugiyama3/google_colab/blob/master/link_checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Link checker**

by Yukari Sugiyama (View source code on <a href='https://github.com/ysugiyama3/google_colab'>GitHub</a>)

- The program checks for broken links using HTTP status codes
- You need an Excel spreadsheet whose 1st column contains the URLs to check. The spreadsheet can have as many columns as necessary and must have column headers.
---


In [None]:
#@title 1. Upload an input Excel file (.xls, .xlsx, .xlsm, .xlsb, .odf)

from google.colab import files
import pandas as pd
import requests
import xml.etree.ElementTree as ET
from IPython.display import HTML, display
import time
import re

def progress(value, max=50000):
    """Build an HTML progress bar with a "value/max" caption underneath.

    Args:
        value: Number of items processed so far.
        max: Total number of items. The name shadows the builtin but is kept
            so existing keyword callers keep working.

    Returns:
        An ``IPython.display.HTML`` object wrapping a ``<progress>`` element.
    """
    # The attributes of <progress> are whitespace-separated (no commas), and
    # <br> is a void element, so it has no closing tag.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 40%'
        >
            {value}
        </progress>
        <br>{value}/{max}<br>
    """.format(value=value, max=max))

def _fetch_status(send, url, headers):
    """Issue one request and translate the outcome into a status value.

    Args:
        send: The requests callable to use (``requests.head`` or ``requests.get``).
        url: The URL to request.
        headers: HTTP headers to send with the request.

    Returns:
        The integer HTTP status code, or one of the strings
        'Timeout Error', 'Connection Error', 'Other Error'.
    """
    try:
        response = send(url, allow_redirects=True, headers=headers, timeout=(15, 15))
    except requests.exceptions.Timeout:
        # Must come before ConnectionError: ConnectTimeout subclasses both.
        return 'Timeout Error'
    except (requests.TooManyRedirects, requests.ConnectionError):
        return 'Connection Error'
    except requests.exceptions.RequestException:
        # Base class of every requests exception (HTTPError, URLRequired, ...).
        return 'Other Error'
    return response.status_code


def check_url(url):
    """Return the HTTP status of *url*, or an error label if the request fails.

    A HEAD request is tried first. If it yields a 5xx status, the URL is
    requested again with GET, and the GET outcome is returned instead.

    Args:
        url: The URL to check.

    Returns:
        The integer HTTP status code, or one of the strings
        'Timeout Error', 'Connection Error', 'Other Error'.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}

    code = _fetch_status(requests.head, url, headers)
    if str(code).startswith('5'):
        # Retry with GET, sending the same browser-like User-Agent so both
        # attempts look alike to the server.
        code = _fetch_status(requests.get, url, headers)
    return code

# Upload an input Excel file
uploaded = files.upload()
if not uploaded:
    # Nothing came back from the upload widget (presumably the dialog was
    # cancelled); fail with a clear message instead of an IndexError below.
    raise ValueError('No file was uploaded. Run this cell again and choose an Excel file.')
input_name = str(next(iter(uploaded)))

# Read an input Excel file into a pandas DataFrame
input_df = pd.read_excel(input_name)

# Create an output Excel file name based on the input file name
output_name = input_name.rsplit(".", 1)[0] + "_output.xlsx"

# Create an output DataFrame: a copy of the input plus a STATUS column.
# The column is object-typed because it later holds both integer status codes
# and error strings. An existing STATUS column in the input is left as-is.
output_df = input_df.copy()
if 'STATUS' not in output_df.columns:
    output_df['STATUS'] = pd.Series(index=output_df.index, dtype=object)

In [None]:
#@title 2. Run the program

# Number of rows (one link per row)
total = len(input_df.index)

# Counters: rows processed and broken links per category
count = 0
notFound = 0
connectionError = 0
timeoutError = 0
otherError = 0

out = display(progress(0, total), display_id=True)

# Status prefixes that mark a link as broken. Anything else is treated as
# working and removed from the output.
# NOTE(review): a 5xx status that survives the GET retry in check_url is
# treated as working here, as before -- confirm this is intended.
broken_prefixes = ('4', 'Timeout', 'Connection', 'Other')

# Index labels of working links; dropped in one pass after the loop rather
# than copying the DataFrame once per row.
working_rows = []

for index, row in output_df.iterrows():
    count += 1
    time.sleep(0.02)
    out.update(progress(count, total))

    # The URL is in the first column; use positional access explicitly
    # because the row is indexed by column labels.
    link = row.iloc[0]
    status = check_url(link)
    label = str(status)

    if not label.startswith(broken_prefixes):
        working_rows.append(index)
        continue

    output_df.loc[index, 'STATUS'] = status
    if label.startswith('4'):
        notFound += 1
    elif label.startswith('Timeout'):
        timeoutError += 1
    elif label.startswith('Connection'):
        connectionError += 1
    else:
        otherError += 1

# Keep only the broken links in the output
output_df = output_df.drop(working_rows)

print('\nAmong ', total, ' links\n')
print('Not Found: ', notFound)
print('Connection Error: ', connectionError)
print('Timeout Error: ', timeoutError)
print('Other Error: ', otherError)

In [None]:
#@title 3. Export an output file

# Write the broken-link report as Excel; if that fails for any reason,
# fall back to CSV so the results are not lost.
try:
    output_df.to_excel(output_name, index=False, engine='xlsxwriter')
except Exception as err:
    # Catch Exception rather than using a bare except so KeyboardInterrupt and
    # SystemExit still propagate, and report why the fallback happened.
    print('Could not write the Excel file ({}); saving as CSV instead.'.format(err))
    output_name = output_name.rsplit('.', 1)[0] + '.csv'
    output_df.to_csv(output_name, index=False)
print('Saved to', output_name)
print('Done!\U0001f600')