# Crawl License Information

In [None]:
# based on https://github.com/madelonhulsebos/gittables/blob/main/gittables/table_annotator.py
    
USER = "" # add user here
TOKEN = "" # add valid token here

import requests
import time
def get_table_license(url: str):
    """Look up the license of the GitHub repository that hosts ``url``.

    The owner and repository name are extracted from ``url`` and the GitHub
    REST endpoint ``/repos/{owner}/{repo}/license`` is queried with the
    module-level ``USER`` / ``TOKEN`` credentials.

    Args:
        url: URL of a file on GitHub, expected to look like
            ``https://github.com/<owner>/<repo>/blob/<ref>/<path>``.

    Returns:
        ``{"license": <license object returned by the API>}`` when the API
        reports a license whose name is not "Other"; ``None`` in every other
        case (no license, unparsable URL, unexpected status code, or any
        exception raised while performing the request).
    """
    # Split on the "/blob/" path segment rather than on the bare substring
    # "blob": an owner or repository whose name contains "blob" would
    # otherwise cut the URL in the wrong place.
    repository_url = url.split("/blob/", 1)[0].rstrip("/")
    parts = repository_url.split("/")
    if len(parts) < 2 or not parts[-2] or not parts[-1]:
        print(f"Could not extract owner and repository from {url!r}")
        return None
    owner, repo = parts[-2], parts[-1]

    # Without a timeout a stalled connection would block the crawl forever.
    request_timeout = 30

    # Loop instead of recursing so repeated rate-limit waits cannot grow the
    # call stack.
    while True:
        try:
            response = requests.get(
                f"https://api.github.com/repos/{owner}/{repo}/license",
                headers={"accept": "application/vnd.github.v3+json"},
                auth=(USER, TOKEN),
                timeout=request_timeout,
            )
            if response.status_code == 200:
                table_license = response.json().get("license")
                # A missing/null license object or the generic "Other"
                # license is treated as "no license".
                if not table_license or table_license["name"] == "Other":
                    return None
                return {"license": table_license}

            if response.status_code == 404:
                # In this case, the repository is not associated with a license.
                return None

            if response.status_code == 403:
                # In this case, we likely reached the API limit.
                reset_at = response.headers.get("X-RateLimit-Reset")
                if reset_at is None:
                    # No reset time advertised, so this is not a plain
                    # rate-limit response.
                    return None
                waiting_time = float(reset_at) - time.time()
                if waiting_time < 0:
                    # The reset time has already passed, so waiting would not
                    # help; something else went wrong.
                    return None
                msg = f"Reached limit on owner {owner}, repo {repo}, waiting for {waiting_time} s"
                print(msg)
                time.sleep(waiting_time)
                continue

            # In this case, we encountered another error.
            code = response.status_code
            msg = f"Ran into another issue, with status code {code}"
            print(msg)
            return None
        except Exception as e:
            # Best effort: a failed lookup is reported and recorded as
            # "no license" rather than aborting the whole crawl.
            msg = f"Ran into exception {e}"
            print(msg)
            return None

In [None]:
# Crawl the license of every URL listed in sqlfiles_urls.csv, checkpointing
# the results so that an interrupted run can be resumed.
import os
import pickle

checkpoint = []
start_idx = 0

# Resume after the highest index already stored in the checkpoint, if any.
if os.path.exists('../../data/license_checkpoint.pckg'):
    # NOTE(review): pickle executes arbitrary code on load; only load
    # checkpoint files produced by this notebook.
    with open('../../data/license_checkpoint.pckg', 'rb') as checkpoint_file:
        checkpoint = pickle.load(checkpoint_file)
    # default=-1 keeps start_idx at 0 when the checkpoint is an empty list.
    start_idx = max((v[0] for v in checkpoint), default=-1) + 1

with open("../../data/sqlfiles_urls.csv") as f:
    for line in f:
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            continue

        # Each line starts with a 6-character index, one separator character,
        # then the URL.
        idx = int(line[0:6])
        if idx < start_idx:
            continue

        # NOTE(review): the URL keeps the trailing newline of the line, as in
        # the checkpoint entries written so far; confirm before stripping it.
        url = line[7:]
        table_license = get_table_license(url)

        license_key = "None"
        if table_license and 'key' in table_license.get('license', {}):
            license_key = table_license['license']['key']
        checkpoint.append([idx, url, license_key])

        if idx % 100 == 0:
            print(f"checkpointed at {idx}")
            with open('../../data/license_checkpoint.pckg', 'wb') as checkpoint_file:
                pickle.dump(checkpoint, checkpoint_file)

# Persist the entries collected after the last periodic checkpoint; without
# this the tail of the crawl would be lost.
with open('../../data/license_checkpoint.pckg', 'wb') as checkpoint_file:
    pickle.dump(checkpoint, checkpoint_file)

# Evaluate License Information

In [50]:
# Count how often each license key occurs in the crawled checkpoint.
import os
from collections import Counter
if os.path.exists('../../data/license_checkpoint.pckg'):
    # Use a context manager so the checkpoint file handle is closed again.
    with open('../../data/license_checkpoint.pckg', 'rb') as checkpoint_file:
        checkpoint = pickle.load(checkpoint_file)
    # Each checkpoint entry is [idx, url, license_key]; tally the keys.
    counter = dict(Counter(v[2] for v in checkpoint))

In [53]:
# license keys ordered from most to least frequent
sorted(counter.items(), key=lambda pair: pair[1], reverse=True)

[('None', 563122),
 ('apache-2.0', 49781),
 ('mit', 38655),
 ('gpl-3.0', 17873),
 ('gpl-2.0', 9520),
 ('agpl-3.0', 5501),
 ('bsd-3-clause', 3638),
 ('ecl-2.0', 3003),
 ('lgpl-3.0', 1094),
 ('mpl-2.0', 571),
 ('unlicense', 555),
 ('lgpl-2.1', 548),
 ('cc0-1.0', 408),
 ('bsd-2-clause', 347),
 ('epl-2.0', 222),
 ('epl-1.0', 193),
 ('eupl-1.2', 145),
 ('osl-3.0', 134),
 ('artistic-2.0', 115),
 ('isc', 98),
 (None, 80),
 ('eupl-1.1', 51),
 ('cc-by-4.0', 42),
 ('ms-pl', 39),
 ('cc-by-sa-4.0', 35),
 ('wtfpl', 33),
 ('afl-3.0', 26),
 ('upl-1.0', 20),
 ('zlib', 11),
 ('bsl-1.0', 11),
 ('postgresql', 8),
 ('ms-rl', 6),
 ('0bsd', 6),
 ('bsd-3-clause-clear', 3),
 ('mit-0', 3),
 ('ncsa', 2),
 ('bsd-4-clause', 1),
 ('odbl-1.0', 1)]

In [52]:
# fraction of checkpoint entries that have a license key (neither None nor 'None')
sum(
    count
    for key, count in counter.items()
    if not (key is None or key == 'None')
) / sum(counter.values())

0.19068660628451461

# Add License Information to Metadata

In [44]:
# load license information
# Use a context manager so the checkpoint file handle is closed again.
with open('../../data/license_checkpoint.pckg', 'rb') as checkpoint_file:
    licenses = pickle.load(checkpoint_file)

# Map url -> license key. Each entry is [idx, url, license_key]; if a url
# occurs more than once, the last entry wins.
licenses_dict = {license_info[1]: license_info[2] for license_info in licenses}

In [45]:
# read the Postgres metadata JSON file into `metadata`
import json
with open('../../data/metadata_postgres.json', 'r') as metadata_file:
    metadata = json.loads(metadata_file.read())

In [46]:
# attach the crawled license key to every metadata entry
for entry in metadata.values():
    info = entry["INFO"]
    # URLs without a crawled license (or with a None key) get the string "None".
    found = licenses_dict.get(info["url"])
    info["license"] = "None" if found is None else found

In [48]:
# write the metadata, now including license keys, to a new JSON file
with open('../../data/metadata_postgres_with_licenses.json', 'w') as output_file:
    json.dump(metadata, output_file)