# Government licenses analysis

In this notebook, we'll analyse a list of licenses gathered from government websites, comparing them with each other and with known licenses. We want to find out:

* Which licenses are standard (i.e. not custom)?
* Are any licenses very similar to each other, suggesting they came from the same source?

In [106]:
import os
import re
import glob
import csv
import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# All inputs live in a "data" directory that sits next to the directory the
# notebook is executed from.
DATA_PATH = os.path.join(
    os.getcwd(),
    "..",
    "data",
)

# CSV file that is read into one dict per government license row.
LICENSES_PATH = os.path.join(DATA_PATH, "gov_licenses.csv")

# Glob pattern selecting the text files of the known licenses.
KNOWN_LICENSES_PATH = os.path.join(DATA_PATH, "licenses", "*.txt")

# Header of the CSV column that holds each license's text.
LICENSE_TEXT_COL = "License text (original)"

def load_csv(path):
    """Read a CSV file into a list of row dicts keyed by the header row.

    Args:
        path: Path of a CSV file whose first line holds the column names.

    Returns:
        list: One dict per data row, mapping column name to the cell's
        string value.
    """
    # newline='' hands newline handling to the csv module, so line breaks
    # embedded in quoted fields come back exactly as stored instead of being
    # rewritten by the file object's universal-newlines translation.
    with open(path, 'r', newline='') as fp:
        return list(csv.DictReader(fp))
    
def load_known_licenses(path):
    """Load every known-license file matched by a glob pattern.

    Each file is split in two at a `---` separator line: the part before it
    is parsed as YAML metadata, the part after it is the license text.

    Args:
        path: Glob pattern selecting the license files.

    Returns:
        list: One dict per file, in sorted path order, holding the parsed
        metadata plus a 'text' key with the whitespace-stripped license body.

    Raises:
        AssertionError: If a file does not split into exactly two parts.
    """
    def _load_license(text):
        # The separator is one non-dash character, `---`, optional word
        # characters and a newline. Requiring a non-dash character in front
        # means a `---` on the very first line of the file is not a split
        # point; that leading character is consumed by the split.
        matches = re.split(r'[^-]---\w*\n', text)

        assert len(matches) == 2, 'Wrong matches'
        metadata, text = matches

        # safe_load only builds plain Python objects. A bare yaml.load()
        # without a Loader can construct arbitrary objects on older PyYAML
        # and is rejected outright by PyYAML >= 6.
        license = yaml.safe_load(metadata)
        license['text'] = text.strip()

        return license

    licenses = []

    # Sorted so the resulting order (and therefore every index derived from
    # it) is the same on each run; glob itself gives no ordering guarantee.
    for filepath in sorted(glob.glob(path)):
        # Context manager closes the handle instead of leaking it.
        with open(filepath, 'r') as fp:
            licenses.append(_load_license(fp.read()))

    return licenses

# Government licenses: one dict per row of the CSV at LICENSES_PATH.
licenses = load_csv(LICENSES_PATH)
# Known licenses: one dict per file matched by KNOWN_LICENSES_PATH.
known_licenses = load_known_licenses(KNOWN_LICENSES_PATH)

FileNotFoundError: [Errno 2] No such file or directory: '/home/vitor/Projetos/okfn/licenses_analysis/src/../data/gov_licenses.csv'

## Prepare data for comparisons

Here we clean and prepare the licenses' data to detect similarities among them.

In [None]:
import sklearn.metrics
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

def get_min_distances(distances, is_pairwise=False):
    """Find, for every row of a distance matrix, its smallest distance.

    Args:
        distances: 2-D array; row ``i`` holds the distances from item ``i``
            to every candidate.
        is_pairwise: Set to True when rows and columns describe the same
            items, so that an item is never reported as its own closest
            match.

    Returns:
        list: One dict per row with the keys 'index' (row number),
        'distance' (smallest distance among the allowed candidates) and
        'min_indexes' (array of the column indexes at that distance).
    """
    result = []

    for index, distance in enumerate(distances):
        candidates = np.full(len(distance), True)

        if is_pairwise:
            # Exclude the item itself, otherwise the closest point would
            # always be itself (distance == 0).
            candidates[index] = False

        min_distance = distance[candidates].min()

        result.append({
            'index': index,
            'distance': min_distance,
            # Restrict the matches to the allowed candidates too: when a
            # duplicate sits at distance 0, the item's own column also
            # equals the minimum and must not be listed as a match.
            'min_indexes': np.where(candidates & (distance == min_distance))[0],
        })

    return result

# Single-step pipeline: TF-IDF vectorisation with English stop words removed,
# L2-normalised rows and sublinear term-frequency scaling.
pipeline = make_pipeline(
    TfidfVectorizer(
        stop_words='english',
        norm='l2',
        sublinear_tf=True,
    ),
)

# Wrap every license text in a small record that remembers its position in
# its source list and which source it came from.
gov_licenses_data = [
    {
        'index': position,
        'type': 'gov',
        'text': row[LICENSE_TEXT_COL],
    }
    for position, row in enumerate(licenses)
]
known_licenses_data = [
    {
        'index': position,
        'type': 'known',
        'text': entry['text'],
    }
    for position, entry in enumerate(known_licenses)
]

# Government records come first, known licenses after them.
prepared_data = np.array([*gov_licenses_data, *known_licenses_data])

# Boolean masks selecting each group out of the combined array (and out of
# the rows of X, which follow the same order).
prepared_data_types = np.array([record['type'] for record in prepared_data])
known_licenses_mask = prepared_data_types == 'known'
gov_mask = prepared_data_types == 'gov'

# Fit the vectoriser on all texts at once so both groups share a vocabulary.
X = pipeline.fit_transform([record['text'] for record in prepared_data])

## Compare government's licenses to known licenses

We're trying to determine how similar the government licenses are to a few known licenses. If they are similar, it can mean that they're using the same license (changing things like the year or the copyright holder names), or that they were based on it.

In [None]:
def generate_df(min_distances, X, y, text_col=None):
    """Tabulate each government license next to its closest known license.

    Note that a later cell of this notebook defines another function with the
    same name, which replaces this one once that cell has run.

    Args:
        min_distances: Iterable of dicts with the keys 'index', 'distance'
            and 'min_indexes' (the shape produced by get_min_distances).
        X: Sequence of government license rows, looked up by 'index'.
        y: Sequence of known licenses, looked up by the first entry of
            'min_indexes'; each one must have a 'title' key.
        text_col: Key under which a row of ``X`` stores the license text.
            Defaults to the module-level LICENSE_TEXT_COL.

    Returns:
        pandas.DataFrame: Rows ordered by ascending distance, with the
        columns 'distance', 'row', 'license_text' and 'closest_match'.
    """
    ROW_OFFSET = 2  # 1 for the header, and 1 because rows start at 1, not 0 like lists
    columns = ['distance', 'row', 'license_text', 'closest_match']

    if text_col is None:
        text_col = LICENSE_TEXT_COL

    pairs_seen = set()
    result = []
    for potential in sorted(min_distances, key=lambda x: x['distance']):
        index = potential['index']
        min_index = potential['min_indexes'][0]
        # The two indexes point into different collections (X and y), so the
        # pair is ordered: (0, 1) and (1, 0) are different matches and both
        # must be kept. Only exact repeats are skipped.
        pair = (int(index), int(min_index))

        if pair in pairs_seen:
            continue

        pairs_seen.add(pair)

        result.append({
            'distance': potential['distance'],
            'row': index + ROW_OFFSET,
            'license_text': X[index][text_col],
            'closest_match': y[min_index]['title'],
        })

    # Explicit columns keep the frame's layout stable even with no rows.
    return pd.DataFrame(result, columns=columns)

# Distance from every government license (rows) to every known license
# (columns). .toarray() yields a plain ndarray; .todense() would yield an
# np.matrix, which newer scikit-learn releases refuse as input.
# NOTE(review): 'dice' is a boolean dissimilarity, so scikit-learn is expected
# to cast the TF-IDF weights to True/False before comparing - confirm that
# ignoring the weights is intended.
distances_to_known_licenses = sklearn.metrics.pairwise.pairwise_distances(
    X[gov_mask].toarray(),
    X[known_licenses_mask].toarray(),
    metric='dice'
)

# Closest known license for each government license.
min_distances = get_min_distances(distances_to_known_licenses)

plt.hist([d['distance'] for d in min_distances])
plt.title("Distribution of distances between government and known licenses")
plt.show()

display(generate_df(min_distances, licenses, known_licenses))

## Comparing government licenses among themselves

We're comparing every pair of government licenses, and selecting the closest matches.

In [None]:
def generate_df(min_distances):
    """Tabulate pairs of government licenses that are closest to each other.

    Entries are visited by ascending distance. A pair is reported once, no
    matter which of its two members it was found from. Row numbers are the
    list index plus 2, and the texts are read from the module-level
    ``licenses`` list.

    Args:
        min_distances: Iterable of dicts with the keys 'index', 'distance'
            and 'min_indexes'.

    Returns:
        pandas.DataFrame: One row per distinct pair with the distance, both
        row numbers and both license texts.
    """
    already_listed = set()
    rows = []

    by_distance = sorted(min_distances, key=lambda entry: entry['distance'])
    for entry in by_distance:
        first = entry['index']
        second = entry['min_indexes'][0]

        # Order-independent key, so (a, b) and (b, a) count as the same pair.
        pair_key = ','.join(sorted((str(first), str(second))))

        if pair_key not in already_listed:
            already_listed.add(pair_key)
            rows.append({
                'distance': entry['distance'],
                'license_1.row': first + 2,
                'license_2.row': second + 2,
                'license_1': licenses[first][LICENSE_TEXT_COL],
                'license_2': licenses[second][LICENSE_TEXT_COL],
            })

    return pd.DataFrame(rows)

# Distance between every pair of government licenses. .toarray() yields a
# plain ndarray; .todense() would yield an np.matrix, which newer
# scikit-learn releases refuse as input.
distances_among_themselves = sklearn.metrics.pairwise.pairwise_distances(
    X[gov_mask].toarray(),
    metric='dice'
)

# is_pairwise=True keeps a license from being matched with itself.
min_distances = get_min_distances(distances_among_themselves, is_pairwise=True)
plt.hist([d['distance'] for d in min_distances])
plt.title("Distribution of the licenses texts to their most similar ones")
plt.show()

display(generate_df(min_distances))