In [1]:
import hashlib
import os
import time

The provenance of the files is verified by comparing the MD5 hashes of the user-generated files against those of the reference files.

The structure of this file follows the order in which the intermediate files are generated by the **preprocess.ipynb** notebook located in the same folder as this notebook.

In [2]:
# Folder in which the files generated during the last run are stored.
PROVENANCE_FOLDER_BASE = "../data/produced/provenance/"
# Sub-folder holding the files generated by the original research.
REFERENCE_FOLDER_BASE = f"{PROVENANCE_FOLDER_BASE}reference/"
# strptime layout of the timestamp embedded in produced file names
# (day-month-year-hour-minute-second).
TIMESTAMP_FORMAT = "%d-%m-%Y-%H-%M-%S"

The files generated by the original research are located in **REFERENCE_FOLDER_BASE** and the files generated during the last run are stored in **PROVENANCE_FOLDER_BASE**.

During the checks, the latest user-generated files are compared against the reference files. Locating the latest file depends on the naming convention; see README.md at the root of the project. **Please ensure there are no additional files matching the prefixes of the generated files, as locating the latest produced file will otherwise fail.**

In [3]:
# helper methods to locate the latest file, calculate the MD5 hash, check whether the hashes match

def findFilesStartingWith(prefix):
    """Return the names of the entries in PROVENANCE_FOLDER_BASE that start with ``prefix``.

    Only the top level of the folder is listed (``os.listdir``); the order of
    the result is whatever order the listing yields.
    """
    return [
        entry
        for entry in os.listdir(PROVENANCE_FOLDER_BASE)
        if entry.startswith(prefix)
    ]

def extractTimestams(files):
    """Parse the timestamp embedded in each file name.

    Each name is split on ``"--"``; the second piece, minus its last four
    characters (the ``.csv`` suffix), is parsed with TIMESTAMP_FORMAT.

    Returns a list of ``time.struct_time`` values, one per input name and in
    the same order as ``files``.
    """
    def parseTimestamp(name):
        # "<prefix>--<timestamp>.csv" -> "<timestamp>"
        rawStamp = name.split("--")[1][:-4]
        return time.strptime(rawStamp, TIMESTAMP_FORMAT)

    return [parseTimestamp(name) for name in files]

def findLatestFileStartingWith(prefix):
    """Return the matching file name with the most recent embedded timestamp.

    Returns ``None`` when no file starts with ``prefix``. If several files
    share the newest timestamp, the first one in listing order is returned.
    """
    candidates = findFilesStartingWith(prefix)
    stamps = extractTimestams(candidates)
    if not stamps:
        return None
    newest = max(stamps)
    # stamps is index-aligned with candidates.
    return candidates[stamps.index(newest)]

def calculateMd5HasForFile(file):
    """Compute the MD5 hash of a file's binary content.

    Args:
        file: path of the file to hash.

    Returns:
        The ``hashlib.md5`` hash object fed with the whole file content;
        call ``.hexdigest()`` on it to obtain the hex string.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    md5Hash = hashlib.md5()
    # The context manager guarantees the handle is closed, even on errors.
    with open(file, "rb") as fileHandle:
        # Hash in 64 KiB chunks so large files are not loaded into memory at once.
        for chunk in iter(lambda: fileHandle.read(65536), b""):
            md5Hash.update(chunk)
    return md5Hash

def guardFileFound(file, fileName):
    """Fail with a descriptive error when no produced file was located.

    Args:
        file: the located file name, or ``None`` when nothing was found.
        fileName: the prefix that was searched for (used in the message).

    Raises:
        FileNotFoundError: if ``file`` is ``None``. This is a subclass of
            ``Exception``, so callers catching ``Exception`` are unaffected.
    """
    if file is None:
        raise FileNotFoundError(
            "No produced file for prefix '" + fileName + "' was found. "
            "Have you run the preprocess.ipynb successfully?"
        )

def compareUserAndReferenceFiles(fileName):
    """Compare the latest produced file for ``fileName`` with its reference file.

    The latest produced file starting with ``fileName`` is looked up in
    PROVENANCE_FOLDER_BASE, the reference file is
    ``REFERENCE_FOLDER_BASE + fileName + '.csv'``, and their MD5 hex digests
    are printed and compared.

    Args:
        fileName: file-name prefix (without timestamp and ``.csv`` suffix).

    Raises:
        Exception: if no produced file is found (raised by ``guardFileFound``)
            or if the two hashes differ.
    """
    # latest user produced file
    latestFile = findLatestFileStartingWith(fileName)
    guardFileFound(latestFile, fileName)
    producedFile = PROVENANCE_FOLDER_BASE + latestFile
    producedDigest = calculateMd5HasForFile(producedFile).hexdigest()
    print("Latest produced file:", latestFile)

    # reference file
    referenceFile = REFERENCE_FOLDER_BASE + fileName + '.csv'
    print("Reference file:", referenceFile)
    referenceDigest = calculateMd5HasForFile(referenceFile).hexdigest()

    print("Latest produced file hash:", producedDigest)
    print("Reference file hash:      ", referenceDigest)

    # match validation
    if producedDigest == referenceDigest:
        print("[OK]  HASHES MATCH, files are the same")
    else:
        print("[ERROR] HASHES DO NOT MATCH!")
        # Report the paths that were actually read so the user can locate the files.
        raise Exception("Hash mismatch - compare the files: " + producedFile + " and " + referenceFile + " for differences")

## Social media users yearly aggregation

In [4]:
# Compare the latest produced file with this prefix against its reference
# copy; an exception is raised if no produced file exists or the hashes differ.
fileName = "social-media-users-year-aggregated"
compareUserAndReferenceFiles(fileName)

Latest produced file: social-media-users-year-aggregated--19-04-2021-20-28-25.csv
Reference file: ../data/produced/provenance/reference/social-media-users-year-aggregated.csv
Latest produced file hash: 7c6a3d4a1559e3e2f70ee2dc30053c8d
Reference file hash:       7c6a3d4a1559e3e2f70ee2dc30053c8d
[OK]  HASHES MATCH, files are the same


## Social media after final preprocessing

In [5]:
# Compare the latest produced file with this prefix against its reference
# copy; an exception is raised if no produced file exists or the hashes differ.
fileName = "social-media-users-final-preprocessed"
compareUserAndReferenceFiles(fileName)

Latest produced file: social-media-users-final-preprocessed--19-04-2021-20-28-25.csv
Reference file: ../data/produced/provenance/reference/social-media-users-final-preprocessed.csv
Latest produced file hash: a2404d696bbba9d043ec946353224a10
Reference file hash:       a2404d696bbba9d043ec946353224a10
[OK]  HASHES MATCH, files are the same


## Suicide rates without missing values

In [None]:
# Compare the latest produced file with this prefix against its reference
# copy; an exception is raised if no produced file exists or the hashes differ.
fileName = "suicide-rates-sanitized"
compareUserAndReferenceFiles(fileName)

## Suicide rates aggregated by year

In [None]:
# Compare the latest produced file with this prefix against its reference
# copy; an exception is raised if no produced file exists or the hashes differ.
fileName = "suicide-rates-aggregated-by-year"
compareUserAndReferenceFiles(fileName)

## The 2 preprocessed datasets merged into 1

In [None]:
# Compare the latest produced file with this prefix against its reference
# copy; an exception is raised if no produced file exists or the hashes differ.
fileName = "merged-datasets"
compareUserAndReferenceFiles(fileName)

## The final dataset

In [None]:
# Compare the latest produced file with this prefix against its reference
# copy; an exception is raised if no produced file exists or the hashes differ.
fileName = "final-dataset"
compareUserAndReferenceFiles(fileName)

If no exceptions were thrown then all of the files match and the results should be reproducible correctly.