###### Note: In the resulting hashed CSV, there is a clear mapping between data and hashes, i.e. if two original data values are the same, then their hashes will also be the same.

In [1]:
### Import necessary packages
import csv
import hashlib
import binascii
import os
from time import time
import pandas as pd
from IPython.display import display

In [3]:
### Hashing procedure

# The following function is used for hashing:
# hashlib.pbkdf2_hmac(hash_name, password, salt, iterations, dklen=None)
# The function provides PKCS#5 password-based key derivation function 2. It uses HMAC as pseudorandom function.
# The string hash_name is the desired name of the hash digest algorithm for HMAC, e.g. ‘sha1’ or ‘sha256’. Password and salt are interpreted as buffers of bytes.
# Applications and libraries should limit password to a sensible length (e.g. 1024). salt should be about 16 or more bytes from a proper source, e.g. os.urandom().
# The number of iterations should be chosen based on the hash algorithm and computing power. As of 2013, at least 100,000 iterations of SHA-256 are suggested.
# dklen is the length of the derived key. If dklen is None then the digest size of the hash algorithm hash_name is used, e.g. 64 for SHA-512.
# Example:
# dk = hashlib.pbkdf2_hmac('sha256', b'password', b'salt', 100000)
# binascii.hexlify(dk)
# b'0394a2ede332c9a13eb82e9b24631604c31df978b4e2f0fbd2c549944f9d79a5'

# --- User configuration: edit the values below -------------------------------

# Name (optionally including the path) of the original CSV file to read.
IN_PATH = 'wisconsin_data_original.csv'

# Name (optionally including the path) of the destination CSV file to write.
OUT_PATH = 'wisconsin_data_hashed.csv'

# Text encoding of the input file; the output file is opened with the same one.
ENCODING = 'utf-8'

# Header of each column to be hashed, mapped to the hash algorithm to use
# for that column.
HASH_COLUMNS = {
    'radius_mean': 'md5',
    'texture_mean': 'md5',
}

# you don't need to modify the code below, unless it breaks...
def _hash_value(value, method, salt, iterations, encoding):
    """Return the PBKDF2-HMAC digest of ``value`` as '0x' + uppercase hex."""
    derived_key = hashlib.pbkdf2_hmac(method, value.encode(encoding), salt, iterations)
    # hexlify() only ever yields ASCII hex digits, so decode it as ASCII
    # regardless of the encoding used for the CSV files.
    return '0x' + binascii.hexlify(derived_key).decode('ascii').upper()


def main(in_path=None, out_path=None, encoding=None, hash_columns=None,
         iterations=100000, salt=None):
    """Copy a CSV file, replacing the values of selected columns with salted hashes.

    One salt is shared by all data points, so there is a clear mapping between
    data and hashes: if two original values are the same, their hashes are the
    same too. (Drawing a fresh salt per data point would give equal values
    different hashes, which in principle makes the data unusable for machine
    learning.) Blank values are left blank. Progress is printed in 10% steps,
    followed by the total hashing time.

    Parameters
    ----------
    in_path : str, optional
        Input CSV file. Defaults to ``IN_PATH``.
    out_path : str, optional
        Destination CSV file (overwritten). Defaults to ``OUT_PATH``.
    encoding : str, optional
        Text encoding of both files. Defaults to ``ENCODING``.
    hash_columns : dict, optional
        Maps column header -> hash algorithm name. Defaults to ``HASH_COLUMNS``.
    iterations : int, optional
        PBKDF2 iteration count (default 100,000, in line with the hashlib
        recommendation; can be adjusted if needed).
    salt : bytes, optional
        Salt shared by all values. Defaults to 16 fresh random bytes from
        ``os.urandom``; pass a fixed value to get reproducible hashes.

    Raises
    ------
    ValueError
        If the input file has no header row.
    KeyError
        If a column named in ``hash_columns`` is missing from the header.
    """
    # Resolve defaults at call time so later edits to the config constants apply.
    in_path = IN_PATH if in_path is None else in_path
    out_path = OUT_PATH if out_path is None else out_path
    encoding = ENCODING if encoding is None else encoding
    hash_columns = HASH_COLUMNS if hash_columns is None else hash_columns
    if salt is None:
        salt = os.urandom(16) # 16 random bytes suitable for cryptographic use

    print ("hashing in progress")
    print()
    t0 = time() # check hashing time
    with open(in_path, 'rt', encoding=encoding, newline='') as in_file, \
            open(out_path, 'wt', encoding=encoding, newline='') as out_file:
        reader = csv.DictReader(in_file)
        fieldnames = reader.fieldnames
        if not fieldnames:
            raise ValueError("input file {!r} has no header row".format(in_path))
        missing = [column for column in hash_columns if column not in fieldnames]
        if missing:
            # Fail before any output is written rather than on the first data row.
            raise KeyError("column(s) to hash not found in header: {}".format(missing))

        writer = csv.DictWriter(out_file, fieldnames)
        writer.writeheader()

        no_of_rows = sum(1 for _ in reader) # count no. of rows for the progress bar
        in_file.seek(0) # return to the first row
        reader = csv.DictReader(in_file) # reset the reader

        # With a shared salt, equal inputs give equal digests, so each distinct
        # (algorithm, value) pair only needs the expensive derivation once.
        digest_cache = {}
        deciles_shown = 0
        for row_count, row in enumerate(reader, start=1):
            # Report every completed 10%. Deriving the decile from the running
            # count works for fewer than 10 rows and never exceeds 100%.
            deciles_done = row_count * 10 // no_of_rows
            if deciles_done > deciles_shown:
                deciles_shown = deciles_done
                print("{0:.0f}%".format(deciles_done * 10), '', end='', flush=True) # "progress bar"
            for column, method in hash_columns.items():
                value = row[column]
                if value in (None, ""): # don't hash blanks, i.e. leave blanks blank
                    continue
                key = (method, value)
                if key not in digest_cache:
                    digest_cache[key] = _hash_value(value, method, salt, iterations, encoding)
                row[column] = digest_cache[key]
            writer.writerow(row)
    print()
    print()
    print ("hashing time:", round(time()-t0, 3), "s") # print hashing time

# Run the hashing procedure when this code is executed directly (as a script
# or notebook cell); nothing runs if the code is merely imported as a module.
if __name__ == '__main__':
    main()

hashing in progress

10% 20% 30% 40% 50% 60% 70% 80% 90% 100% 

hashing time: 42.123 s


In [4]:
### Compare selected records from original and hashed data

# Read both files through the configured paths and encoding, so the comparison
# always looks at the files named in the configuration rather than at
# hard-coded copies of their names.
original_data = pd.read_csv(IN_PATH, encoding=ENCODING)
hashed_data = pd.read_csv(OUT_PATH, encoding=ENCODING)

print('Original data:')
display(original_data.head())

print('Hashed data')
display(hashed_data.head())

Original data:


Unnamed: 0,diagnosis,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,0,842302,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,0,842517,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,0,84300903,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,0,84348301,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,0,84358402,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


Hashed data


Unnamed: 0,diagnosis,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,0,842302,0xA3B0291683341CAB9802FD482367873B,0x24D86872182A0140FDE3DFB763B44127,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,0,842517,0x8DD36BAFC67F10DBD42D93098365F8AE,0xADC2D96C66C75F0B4E83717268916B07,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,0,84300903,0x4CD574D85CD25022B0937B09A1AEC6B2,0x07A503BD2C2732A4C7E5B810740AC436,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,0,84348301,0x11DC08B340ABE0991FE0DC0252F19C46,0xA69CFBBEBED5EAED8C681AC3611E1806,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,0,84358402,0x00AD156CA846369AC986A9AA7397896E,0x2A53BF777C4B179A01CF3C99B37971FC,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
