<a href="https://colab.research.google.com/github/wambugudan/Data-Science-Projects/blob/main/Research_Data_Hashing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import hashlib

In [None]:
# Read the source CSV into the working DataFrame
source_path = "data.csv"
df = pd.read_csv(source_path)

In [None]:
# Define a SHA-256 hashing function
def hash_value(val, *, salt=""):
    """Return the SHA-256 hex digest of ``str(val)``, or ``None`` if missing.

    Args:
        val: Scalar value to pseudonymize. It is converted with ``str()`` and
            encoded as UTF-8 before hashing.
        salt: Optional secret string prepended to the value before hashing.
            Plain SHA-256 of low-entropy identifiers (e-mails, numeric IDs)
            can be reversed by hashing candidate values, so pass a secret
            salt when the hashed data leaves a trusted environment. The
            default ``""`` yields the unsalted digest.

    Returns:
        A 64-character lowercase hex string, or ``None`` when ``val`` is a
        missing value (``None``, NaN, NaT, ``pd.NA``).
    """
    # Missing values must stay missing: hashing them would give every blank
    # cell the same valid-looking digest (the hash of "nan" / "None").
    if pd.api.types.is_scalar(val) and pd.isna(val):
        return None
    return hashlib.sha256((salt + str(val)).encode('utf-8')).hexdigest()

In [None]:
# Add a hashed counterpart for each identifier column; the raw columns are left in df here
pii_to_hash = {"PERSON_ID": "student_id_hash", "EMAIL": "email_hash"}
for source_col, hashed_col in pii_to_hash.items():
    df[hashed_col] = df[source_col].apply(hash_value)

In [None]:
# Build the anonymized frame by removing the PII columns from df.
# errors='ignore' skips any listed column that df does not contain instead of raising.
pii_columns = [
    "PERSON_ID",
    "EMAIL",
    "FULL_NAME",
    "POSTAL_CODE",
    "INSTITUTION_ROLE_SOURCE_DESC",
    "JOB_TITLE",
    "DEPARTMENT",
    "COMPANY",
]
df_anonymized = df.drop(columns=pii_columns, errors='ignore')


In [None]:
# Place the two hash columns first; all other columns keep their current relative order
hash_cols = ["student_id_hash", "email_hash"]
cols = df_anonymized.columns.tolist()
reordered_cols = hash_cols + [name for name in cols if name not in hash_cols]
df_anonymized = df_anonymized[reordered_cols]

In [None]:
# Save the mapping file (secure lookup table).
# df can hold several rows for the same person (the preview output repeats one hash
# across rows), so de-duplicate to keep one row per (hash, PERSON_ID, EMAIL)
# combination; duplicate keys would fan out any join on student_id_hash.
# NOTE(review): this file re-identifies every hash -- store it separately from the
# anonymized output and restrict access.
mapping_df = df[["student_id_hash", "PERSON_ID", "EMAIL"]].drop_duplicates()
mapping_df.to_csv("student_id_mapping.csv", index=False)

In [None]:
# Replace the exact birth date with a coarse age band.
today = pd.to_datetime("today")
birth_dates = pd.to_datetime(df["BIRTH_DATE"])
# A plain year difference overstates age by one until this year's birthday has
# passed, which can push someone into the wrong band (e.g. 17 -> "18-25").
birthday_pending = (birth_dates.dt.month > today.month) | (
    (birth_dates.dt.month == today.month) & (birth_dates.dt.day > today.day)
)
age_years = today.year - birth_dates.dt.year - birthday_pending.astype(int)
# include_lowest keeps age 0 inside "<18" and the open upper edge keeps ages
# above 100 inside "51+"; negative ages (future dates) and missing dates stay NaN.
df_anonymized["age_group"] = pd.cut(
    age_years,
    bins=[0, 17, 25, 35, 50, float("inf")],
    labels=["<18", "18-25", "26-35", "36-50", "51+"],
    include_lowest=True,
)
df_anonymized.drop(columns="BIRTH_DATE", inplace=True)


In [None]:
# Write the anonymized frame to CSV, omitting the index column
output_path = "anonymized_lms_data.csv"
df_anonymized.to_csv(output_path, index=False)

# Preview the leading rows (the notebook renders the cell's last expression)
df_anonymized.head()

Unnamed: 0,student_id_hash,email_hash,INSTITUTION_ROLE,SYSTEM_ROLE,SYSTEM_ROLE_SOURCE_DESC,COURSE_ID,COURSE_NAME,COURSE_NUMBER,COURSE_DESCRIPTION,DESIGN_MODE,...,GRADED_CNT,MANUAL_IND,VALID_IND,FIRST_GRADED_TIME,LAST_GRADED_TIME,MODIFIER_ROLE,MODIFIER_PERSON_ID,TOTAL_SESSION_DURATION,DEVICE_MODE,age_group
0,0d835ef99b2f4719c68fa792419199087546928290b0c3...,8eff7de0ee911d5ef9ce2b8ee86d3fe3dba2a310fcf469...,S,N,N.name,23096,APT1030B,APT1030B_US2023,Applied Computer Science,C,...,,,,,,,,1844142.0,Mobile,
1,0d835ef99b2f4719c68fa792419199087546928290b0c3...,8eff7de0ee911d5ef9ce2b8ee86d3fe3dba2a310fcf469...,S,N,N.name,22157,APT1040C,APT1040C_SS2023,Applied Computer Science,C,...,,False,True,,,,,1844142.0,Mobile,
2,0d835ef99b2f4719c68fa792419199087546928290b0c3...,8eff7de0ee911d5ef9ce2b8ee86d3fe3dba2a310fcf469...,S,N,N.name,21948,IST1020C,IST1020C_FS2022,Information Systems,C,...,,False,True,,,,,1844142.0,Mobile,
3,0d835ef99b2f4719c68fa792419199087546928290b0c3...,8eff7de0ee911d5ef9ce2b8ee86d3fe3dba2a310fcf469...,S,N,N.name,22599,IST1025A,IST1025A_SS2023,Information Systems,C,...,,False,True,,,,,1844142.0,Mobile,
4,0d835ef99b2f4719c68fa792419199087546928290b0c3...,8eff7de0ee911d5ef9ce2b8ee86d3fe3dba2a310fcf469...,S,N,N.name,22601,IST1025C,IST1025C_SS2023,Information Systems,C,...,,,,,,,,1844142.0,Mobile,
