In [1]:
# Block 1: Import libraries and load data

import pandas as pd
import json
import re
import ast # A safe way to evaluate strings containing Python literals
from tqdm.notebook import tqdm # For a nice progress bar

# Input CSV of synthetic profiles and the JSON file this notebook produces.
csv_filename, json_filename = "synthetic_profiles.csv", "resistance_data.json"

try:
    # Announce the load, parse the CSV into a DataFrame, then show a preview.
    print(f"Attempting to load {csv_filename}...")
    df = pd.read_csv(csv_filename)
    print(
        "‚úÖ File loaded successfully!",
        "Here's a preview of the first 5 rows:",
        sep="\n",
    )
    display(df.head(5))

except FileNotFoundError:
    # The CSV is not where the notebook expects it; tell the user how to fix that.
    print(
        f"‚ùå ERROR: The file '{csv_filename}' was not found.",
        "Please make sure you have uploaded the file to your Colab session.",
        sep="\n",
    )
except Exception as e:
    # Anything else (parse errors, display problems, ...) is reported, not raised.
    print(f"An unexpected error occurred while loading the file: {e}")

Attempting to load synthetic_profiles.csv...
‚úÖ File loaded successfully!
Here's a preview of the first 5 rows:


Unnamed: 0,mutations,3TC,ABC,ATV/r,AZT,BIC,CAB,D4T,DDI,DOR,...,IDV/r,LEN,LPV/r,NFV,NVP,RAL,RPV,SQV/r,TDF,TPV/r
0,"['75T', '54A', '70T', '44A']",Pot_R,Low_R,Low_R,Pot_R,S,S,High_R,Mid_R,S,...,Low_R,S,Low_R,Low_R,S,S,S,Low_R,Low_R,Low_R
1,"['215C', '83D', '215Y']",S,Pot_R,Pot_R,High_R,S,S,High_R,Low_R,S,...,Pot_R,S,S,Low_R,S,S,S,Pot_R,Pot_R,Low_R
2,"['67K', '105T']",S,S,S,S,S,S,S,S,S,...,S,Mid_R,S,S,S,S,S,S,S,S
3,"['143A', '67Y']",S,S,S,S,S,Pot_R,S,S,S,...,S,Low_R,S,S,S,High_R,S,S,S,S
4,['140R'],S,S,S,S,Pot_R,High_R,S,S,S,...,S,S,S,S,S,Mid_R,S,S,S,S


In [2]:
# Block 2: Prepare the main data structure

# Accumulator for everything extracted from the CSV: one (initially empty)
# bucket per gene. It is filled in memory by a later cell and only then
# written out as a JSON file, so it can become large.
resistance_data = {gene_name: {} for gene_name in ("PR", "RT", "IN")}

print("Initialized the primary data structure:")
print(json.dumps(resistance_data, indent=2))

Initialized the primary data structure:
{
  "PR": {},
  "RT": {},
  "IN": {}
}


In [3]:
# Block 3: Function to determine the gene from a mutation

def get_gene_from_mutation(mutation_str):
    """
    Map a mutation string to the gene bucket it belongs to.

    The first run of digits in ``mutation_str`` is taken as the position and
    mapped with the ranges used throughout this notebook:

        1-99    -> "PR"
        100-560 -> "RT"
        > 560   -> "IN"

    NOTE(review): this treats positions as one combined PR -> RT -> IN
    coordinate. Confirm that this matches how the CSV was generated.

    Args:
        mutation_str: A mutation label such as '75T' or 'M184V'.

    Returns:
        "PR", "RT" or "IN", or None when no gene can be determined: the value
        is not a string, contains no digits, or its position is below 1.
    """
    # Non-string cells (e.g. NaN from pandas) cannot be searched with a regex.
    if not isinstance(mutation_str, str):
        return None

    # Use regular expressions to find the first number in the string.
    match = re.search(r'\d+', mutation_str)
    if not match:
        return None  # Could not find a number in the mutation string

    position = int(match.group(0))

    # Position 0 is not a valid residue; without this guard it would fall
    # through to "IN", which the validation cell then flags as misplaced.
    if position < 1:
        return None

    # Apply the mapping rules
    if position <= 99:
        return "PR"
    if position <= 560:
        return "RT"
    return "IN"

# --- Smoke-test the lookup on one low and one mid-range position ---
low_position_gene = get_gene_from_mutation('75T')
mid_position_gene = get_gene_from_mutation('M184V')
print(f"'75T' belongs to gene: {low_position_gene}")
print(f"'M184V' belongs to gene: {mid_position_gene}")

'75T' belongs to gene: PR
'M184V' belongs to gene: RT


In [4]:
# Block 4: The main processing loop

# Select the drug columns by name rather than by position: everything except
# the 'mutations' column and any unnamed index column ("Unnamed: 0") that
# pandas creates when the CSV was saved together with its index.
# NOTE(review): the preview in cell 1 suggests such a column may be present.
# If so, a positional `df.columns[1:]` would treat 'mutations' as a drug.
drug_columns = [
    column for column in df.columns
    if column != 'mutations' and not str(column).startswith('Unnamed:')
]

# Empty the gene buckets first so that re-running this cell rebuilds the
# structure instead of appending every drug entry a second time.
for gene_bucket in resistance_data.values():
    gene_bucket.clear()

# Use tqdm to create a progress bar for the loop
print("Processing CSV rows to build the JSON structure in memory...")
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    try:
        # ast.literal_eval safely converts a string like "['75T']" into a list: ['75T']
        mutations_list = ast.literal_eval(row['mutations'])

        # The resistant scores are identical for every mutation in this row,
        # so collect them once. 'S' (susceptible) is skipped, and so are
        # missing values, which json.dump would otherwise write as the
        # non-standard token NaN.
        resistant_scores = [
            (drug, row[drug])
            for drug in drug_columns
            if row[drug] != 'S' and not pd.isna(row[drug])
        ]
        if not resistant_scores:
            continue  # Nothing to record for a fully susceptible row

        # Loop through each individual mutation from the list
        for mutation in mutations_list:
            gene = get_gene_from_mutation(mutation)

            if not gene:
                continue

            # If this mutation is new, add its base structure
            mutation_record = resistance_data[gene].setdefault(mutation, {"drugs": []})

            # Append a fresh {"name", "score"} object per resistant drug.
            # This is what causes the data structure to grow.
            mutation_record["drugs"].extend(
                {"name": drug, "score": score} for drug, score in resistant_scores
            )

    except (ValueError, SyntaxError):
        print(f"‚ö†Ô∏è Warning: Could not parse mutations in row {index}. Skipping.")
        continue
    except Exception as e:
        print(f"‚ùå An unexpected error occurred at row {index}: {e}")
        continue

print("\n‚úÖ In-memory data structure built successfully!")

Processing CSV rows to build the JSON structure in memory...


  0%|          | 0/30000 [00:00<?, ?it/s]


‚úÖ In-memory data structure built successfully!


In [5]:
# Block 5: Export the populated dictionary to a JSON file

try:
    print(f"Writing the large in-memory dictionary to {json_filename}...")
    # Serialize straight into the file; the two-space indent keeps the
    # result readable when opened in an editor.
    with open(json_filename, mode='w') as out_handle:
        json.dump(resistance_data, out_handle, indent=2)

    print(
        f"‚úÖ Success! Data has been saved to {json_filename}.",
        "You can download the file from the Colab file browser on the left.",
        sep="\n",
    )

except OSError:
    # IOError is an alias of OSError in Python 3, so this covers the same failures.
    print(f"‚ùå ERROR: Could not write to the file '{json_filename}'.")
except Exception as e:
    # Typically a serialization problem (e.g. a value json cannot encode).
    print(f"An unexpected error occurred during file writing: {e}")

Writing the large in-memory dictionary to resistance_data.json...
‚úÖ Success! Data has been saved to resistance_data.json.
You can download the file from the Colab file browser on the left.


In [6]:
# Block 6: Load the generated JSON for validation

import json
import re

json_filename = "resistance_data.json"

# Fresh result containers for this validation run.
validation_errors = []
validation_warnings = []

# Reset before loading so that any failure, including ones not handled below,
# leaves None here instead of an undefined name or stale data from an
# earlier run that later cells would validate by mistake.
data_to_validate = None

try:
    print(f"Attempting to load {json_filename} for validation...")
    with open(json_filename, 'r') as f:
        data_to_validate = json.load(f)
    print("‚úÖ JSON file loaded successfully. It is well-formed.")

except FileNotFoundError:
    print(f"‚ùå FATAL ERROR: The file '{json_filename}' was not found. Cannot proceed with validation.")
except json.JSONDecodeError as e:
    print(f"‚ùå FATAL ERROR: The file '{json_filename}' is not a valid JSON file. Error: {e}")

Attempting to load resistance_data.json for validation...
‚úÖ JSON file loaded successfully. It is well-formed.


In [7]:
# Block 7: Run the validation logic

def validate_resistance_data(data):
    """
    Validate loaded resistance data against the notebook's schema and rules.

    Checks performed:
      1. The top level is a dict with exactly the keys "PR", "RT" and "IN".
      2. Each gene maps to a dict of mutations, and for every mutation:
         a. its position (first run of digits) lies in the gene's range
            (PR: 1-99, RT: 100-560, IN: > 560);
         b. its content is a dict holding a 'drugs' list whose entries are
            dicts with exactly the keys 'name' and 'score';
         c. no entry carries the score 'S'.

    Findings go into the module-level lists ``validation_errors`` and
    ``validation_warnings``. Both are reset at the start of every call, so a
    repeated run reports only its own findings.

    Args:
        data: The parsed JSON content, or None if loading failed.

    Returns:
        True if no errors were recorded, False otherwise (including when
        ``data`` is None).
    """
    # Rebind the shared result lists so stale errors from an earlier call
    # cannot turn a valid file into a failure.
    global validation_errors, validation_warnings
    validation_errors = []
    validation_warnings = []

    if data is None:
        return False  # Can't validate if loading failed

    # 1. Check top-level keys
    print("1. Checking top-level structure...")
    expected_toplevel_keys = {"PR", "RT", "IN"}
    if not isinstance(data, dict):
        # Valid JSON can also be a list or a scalar, which has no .keys().
        validation_errors.append(f"Top-level value should be a dictionary, but it's a {type(data).__name__}.")
        return False
    if set(data.keys()) != expected_toplevel_keys:
        validation_errors.append(f"Top-level keys are incorrect. Expected {expected_toplevel_keys}, but found {set(data.keys())}.")
        return False  # Stop validation if the basic structure is wrong
    print("   ... Top-level structure is correct.")

    # 2. Iterate through each gene and its mutations
    print("2. Checking each gene's content...")
    for gene, mutations in data.items():
        if not isinstance(mutations, dict):
            validation_errors.append(f"The value for gene '{gene}' should be a dictionary, but it's a {type(mutations).__name__}.")
            continue  # Move to the next gene

        for mutation_str, content in mutations.items():
            # 2a. Check mutation placement
            match = re.search(r'\d+', mutation_str)
            if not match:
                validation_errors.append(f"Could not parse position from mutation '{mutation_str}' in gene '{gene}'.")
                continue

            position = int(match.group(0))
            if gene == "PR" and not (1 <= position <= 99):
                validation_errors.append(f"Misplaced Mutation: '{mutation_str}' (pos {position}) is under 'PR' but should not be.")
            elif gene == "RT" and not (100 <= position <= 560):
                validation_errors.append(f"Misplaced Mutation: '{mutation_str}' (pos {position}) is under 'RT' but should not be.")
            elif gene == "IN" and not (position > 560):
                validation_errors.append(f"Misplaced Mutation: '{mutation_str}' (pos {position}) is under 'IN' but should not be.")

            # 2b. Check internal schema for each mutation. The isinstance
            # guard keeps a null or scalar content from raising here.
            if (
                not isinstance(content, dict)
                or 'drugs' not in content
                or not isinstance(content['drugs'], list)
            ):
                validation_errors.append(f"Mutation '{mutation_str}' is missing a 'drugs' list.")
                continue

            if not content['drugs']:
                validation_warnings.append(f"Mutation '{mutation_str}' has an empty 'drugs' list.")

            for drug_entry in content['drugs']:
                entry_is_dict = isinstance(drug_entry, dict)
                if not entry_is_dict or set(drug_entry.keys()) != {"name", "score"}:
                    validation_errors.append(f"Invalid drug entry under '{mutation_str}': {drug_entry}. Should have 'name' and 'score' keys.")

                # A non-dict entry has no .get(); it is already reported
                # above, so move on instead of raising AttributeError.
                if not entry_is_dict:
                    continue

                # 2c. Check score content
                if drug_entry.get('score') == 'S':
                    validation_errors.append(f"Invalid score 'S' found for drug '{drug_entry.get('name')}' under mutation '{mutation_str}'.")

    print("   ... Gene content check complete.")
    # Return True if the list of errors is empty
    return not validation_errors

# --- Run the validation ---
# Validate whatever the loading cell stored in `data_to_validate` and keep the
# boolean verdict in `is_valid`; the summary report further down branches on it.
is_valid = validate_resistance_data(data_to_validate)

1. Checking top-level structure...
   ... Top-level structure is correct.
2. Checking each gene's content...
   ... Gene content check complete.


In [8]:
# Block 8: Display the final validation report

print("\n-----------------------------")
print("  JSON Validation Summary")
print("-----------------------------")

# Only None means loading failed. An empty JSON object or array is falsy too,
# but it was loaded and validated, so its errors must still be shown.
if data_to_validate is None:
    print("STATUS: üî¥ Validation could not be performed.")
elif not is_valid:
    print("STATUS: üî¥ FAILED - The JSON file has errors.")
    print("\nThe following errors were found:")
    for i, error in enumerate(validation_errors, 1):
        print(f"  {i}. {error}")
else:
    print("STATUS: ‚úÖ PASSED - The JSON file is valid and conforms to the required schema!")

# Warnings are listed regardless of the pass/fail status above.
if validation_warnings:
    print("\nThe following warnings were noted (these are not errors but may be unexpected):")
    for i, warning in enumerate(validation_warnings, 1):
        print(f"  {i}. {warning}")

print("\n-----------------------------")


-----------------------------
  JSON Validation Summary
-----------------------------
STATUS: ‚úÖ PASSED - The JSON file is valid and conforms to the required schema!

-----------------------------
