In [33]:
import os
from openai import OpenAI
import io
import zipfile
import json
import re
import msal
import requests
import random
import pprint
from dotenv import load_dotenv

# Load File to Local Path

In [2]:
load_dotenv()

# Azure application client info, read from the environment (.env file) so
# that no credentials are hard-coded in the notebook.
client_id = os.getenv('CLIENT_ID')
client_secret = os.getenv('CLIENT_SECRET')  # loaded, but not used by the public-client flow below
tenant_id = os.getenv('TENANT_ID')
# redirect_uri = 'https://login.microsoftonline.com/common/oauth2/nativeclient'

# Fail early with a clear message instead of building an authority URL that
# contains the literal text "None".
missing_settings = [
    name
    for name, value in (('CLIENT_ID', client_id), ('TENANT_ID', tenant_id))
    if not value
]
if missing_settings:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_settings)}"
    )

# Get access token
authority = f'https://login.microsoftonline.com/{tenant_id}'
scopes = ['Files.Read', 'User.Read', 'Files.ReadWrite']
app = msal.PublicClientApplication(client_id, authority=authority)

# Request token (opens an interactive sign-in)
result = app.acquire_token_interactive(scopes=scopes)

# Without this check a failed sign-in leaves `headers` undefined and the
# following cells fail with an unrelated NameError.
if "access_token" not in result:
    raise RuntimeError(
        "Failed to acquire a Microsoft Graph access token: "
        f"{result.get('error')} - {result.get('error_description')}"
    )

access_token = result["access_token"]
headers = {'Authorization': f'Bearer {access_token}'}

In [3]:
# Access "modeldb-code-analysis/modeldb-zips" and build a mapping from model
# code (file name without extension) to the OneDrive item id.
endpoint = 'https://graph.microsoft.com/v1.0/me/drive/root:/modeldb-code-analysis/modeldb-zips:/children'
file_code_id = {}

# Microsoft Graph returns large folders one page at a time; each page carries
# an '@odata.nextLink' URL until the listing is exhausted.
next_page_url = endpoint
while next_page_url:
    response = requests.get(next_page_url, headers=headers)
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        break

    page = response.json()
    files_in_subfolder = page.get('value', [])
    for file in files_in_subfolder:
        # splitext removes the real extension whatever its length, unlike a
        # fixed [:-4] slice.
        file_code = os.path.splitext(file['name'])[0]
        file_code_id[file_code] = file['id']
        print(f"File Name: {file['name']} - File ID: {file['id']}")

    next_page_url = page.get('@odata.nextLink')

File Name: 100603.zip - File ID: 0153IT3SXAD2VJTPSQSBHJZPSW2Z3V7UWE
File Name: 101629.zip - File ID: 0153IT3SQMDXPPCZU7MVCJEGVIB5HNOL65
File Name: 102279.zip - File ID: 0153IT3STKCQUSAZDFEFB3ZQGOV23D5CCQ
File Name: 102288.zip - File ID: 0153IT3SQ5Z6DBLC2Q5ZFYRHH6PG2V5FUD
File Name: 102871.zip - File ID: 0153IT3SUCWBE2OAZMMNHZ27YBIIARMBQ7
File Name: 10360.zip - File ID: 0153IT3SRCF5XUXMLTW5DJZFB2N33CR7IQ
File Name: 104623.zip - File ID: 0153IT3SQYAAKUMXRGFNEK5JWYMTZKJLUJ
File Name: 105383.zip - File ID: 0153IT3SR4S23XHBQROVF3L6OAI44XGDGJ
File Name: 105385.zip - File ID: 0153IT3SVGHNI65UB6ZJFJJE2GI4J6HBFR
File Name: 105501.zip - File ID: 0153IT3SRJ6JKY7FNYPRFKI7LCS2O5Y5G3
File Name: 105506.zip - File ID: 0153IT3SVKXQK6MVTF4JGIGELH6AIP4SYK
File Name: 105507.zip - File ID: 0153IT3SS7FBR5Q6E5EFBYBOXVF72IYWQV
File Name: 105528.zip - File ID: 0153IT3SX2FAFPB3YOEZFK5F77JA5WONHI
File Name: 106551.zip - File ID: 0153IT3STYE3LA2MX43FEYSOQJJ7KNC7YK
File Name: 106891.zip - File ID: 0153IT3SQSATX6Y5

In [5]:
# Shuffle the known model codes with a fixed seed so the same sample is drawn
# on every run.
random.seed(20)
file_code_list = list(file_code_id.keys())
random.shuffle(file_code_list)

sample_folder = '/Users/tessakong/Desktop/CodeAnalysis/sample5'
os.makedirs(sample_folder, exist_ok=True)

# Download and unpack the first five models of the shuffled list.
for code in file_code_list[:5]:
    file_id = file_code_id[code]
    zip_filename = f"{code}.zip"
    local_path = os.path.join(sample_folder, zip_filename)
    extract_path = os.path.join(sample_folder, code)

    # Fetch the raw archive bytes for this drive item.
    download_endpoint = f'https://graph.microsoft.com/v1.0/me/drive/items/{file_id}/content'
    response = requests.get(download_endpoint, headers=headers)

    if response.status_code != 200:
        print(f"Failed to download {zip_filename}")
        continue

    # Persist the archive next to its extraction folder.
    with open(local_path, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded {zip_filename} to {local_path}")

    # Unpack into a folder named after the model code.
    with zipfile.ZipFile(local_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"Unzip {zip_filename} to {extract_path}")

Downloaded 114665.zip to /Users/tessakong/Desktop/CodeAnalysis/sample5/114665.zip
Unzip 114665.zip to /Users/tessakong/Desktop/CodeAnalysis/sample5/114665
Downloaded 118434.zip to /Users/tessakong/Desktop/CodeAnalysis/sample5/118434.zip
Unzip 118434.zip to /Users/tessakong/Desktop/CodeAnalysis/sample5/118434
Downloaded 114424.zip to /Users/tessakong/Desktop/CodeAnalysis/sample5/114424.zip
Unzip 114424.zip to /Users/tessakong/Desktop/CodeAnalysis/sample5/114424
Downloaded 105383.zip to /Users/tessakong/Desktop/CodeAnalysis/sample5/105383.zip
Unzip 105383.zip to /Users/tessakong/Desktop/CodeAnalysis/sample5/105383
Downloaded 113949.zip to /Users/tessakong/Desktop/CodeAnalysis/sample5/113949.zip
Unzip 113949.zip to /Users/tessakong/Desktop/CodeAnalysis/sample5/113949


# File Screening

In [28]:
def traverse_folder(path, score_metric, acceptable_extensions, pattern_mapping):
    """Recursively score every accepted file below ``path``.

    Parameters:
      - path (str): Directory to scan.
      - score_metric (list): Output list; one ``(score, file_path)`` tuple is
        appended for each accepted file.
      - acceptable_extensions (tuple of str): Lower-case file-name suffixes
        that qualify a file for scoring.
      - pattern_mapping (dict): Compiled regex patterns as keys; only the
        keys are used here.

    A file's score is the number of patterns that match anywhere in its text.
    Returns None; results are delivered through ``score_metric``.
    """
    for name in os.listdir(path):
        candidate = os.path.join(path, name)

        # Directories are announced and descended into before anything else.
        if os.path.isdir(candidate):
            print(f'Traverse folder: {candidate}')
            traverse_folder(candidate, score_metric, acceptable_extensions, pattern_mapping)
            continue

        # Files with an unlisted extension are ignored entirely.
        if not name.lower().endswith(acceptable_extensions):
            continue

        # Undecodable bytes are dropped so binary-ish files cannot abort the scan.
        with open(candidate, 'r', encoding='utf-8', errors='ignore') as handle:
            text = handle.read()

        hits = sum(1 for rule in pattern_mapping.keys() if rule.search(text))
        score_metric.append((hits, candidate))
                                      


def get_score_metric(model_name, json_file_path, sample_folder = '/Users/tessakong/Desktop/CodeAnalysis/sample5'):
    """Score every accepted file of one extracted model against the rule set.

    Parameters:
      - model_name (str): Name of the model's folder inside ``sample_folder``.
      - json_file_path (str): JSON file whose keys are regex pattern strings.
      - sample_folder (str): Directory that contains the extracted models.

    Returns:
      - list of ``(score, file_path)`` tuples, in the order the files were
        visited by ``traverse_folder`` (not sorted by score).
    """
    with open(json_file_path, 'r', encoding='utf-8') as rules_file:
        rules = json.load(rules_file)

    # Compile each rule once up front; the mapped values are carried along
    # unchanged.
    compiled_rules = {}
    for expression, replacement in rules.items():
        compiled_rules[re.compile(expression)] = replacement

    # File types considered worth scoring.
    acceptable_extensions = ('.py', '.cpp', '.java', '.m', '.txt', '.h', '.data', 
                             '.html', '.c', '.mod', '.g', '.p', ".ode", ".html")  # Adjust as needed

    collected = []
    traverse_folder(f'{sample_folder}/{model_name}', collected, acceptable_extensions, compiled_rules)
    return collected

def concat_files(code, sample_folder, file_path_list, topK):
    """Write the given files, one after another, into a single text file.

    Parameters:
      - code (str): Model code, used in the output file name.
      - sample_folder (str): Base folder; output goes to its ``match_file``
        sub-folder, which is created when missing.
      - file_path_list (list of str): Files to concatenate, in order.
      - topK (int): Only used to label the output file name.

    Each source file is preceded by a ``=== path ===`` header line and
    followed by a blank separator. Returns None.
    """
    target_dir = f'{sample_folder}/match_file'
    os.makedirs(target_dir, exist_ok=True)
    output_file_path = f'{target_dir}/{code}_top{topK}.txt'

    with open(output_file_path, 'w', encoding='utf-8') as combined:
        for source_path in file_path_list:
            combined.write(f'=== {source_path} ===\n')
            # Undecodable bytes are dropped rather than aborting the merge.
            with open(source_path, 'r', encoding='utf-8', errors='ignore') as source:
                body = source.read()
            combined.write(body)
            combined.write('\n\n')

    print(f"Concatenated file for model {code} have been saved to {output_file_path}")

In [30]:
json_file_path = "/Users/tessakong/Desktop/CodeAnalysis/manual_classifier_rules.json"
# Fraction of each model's scored files to keep.
proportion = 0.5

for code in file_code_list[:5]:
    print(f'==============================Processing model {code}==============================')
    scores = get_score_metric(code, json_file_path, sample_folder)

    # get_score_metric returns files in traversal order, so rank them by score
    # (highest first) before slicing; otherwise the "top K" would just be the
    # first K files visited. sorted() is stable, so ties keep traversal order.
    ranked_scores = sorted(scores, key=lambda scored: scored[0], reverse=True)

    topK = int(proportion * len(ranked_scores))
    print(topK)
    if topK == 0:
        print(f"topK is 0 for model {code}, ignore")
        continue
    file_path_list = [file_path for _, file_path in ranked_scores[:topK]]
    concat_files(code, sample_folder, file_path_list, topK)
        

Traverse folder: /Users/tessakong/Desktop/CodeAnalysis/sample5/114665/plast
6
Concatenated file for model 114665 have been saved to /Users/tessakong/Desktop/CodeAnalysis/sample5/match_file/114665_top6.txt
Traverse folder: /Users/tessakong/Desktop/CodeAnalysis/sample5/118434/KulviciusEtAl2008
3
Concatenated file for model 118434 have been saved to /Users/tessakong/Desktop/CodeAnalysis/sample5/match_file/118434_top3.txt
Traverse folder: /Users/tessakong/Desktop/CodeAnalysis/sample5/114424/LampreyNMDAosc
1
Concatenated file for model 114424 have been saved to /Users/tessakong/Desktop/CodeAnalysis/sample5/match_file/114424_top1.txt
Traverse folder: /Users/tessakong/Desktop/CodeAnalysis/sample5/105383/GPCv1.0.8
Traverse folder: /Users/tessakong/Desktop/CodeAnalysis/sample5/105383/GPCv1.0.8/cvode
29
Concatenated file for model 105383 have been saved to /Users/tessakong/Desktop/CodeAnalysis/sample5/match_file/105383_top29.txt
Traverse folder: /Users/tessakong/Desktop/CodeAnalysis/sample5/1139

# Metadata Generation (currents)

In [31]:
def api_request(url, method = 'GET', headers=None, params=None, json_data=None):
    '''
    Send a request to the ModelDB API and return the decoded JSON body.

    Parameters:
      - url (str): The API endpoint path, relative to https://modeldb.science/
        (a leading slash is optional).
      - method (str): The HTTP method, 'GET' or 'POST' (case-insensitive).
        Default is 'GET'.
      - headers (dict): Optional headers for the request.
      - params (dict): Optional URL parameters (used for GET requests).
      - json_data (dict): Optional JSON data for POST requests.

      Returns:
      - response (dict | list | None): Parsed JSON response from the API, or
        None when the request fails or the body is not valid JSON (the error
        is printed).

      Raises:
      - ValueError: If ``method`` is neither 'GET' nor 'POST'.
    '''
    # Validate the method before entering the try block: otherwise this
    # ValueError is caught by the JSON handler below and misreported as a
    # decode error.
    verb = method.upper()
    if verb not in ('GET', 'POST'):
        raise ValueError("Unsupported HTTP method: {}".format(method))

    # Strip a leading slash so the joined URL never contains '//' after the host.
    full_url = "https://modeldb.science/" + url.lstrip('/')
    try:
        if verb == 'GET':
            response = requests.get(full_url, headers=headers, params=params)
        else:
            response = requests.post(full_url, headers=headers, json=json_data)

        # Check for HTTP errors
        response.raise_for_status()

        # Parse JSON response
        return response.json()

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except requests.exceptions.RequestException as req_err:
        print(f"Error occurred: {req_err}")
    except ValueError as json_err:
        print(f"JSON decode error: {json_err}")

    return None


In [32]:
# Root path of the API; api_request prepends the https://modeldb.science/ host.
cat_url = "/api/v1/"
# The root endpoint's response is printed below (a list of category names in
# the recorded output).
metadata_categories = api_request(cat_url, method = 'GET')
print(metadata_categories)

['celltypes', 'currents', 'genes', 'modelconcepts', 'models', 'modeltypes', 'papers', 'receptors', 'regions', 'simenvironments', 'transmitters']


In [36]:
# Count how many entries each metadata category returns.
count_metadata = {}
# `or []` keeps the cell from crashing when the category request itself failed.
for category in (metadata_categories or []):
    temp_url = cat_url + category
    category_items = api_request(temp_url, method = 'GET')
    # api_request returns None on any failure; len(None) would raise TypeError.
    if category_items is None:
        print(f"Skipping {category}: request failed")
        continue
    count_metadata[category] = len(category_items)
pprint.pprint(count_metadata)

# Fetch the response of the currents "name" endpoint.
model_url = "/api/v1/currents/name"
model_current_list = api_request(model_url, method = 'GET')

{'celltypes': 278,
 'currents': 64,
 'genes': 62,
 'modelconcepts': 214,
 'models': 1878,
 'modeltypes': 21,
 'papers': 80975,
 'receptors': 60,
 'regions': 47,
 'simenvironments': 174,
 'transmitters': 25}


In [39]:
api_key = os.getenv('API_KEY')
organization=os.getenv('ORGANIZATION')

match_file_folder = '/Users/tessakong/Desktop/CodeAnalysis/sample5/match_file'
# Maximum number of characters of each concatenated file sent to the model.
MAX_CONTENT_CHARS = 5000

# Prompt text recorded alongside the results in the output JSON.
# NOTE(review): this wording differs from the prompt actually sent in the loop
# below (which adds "just list them ... DO NOT analyze" and an example) —
# confirm which version should be recorded.
save_prompt = "You are a neuroscience expert specializing in ion channel and current analysis. Given the following content: please identify the most relevant ion currents from the following list. The list includes: ['I Chloride', 'I Na,p', 'I Na,t', 'I L high threshold', 'I N', 'I T low threshold', 'I p,q', 'I A', 'I K', 'I K,leak', 'I M', 'I h', 'I Cl,Ca', 'I K,Ca', 'I CNG', 'I CAN', 'I Sodium', 'I Calcium', 'I Mixed', 'I Potassium', 'I A, slow', 'ATP-sensitive potassium current', 'I_KHT', 'I_KLT', 'I_HERG', 'Late Na', 'Na/Ca exchanger', 'I_Na,Ca', 'I_SERCA', 'KCNQ1', 'I_Ks', 'I Krp', 'I R', 'I Q', 'I_K,Na', 'Na/K pump', 'I_AHP', 'I ANO2', 'I trp', 'I Cl, leak', 'I Na, leak', 'I Ca,p', 'I_KD', 'Osmosis-driven water flux', 'KCC2', 'NKCC1', 'Ca pump', 'I_HCO3', 'Channelrhodopsin (ChR)', 'Kir', 'I MI', 'I TRPM8', 'Kir2 leak', 'I Na, slow inactivation', 'Na+-glutamate transporter', 'IK Bkca', 'I Ca SOCC', 'IK Skca', 'KCC1', 'NBC', 'I C', 'TASK channel', 'Kir, inactivating', 'I TRPM4'].\n\nBased on this content, please provide the most relevant ion current(s), and list them separated by commas."

output_json_path = os.path.join(sample_folder, "prompt_test.json")

# One block per prompt: the prompt text plus one result entry per model file.
new_prompt_block = {
    "prompt": save_prompt,
    "results": []
}

client = OpenAI(api_key=api_key, organization=organization) 

# Sorted so the result order does not depend on the file system's listing order.
for entry in sorted(os.listdir(match_file_folder)):
    full_path = os.path.join(match_file_folder, entry)
    # Only regular .txt files are model inputs; anything else in the folder
    # (sub-directories, OS metadata files) would fail the strict UTF-8 read.
    if not (os.path.isfile(full_path) and entry.lower().endswith('.txt')):
        continue
    # The model code is the part of the file name before the first underscore.
    code = entry.split('_')[0]

    with open(full_path, 'r', encoding='utf-8') as file:
        file_content = file.read()
    # len() of a str counts characters, not model tokens.
    print(f'===================model {code} has {len(file_content)} characters===================')
    file_content = file_content[:MAX_CONTENT_CHARS]
    prompt = (
        f"You are a neuroscience expert specializing in ion channel and current analysis. "
        f"Given the following content:\n\n{file_content}\n\n"
        f"Please identify the most relevant ion currents from the following list. "
        f"The list includes: ['I Chloride', 'I Na,p', 'I Na,t', 'I L high threshold', 'I N', 'I T low threshold', 'I p,q', "
        f"'I A', 'I K', 'I K,leak', 'I M', 'I h', 'I Cl,Ca', 'I K,Ca', 'I CNG', 'I CAN', 'I Sodium', 'I Calcium', "
        f"'I Mixed', 'I Potassium', 'I A, slow', 'ATP-sensitive potassium current', 'I_KHT', 'I_KLT', 'I_HERG', "
        f"'Late Na', 'Na/Ca exchanger', 'I_Na,Ca', 'I_SERCA', 'KCNQ1', 'I_Ks', 'I Krp', 'I R', 'I Q', 'I_K,Na', "
        f"'Na/K pump', 'I_AHP', 'I ANO2', 'I trp', 'I Cl, leak', 'I Na, leak', 'I Ca,p', 'I_KD', 'Osmosis-driven water flux', "
        f"'KCC2', 'NKCC1', 'Ca pump', 'I_HCO3', 'Channelrhodopsin (ChR)', 'Kir', 'I MI', 'I TRPM8', 'Kir2 leak', "
        f"'I Na, slow inactivation', 'Na+-glutamate transporter', 'IK Bkca', 'I Ca SOCC', 'IK Skca', 'KCC1', 'NBC', "
        f"'I C', 'TASK channel', 'Kir, inactivating', 'I TRPM4'].\n\n"
        f"Based on this content, please provide the most relevant ion current(s), just list them separated by commas, DO NOT analyze, "
        f"This is an example: current 1, current 2,..."
    )

    chat_completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
    )

    # Take the text of the first returned choice.
    response_dict = chat_completion.to_dict()
    metadata = response_dict["choices"][0]["message"]["content"].strip()
    print("Generated metadata:", metadata)

    new_result = {
        "file_content": file_content,
        "model_code": code,
        "metadata": metadata
    }
    new_prompt_block["results"].append(new_result)

# Overwrites any previous results file with this run's prompt block.
with open(output_json_path, 'w', encoding='utf-8') as json_file:
    json.dump(new_prompt_block, json_file, ensure_ascii=False, indent=4)

Generated metadata: I Na,t, I T low threshold, I K, I K,leak, I M, I h
Generated metadata: I Sodium, I Calcium, Na/Ca exchanger, Na/K pump
Generated metadata: I K,Ca, I K, I K,leak, I Calcium
Generated metadata: I Na,p, I Na,t, I K,leak, I K, I Calcium
Generated metadata: I K


# Old version: rule-based file filtering (superseded by the File Screening section above)

In [None]:
def filter_model(model_name, json_file_path, sample_folder):
    """Concatenate the files of one model that match at least one rule.

    Parameters:
      - model_name (str): Name of the model's folder inside ``sample_folder``.
      - json_file_path (str): JSON file whose keys are regex pattern strings.
      - sample_folder (str): Directory that contains the extracted models.

    Only files inside sub-directories of the model folder are examined; files
    sitting directly in the model folder are not. The result is written to
    ``<sample_folder>/match_file/<model_name>_mathched_file.txt``. Returns None.
    """
    extract_folder = f'{sample_folder}/{model_name}'
    output_file_folder = f'{sample_folder}/match_file'
    os.makedirs(output_file_folder, exist_ok=True)

    with open(json_file_path, 'r', encoding='utf-8') as rules_file:
        rules = json.load(rules_file)

    # Only the pattern side of each rule is needed for filtering.
    compiled_rules = [re.compile(expression) for expression, _ in rules.items()]

    # File types considered worth inspecting.
    acceptable_extensions = ('.py', '.cpp', '.java', '.m', '.txt', '.h', '.data', '.html', '.c', '.mod', '.g', '.p')  # Adjust as needed

    selected_paths = []
    for top_entry in os.listdir(extract_folder):
        top_path = os.path.join(extract_folder, top_entry)
        if not os.path.isdir(top_path):
            continue
        for current_dir, _, filenames in os.walk(top_path):
            for filename in filenames:
                if not filename.lower().endswith(acceptable_extensions):
                    continue
                candidate = os.path.join(current_dir, filename)
                with open(candidate, 'r', encoding='utf-8', errors='ignore') as handle:
                    text = handle.read()
                # A single matching rule is enough to keep the file.
                if any(rule.search(text) for rule in compiled_rules):
                    selected_paths.append(candidate)

    # Write every kept file, each preceded by a header naming its path.
    output_file_path = f'{output_file_folder}/{model_name}_mathched_file.txt'
    with open(output_file_path, 'w', encoding='utf-8') as combined:
        for source_path in selected_paths:
            combined.write(f'=== {source_path} ===\n')
            with open(source_path, 'r', encoding='utf-8', errors='ignore') as source:
                body = source.read()
            combined.write(body)
            combined.write('\n\n')

    print(f"Files matching the rules have been saved to {output_file_path}")


def concat_model(model_name, sample_folder):
    """Concatenate every accepted file of one model, without rule filtering.

    Parameters:
      - model_name (str): Name of the model's folder inside ``sample_folder``.
      - sample_folder (str): Directory that contains the extracted models.

    Only files inside sub-directories of the model folder are collected; files
    sitting directly in the model folder are not. The result is written to
    ``<sample_folder>/raw_file/<model_name>_raw_file.txt``. Returns None.
    """
    extract_folder = f'{sample_folder}/{model_name}'
    output_file_folder = f'{sample_folder}/raw_file'
    os.makedirs(output_file_folder, exist_ok=True)

    # File types considered worth including.
    acceptable_extensions = ('.py', '.cpp', '.java', '.m', '.txt', '.h', '.data', '.html', '.c', '.mod', '.g', '.p')  # Adjust as needed

    collected_paths = []
    for top_entry in os.listdir(extract_folder):
        top_path = os.path.join(extract_folder, top_entry)
        if not os.path.isdir(top_path):
            continue
        for current_dir, _, filenames in os.walk(top_path):
            collected_paths.extend(
                os.path.join(current_dir, filename)
                for filename in filenames
                if filename.lower().endswith(acceptable_extensions)
            )

    # Write every collected file, each preceded by a header naming its path.
    output_file_path = f'{output_file_folder}/{model_name}_raw_file.txt'
    with open(output_file_path, 'w', encoding='utf-8') as combined:
        for source_path in collected_paths:
            combined.write(f'=== {source_path} ===\n')
            with open(source_path, 'r', encoding='utf-8', errors='ignore') as source:
                body = source.read()
            combined.write(body)
            combined.write('\n\n')

    print(f"Concatenated file have been saved to {output_file_path}")

# Path to the JSON rule file handed to filter_model.
json_file_path = "/Users/tessakong/Desktop/CodeAnalysis/manual_classifier_rules.json"
# For the first five sampled models, write both the rule-filtered and the
# unfiltered concatenation.
for code in file_code_list[:5]:
    file_id = file_code_id[code]  # looked up but not used in this loop
    filter_model(code, json_file_path, sample_folder)
    concat_model(code,sample_folder)


Files matching the rules have been saved to /Users/tessakong/Desktop/CodeAnalysis/sample5/match_file/114665_mathched_file.txt
Concatenated file have been saved to /Users/tessakong/Desktop/CodeAnalysis/sample5/raw_file/114665_raw_file.txt
Files matching the rules have been saved to /Users/tessakong/Desktop/CodeAnalysis/sample5/match_file/118434_mathched_file.txt
Concatenated file have been saved to /Users/tessakong/Desktop/CodeAnalysis/sample5/raw_file/118434_raw_file.txt
Files matching the rules have been saved to /Users/tessakong/Desktop/CodeAnalysis/sample5/match_file/114424_mathched_file.txt
Concatenated file have been saved to /Users/tessakong/Desktop/CodeAnalysis/sample5/raw_file/114424_raw_file.txt
Files matching the rules have been saved to /Users/tessakong/Desktop/CodeAnalysis/sample5/match_file/105383_mathched_file.txt
Concatenated file have been saved to /Users/tessakong/Desktop/CodeAnalysis/sample5/raw_file/105383_raw_file.txt
Files matching the rules have been saved to /Use

# Old version: single-model prototype (Google Drive paths)

In [None]:
# Unpack the archive of model 100603 into a folder of the same name.
zip_file_path = '/content/drive/MyDrive/group capstone/modeldb_model/100603.zip'
extract_folder = '/content/drive/MyDrive/group capstone/modeldb_model/100603/'
# Create the target folder first; exist_ok makes the cell safe to re-run.
os.makedirs(extract_folder, exist_ok=True)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

print("completed")


completed


In [None]:
# Set the path for the JSON file and the output file
json_file_path = '/content/drive/My Drive/group capstone/manual_classifier_rules.json'
output_file_path = '/content/drive/My Drive/group capstone/modeldb_model/match_file/100603_matched_files.txt'

# Load rules from the JSON file
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    rules = json.load(json_file)

# Convert rules to a list of regex patterns and corresponding replacements
pattern_mapping = {re.compile(pattern): replacement for pattern, replacement in rules.items()}

matched_files = []
# Set the list of acceptable file extensions
acceptable_extensions = ('.py', '.cpp', '.java', '.m', '.txt','.h','.data','.html','.c')#不确定是否要限制文件类型

# Traverse subfolders in the extract_folder directory
for subfolder in os.listdir(extract_folder):
    subfolder_path = os.path.join(extract_folder, subfolder)
    if os.path.isdir(subfolder_path):  # Check if it's a directory
        for root, _, files in os.walk(subfolder_path):
            for file in files:
                file_path = os.path.join(root, file)
                # Check if the file extension is acceptable
                if not file.lower().endswith(acceptable_extensions):
                    continue  # Skip the file if the extension is not acceptable

                # Open the file and check its content against the rules
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
                    for pattern, replacement_list in pattern_mapping.items():
                        if pattern.search(content):  # If any pattern matches the file content
                            matched_files.append(file_path)
                            break  # Stop checking this file if one rule is matched


# Write matched file contents to a new text file
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for file_path in matched_files:
        output_file.write(f'=== {file_path} ===\n')  # Write the file path
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            output_file.write(f.read())  # Write the file content
            output_file.write('\n\n')  # Add a newline between files

print(f"Files matching the rules have been saved to {output_file_path}")


Files matching the rules have been saved to /content/drive/My Drive/group capstone/modeldb_model/match_file/100603_matched_files.txt


In [None]:
# Shell command: list the contents of the modeldb-zips folder.
! ls '/content/drive/MyDrive/group capstone/modeldb_model/modeldb-zips'

modeldb-model  modeldb-zips


In [None]:
def process_model(model_name):
    """Extract one model archive and save its rule-matching files as one text file.

    Parameters:
      - model_name (str): Model code; selects ``<model_name>.zip`` and names
        both the extraction folder and the output file.

    All locations are fixed paths below ``/content/drive``. Only files inside
    sub-directories of the extraction folder are examined; files sitting
    directly in it are not. Returns None.
    """
    # Archive location and extraction target both derive from the model name.
    archive_path = f'/content/drive/MyDrive/group capstone/modeldb_model/modeldb-zips/modeldb-zips/{model_name}.zip'
    target_dir = f'/content/drive/MyDrive/group capstone/modeldb_model/modeldb-zips/modeldb-model/{model_name}/'
    os.makedirs(target_dir, exist_ok=True)

    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(target_dir)

    print(f"{model_name} extraction completed")

    # Rule file and destination of the concatenated matches.
    rules_path = '/content/drive/My Drive/group capstone/manual_classifier_rules.json'
    output_file_path = f'/content/drive/My Drive/group capstone/modeldb_model/match_file/{model_name}_matched_files.txt'

    with open(rules_path, 'r', encoding='utf-8') as rules_file:
        rules = json.load(rules_file)

    # Only the pattern side of each rule is needed for filtering.
    compiled_rules = [re.compile(expression) for expression, _ in rules.items()]

    # File types considered worth inspecting.
    acceptable_extensions = ('.py', '.cpp', '.java', '.m', '.txt', '.h', '.data', '.html', '.c')  # Adjust as needed

    selected_paths = []
    for top_entry in os.listdir(target_dir):
        top_path = os.path.join(target_dir, top_entry)
        if not os.path.isdir(top_path):
            continue
        for current_dir, _, filenames in os.walk(top_path):
            for filename in filenames:
                if not filename.lower().endswith(acceptable_extensions):
                    continue
                candidate = os.path.join(current_dir, filename)
                with open(candidate, 'r', encoding='utf-8', errors='ignore') as handle:
                    text = handle.read()
                # A single matching rule is enough to keep the file.
                if any(rule.search(text) for rule in compiled_rules):
                    selected_paths.append(candidate)

    # Write every kept file, each preceded by a header naming its path.
    with open(output_file_path, 'w', encoding='utf-8') as combined:
        for source_path in selected_paths:
            combined.write(f'=== {source_path} ===\n')
            with open(source_path, 'r', encoding='utf-8', errors='ignore') as source:
                body = source.read()
            combined.write(body)
            combined.write('\n\n')

    print(f"Files matching the rules have been saved to {output_file_path}")


In [None]:
# Run the extract-and-filter pipeline for a single model code.
model_name = '93394'
process_model(model_name)

93394 extraction completed
Files matching the rules have been saved to /content/drive/My Drive/group capstone/modeldb_model/match_file/93394_matched_files.txt


In [None]:
# OpenAI credentials come from the environment rather than being hard-coded.
api_key = os.getenv('API_KEY')
organization=os.getenv('ORGANIZATION')
# Matched-files text for model 93394 (the path process_model writes for that model).
output_file_path='/content/drive/My Drive/group capstone/modeldb_model/match_file/93394_matched_files.txt'

In [None]:
# Read the whole matched-files text; it is sent to the model in full, with no
# truncation applied in this cell.
with open(output_file_path, 'r', encoding='utf-8') as file:
    file_content = file.read()
client = OpenAI(
    api_key=api_key,
    organization=organization
)

# Single-turn request asking for free-form metadata about the content.
chat_completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {
            "role": "user",
            "content": f"Please analyze the following content and provide some metadata about it:\n\n{file_content}"
        }
    ],
)

# Take the text of the first returned choice.
response_dict = chat_completion.to_dict()
metadata = response_dict["choices"][0]["message"]["content"].strip()
print("Generated metadata:", metadata)

Generated metadata: Metadata:
- Author: Quentin Huys
- Year: 2006
- Title: Fast population coding Neural Computation
- Description: Example code for fast population coding with sparse spike trains in conjunction with a 2006 paper by Quentin Huys, Zemel RS, Natarajan R and Dayan P.
- License: GNU General Public License
- Website: http://www.gatsby.ucl.ac.uk/~qhuys/code.html
- Email: qhuys@gatsby.ucl.ac.uk
- Files:
  - COPYRIGHT.txt
  - getinf.m
  - getspk.m
  - getstim.m
  - LICENSE.txt
  - main.m
  - param.m
  - plots.m
  - psinf.m
  - pspred.m
  - README.txt
  - setup.m
