In [17]:
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig


st.set_page_config(page_title="Lung Cancer AI System", layout="wide")

@st.cache_resource
def load_model_and_tokenizer(model_path):
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",  # Automatically maps model to GPU/CPU
        torch_dtype=torch.float16,
        trust_remote_code=True,
        low_cpu_mem_usage=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)


    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer), mean_resizing=False)


    model.eval()  
    return model, tokenizer

@torch.inference_mode()
def generate_response(model, tokenizer, prompt, generation_config):

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1024  
    ).to("cuda")


    pad_token_id = tokenizer.pad_token_id
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=generation_config.max_new_tokens,
        top_p=generation_config.top_p,
        temperature=generation_config.temperature,
        repetition_penalty=generation_config.repetition_penalty,
        do_sample=generation_config.do_sample,
        pad_token_id=pad_token_id  
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response




def configure_generation_settings():
    with st.sidebar:
        st.title("Generation Settings")
        max_new_tokens = 3000
        top_p = 0.9
        temperature = 0.00001
        repetition_penalty = 1.1
        return GenerationConfig(
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            do_sample=True
        )



In [18]:
model_path = "/home/h392x566/DeepSeek-R1-Distill-Llama-8B"  # /home/h392x566/DeepSeek-R1-Distill-Llama-8B # /home/h392x566/llama3.2-8b-train-py
model, tokenizer = load_model_and_tokenizer(model_path)


In [19]:
st.sidebar.title("Model Settings")
st.sidebar.text(f"Model: {model_path}")
generation_config = configure_generation_settings()



In [35]:
full_prompt = ( "I am a helpful AI Lung Cancer Oncology Assistant for precision medicine. Determine the relation direction between 'ABCB1' and 'Prednisolone', with relation 'drug_protein'. Just return one of these numbers: suppressed (0), active (1), no relation (2), or not sure (3).Do not include any explanation, chain-of-thought, or intermediate reasoning. Output format is: 'Final number is:'"

    )
response = generate_response(model, tokenizer, full_prompt, generation_config)

In [37]:
print(response)

I am a helpful AI Lung Cancer Oncology Assistant for precision medicine. Determine the relation direction between 'ABCB1' and 'Prednisolone', with relation 'drug_protein'. Just return one of these numbers: suppressed (0), active (1), no relation (2), or not sure (3).Do not include any explanation, chain-of-thought, or intermediate reasoning. Output format is: 'Final number is:' followed by the number.

Okay, so I need to figure out whether ABBC1 interacts with Prednisolone in terms of drug-protein interaction. Hmm, let me start by understanding what each term means. ABCB1 is a gene that codes for a protein called P-glycoprotein, which is involved in transporting drugs out of cells. It's part of the ATP-binding cassette transporters. Prednisolone is a corticosteroid used in treating various conditions like inflammation and immune disorders.

Now, considering their relationship. Since ABCB1 is a drug transporter, it might affect how other drugs are absorbed or excreted. Steroids like pre

In [38]:
import re
pattern = r"Final number is:\s*(\d)"
match = re.search(pattern, response)
res = match.group(1)
res

'2'

In [7]:
direction = int(response.strip())

ValueError: invalid literal for int() with base 10: "I am a helpful AI Lung Cancer Oncology Assistant for precision medicine. Determine the relation direction between 'Vinblastine'and 'TUBB', with relation 'drug_protein'. Just return one of these numbe

In [11]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('lung_cancer_paths_combined.csv')

# Extract the 'Path' column
paths = df['Path']
paths
# List to store relation dictionaries for all paths
relations_list = []

# Iterate over each path string in the CSV
for path in paths:
    # Split the path by "->" and strip any extra whitespace
    elements = [element.strip() for element in path.split("->")]
    
    # Nodes are at even indices (0, 2, 4, ...) and relations at odd indices (1, 3, 5, ...)
    nodes = elements[0::2]
    relations = elements[1::2]
    
    # For each relation, record the source node, relation, and target node
    for i, rel in enumerate(relations):
        source = nodes[i]
        target = nodes[i+1]
        relations_list.append({
            'Source Node': source,
            'Relation': rel,
            'Target Node': target
        })

# Build a DataFrame to display all the relationships
relations_df = pd.DataFrame(relations_list)
relations_df['Relation']

0         disease_disease
1         disease_disease
2         disease_disease
3              indication
4         disease_disease
               ...       
939258         indication
939259         indication
939260         indication
939261         indication
939262         indication
Name: Relation, Length: 939263, dtype: object

In [12]:
unique_relation_count = relations_df['Relation'].nunique()
print("Unique number of relations:", unique_relation_count)

Unique number of relations: 18


In [14]:
# Get unique relation names
unique_relations = relations_df['Relation'].unique()

# Print the unique relation names
print("Unique Relation Names:")
for relation in unique_relations:
    print(relation)

Unique Relation Names:
disease_disease
indication
disease_protein
disease_phenotype_positive
phenotype_phenotype
drug_protein
anatomy_protein_present
cellcomp_protein
protein_protein
exposure_disease
pathway_protein
exposure_protein
phenotype_protein
molfunc_protein
bioprocess_protein
drug_effect
disease_phenotype_negative
exposure_bioprocess


In [None]:
import pandas as pd
import re
# Assume that model, tokenizer, and generation_config are defined.
# Also assume the function generate_response is defined as:
# response = generate_response(model, tokenizer, full_prompt, generation_config)

# Read the CSV file containing the extracted relations (e.g., 'extracted_relations.csv')
relations_df = pd.read_csv('drug_protein.csv')

def get_relation_direction(source, relation, target):
    """
    Build a prompt using source, relation, and target, then use the model to determine the direction.
    The expected outputs are:
      - suppressed: 0
      - active: 1
      - no relation: 2
      - not sure: 3
    """
    full_prompt = (
        f"I am a helpful AI Lung Cancer Oncology Assistant. Determine the relation direction between '{source}' and '{target}' "
        f"with relation '{relation}'. Just return one of these numbers: "
        "suppressed (0), active (1), no relation (2), or not sure (3). Do not include any explanation, chain-of-thought, or intermediate reasoning. Output format is: 'Final number is:'"
    )
    # Get the model's response
    response = generate_response(model, tokenizer, full_prompt, generation_config)
    
    # Attempt to parse the response into an integer.
    try:
        print(response)
        pattern = r"Final number is:\s*(\d)"
        directions = re.search(pattern, response)
        print(directions)
        dir = directions.group(1)
        print(dir)
        direction = dir
    except Exception as e:
        # If parsing fails, default to 'not sure' (3)
        direction = 3
    return direction

# Apply the function to each row to get the relation direction
relations_df['Relation Direction'] = relations_df.apply(
    lambda row: get_relation_direction(row['Source Node'], row['Relation'], row['Target Node']),
    axis=1
)

# Write the updated DataFrame with the new column to a new CSV file
relations_df.to_csv('drug_protein-test.csv', index=False)

print("The relations with their directions have been written to 'relations_with_direction.csv'.")


In [39]:
import pandas as pd
import re

# Assume that model, tokenizer, and generation_config are defined.
# Also assume the function generate_response is defined as:
# response = generate_response(model, tokenizer, full_prompt, generation_config)

# Read the CSV file containing the extracted relations (e.g., 'drug_protein.csv')
relations_df = pd.read_csv('drug_protein.csv')

# Process only the top 100 rows
relations_df_top100 = relations_df.head(10).copy()

def get_relation_direction(source, relation, target):
    """
    Build a prompt using source, relation, and target, then use the model to determine the direction.
    The expected outputs are:
      - suppressed: 0
      - active: 1
      - no relation: 2
      - not sure: 3
    """
    full_prompt = (
        f"I am a helpful AI Lung Cancer Oncology Assistant. Determine the relation direction between '{source}' and '{target}' "
        f"with relation '{relation}'. Just return one of these numbers: "
        "suppressed (0), active (1), no relation (2), or not sure (3). "
        "Do not include any explanation, chain-of-thought, or intermediate reasoning. Output format is: 'Final number is:'"
    )
    # Get the model's response
    response = generate_response(model, tokenizer, full_prompt, generation_config)
    
    try:
        # Debug print to check the response (can be removed later)
        print("Response:", response)
        pattern = r"Final number is:\s*(\d)"
        match = re.search(pattern, response)
        # pattern1 = r"Final number is:\s*(\d)"
        # match1 = re.search(pattern1, response)
        if match:
            direction_int = int(match.group(1))
        # elif match1:
        #     direction_int = int(match1.group(1))
        else:
            direction_int = 3  # Default to 'not sure' if the expected pattern is not found
    except Exception as e:
        direction_int = 3  # Default to 'not sure' on any error
    return direction_int

# Apply the function to each row of the top 100 rows to get the numeric relation direction
relations_df_top100['Relation Direction'] = relations_df_top100.apply(
    lambda row: get_relation_direction(row['Source Node'], row['Relation'], row['Target Node']),
    axis=1
)

# Map the numeric direction to arrow symbols
direction_mapping = {
    0: '<-',  # suppressed
    1: '->',  # active
    2: '-',   # no relation
    3: '-'    # not sure (default symbol)
}

relations_df_top100['Direction Symbol'] = relations_df_top100['Relation Direction'].apply(
    lambda x: direction_mapping.get(x, '-')
)

# Write the updated DataFrame with the new column to a new CSV file
relations_df_top100.to_csv('drug_protein-test.csv', index=False)

print("The top 100 relations with their directions have been written to 'drug_protein-test.csv'.")


Response: I am a helpful AI Lung Cancer Oncology Assistant. Determine the relation direction between 'TUBB1' and 'Vindesine' with relation 'drug_protein'. Just return one of these numbers: suppressed (0), active (1), no relation (2), or not sure (3). Do not include any explanation, chain-of-thought, or intermediate reasoning. Output format is: 'Final number is:' followed by the number.

Okay, so I need to figure out whether TUBB1 is related to Vindesine in terms of drug-protein interaction. Hmm, let me start by understanding both entities. 

First, TUBB1 is a gene that codes for the tubulin beta-1 class protein. Tubulin is a key component of microtubules, which are involved in cell structure and movement. Mutations or changes in TUBB1 can lead to various cellular functions, including cell division and signaling pathways.

Now, Vindesine is an antineoplastic antibiotic used in cancer treatment, specifically as part of combination chemotherapy. It's known to inhibit tubulin polymerizatio

In [1]:
import pandas as pd

#-------------------------------------------------------
# 1) Read the CSV file
#-------------------------------------------------------
# Assuming your CSV has columns exactly like:
#   Source Node, Relation, Target Node
# and is called "relations.csv"
df = pd.read_csv("extracted_relations.csv")

#-------------------------------------------------------
# 2) Remove redundant reverse edges
#-------------------------------------------------------
# We define "redundant" as: for a pair (A, B), if we have both
#   A-B  and  B-A
# with the same Relation, we only keep one.
#
# Approach:
#   - For each row, build a "canonical" key that doesn't depend on order,
#     e.g. (relation, min(source, target), max(source, target)).
#   - If we have already seen that key, we skip the row.
#   - Otherwise, we keep it.
#-------------------------------------------------------
seen = set()
rows_to_keep = []

for idx, row in df.iterrows():
    source = row["Source Node"]
    target = row["Target Node"]
    relation = row["Relation"]
    
    # Create a direction-agnostic key
    # Sorting the source & target ensures A-B == B-A
    # Also include the relation so that different relations
    # on the same node pair are not considered duplicates.
    node_pair = tuple(sorted([source, target]))
    key = (relation, node_pair[0], node_pair[1])
    
    if key not in seen:
        seen.add(key)
        rows_to_keep.append(row)

# Build a new DataFrame of non-redundant edges
df_reduced = pd.DataFrame(rows_to_keep)

#-------------------------------------------------------
# 3) Save the result
#-------------------------------------------------------
df_reduced.to_csv("relations_reduced.csv", index=False)

print("Done! Redundant reverse edges removed. See 'relations_reduced.csv'.")


Done! Redundant reverse edges removed. See 'relations_reduced.csv'.


In [3]:
import pandas as pd

#-------------------------------------------------------
# 1) Read the original CSV
#-------------------------------------------------------
df = pd.read_csv("relations_reduced.csv")  
# Make sure your CSV has columns exactly:
#   Source Node, Relation, Target Node

#-------------------------------------------------------
# 2) Group by Relation
#-------------------------------------------------------
# groupby('Relation') splits the DataFrame into groups,
# each containing rows for a single Relation type.
# We then iterate over each group and write it out to
# a separate CSV file.
#-------------------------------------------------------
for relation, group_df in df.groupby('Relation'):
    # Build an output filename from the relation
    # e.g. "disease_disease.csv", "disease_protein.csv"
    output_filename = f"{relation}.csv"
    
    # Write only rows for this specific relation
    group_df.to_csv(output_filename, index=False)

print("Done! Created a separate CSV for each relation.")


Done! Created a separate CSV for each relation.
