<a href="https://colab.research.google.com/github/IslamTayeb/life-edit-gene-classifier/blob/main/Code/ncbiUnnamedDataParser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## NCBI Unfound Data Parsing Tool

#### You must install the following tools:

```
%pip install biopython
%pip install ace_tools
```

In [None]:
# Set your email and API key for NCBI Entrez.
# Import here so this cell also works when run before the main cell that
# imports Entrez; otherwise a top-to-bottom run fails with NameError.
from Bio import Entrez

# These are module-level attributes of Bio.Entrez, read by later Entrez calls.
Entrez.email = "EMAIL@duke.edu"  # Replace with your email
Entrez.api_key = "API_KEY"  # Replace with your NCBI API key

In [None]:
from Bio import Entrez
import pandas as pd

# Read the cluster export whose 'Description' column carries the LOC tokens
file_path = '../../../data/elbowClusterData/clusterData/cluster_4_output.csv'
data = pd.read_csv(file_path)

# Take the first LOC<digits> match from each description, discard rows
# without a match, and keep each identifier once
loc_ids = (
    data['Description']
    .str.extract(r'(LOC\d+)', expand=False)
    .dropna()
    .unique()
)

# Function to fetch gene information from NCBI
def _read_and_close(handle):
    """Parse an Entrez XML response and close the handle, even if parsing fails."""
    try:
        return Entrez.read(handle)
    finally:
        handle.close()


def fetch_gene_info(loc_id):
    """Look up one LOC identifier in the NCBI Gene database.

    Runs an ``esearch`` for ``loc_id`` and, if at least one gene ID comes
    back, an ``efetch`` for the first hit.

    Args:
        loc_id: Identifier string such as ``"LOC123456"``; used verbatim as
            the search term.

    Returns:
        A dict that always contains ``"LOC Identifier"``. On a hit it also
        has ``"Gene ID"``, ``"Description"``, ``"Chromosome"`` and
        ``"Other Names"``. On a miss the same keys are present with
        ``"Not Found"`` / ``"N/A"`` placeholders. If anything raises, the
        dict contains only ``"LOC Identifier"`` and ``"Error"`` (the
        exception text), so one bad lookup never aborts a batch.
    """
    try:
        search = _read_and_close(Entrez.esearch(db="gene", term=loc_id, retmax=1))

        if not search["IdList"]:
            return {"LOC Identifier": loc_id, "Gene ID": "Not Found", "Description": "N/A", "Chromosome": "N/A", "Other Names": "N/A"}

        # A gene was found: fetch its detailed record
        gene_id = search["IdList"][0]
        gene_info = _read_and_close(Entrez.efetch(db="gene", id=gene_id, retmode="xml"))[0]

        gene_ref = gene_info.get("Entrezgene_gene", {}).get("Gene-ref", {})

        # `or [{}]` also covers a present-but-empty locus list, which would
        # otherwise raise IndexError on [0].
        loci = gene_info.get("Entrezgene_locus") or [{}]
        chromosome = loci[0].get("Map_location")
        if chromosome is None:
            # NOTE(review): the Gene-ref record carries the map location
            # under "Gene-ref_maploc"; used as a fallback when the locus
            # entry has no "Map_location" key. Confirm against a live record.
            chromosome = gene_ref.get("Gene-ref_maploc", "N/A")

        # Synonyms are normally parsed as plain strings; the mapping form is
        # still accepted so any record shaped the old way keeps working.
        synonyms = gene_ref.get("Gene-ref_syn", [])
        other_names = ", ".join(
            syn if isinstance(syn, str) else syn["Gene-ref_syn_E"]
            for syn in synonyms
        )

        return {
            "LOC Identifier": loc_id,
            "Gene ID": gene_id,
            "Description": gene_info.get("Entrezgene_summary", "N/A"),
            "Chromosome": chromosome,
            "Other Names": other_names,
        }

    except Exception as e:
        # Deliberate best-effort: report the failure as a row, don't raise.
        return {"LOC Identifier": loc_id, "Error": str(e)}

# Fetch information for all LOC identifiers (one result dict per identifier;
# failed lookups come back as rows with an "Error" entry instead of raising)
results = [fetch_gene_info(loc) for loc in loc_ids]

# Convert results to a DataFrame; keys missing from a row become NaN
results_df = pd.DataFrame(results)

# Save the results to a CSV, creating the output directory first so the
# write does not fail when the ncbiParse directory has not been created yet
from pathlib import Path

output_path = "../../../data/elbowClusterData/clusterData/ncbiParse/loc_gene_info.csv"
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_path, index=False)

# Display the results. ace_tools may be missing or non-functional in some
# environments, so fall back to a plain print rather than crashing after
# the data has been fetched and saved.
try:
    import ace_tools as tools
    tools.display_dataframe_to_user(name="LOC Gene Information", dataframe=results_df)
except (ImportError, AttributeError):
    print(results_df)