In [6]:
import requests
import pandas as pd

# File paths
gene_expression_file = '../../data/GSE218462_raw_counts_GRCh38.p13_NCBI.tsv'
annotation_file = '../../data/Human.GRCh38.p13.annot.tsv'

# Load gene expression data and extract unique gene IDs
def load_gene_ids(file_path):
    gene_df = pd.read_csv(file_path, sep='\t')
    gene_ids = gene_df['GeneID'].unique().tolist()
    return gene_ids

# Query NCBI API to get metadata for a list of gene IDs
def fetch_gene_metadata(gene_ids, batch_size=50):
    base_url = 'https://api.ncbi.nlm.nih.gov/datasets/v2/gene/id/'
    gene_metadata = {}

    for i in range(0, len(gene_ids), batch_size):
        batch = gene_ids[i:i + batch_size]
        query_ids = ','.join(map(str, batch))
        url = f"{base_url}{query_ids}"

        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            for gene in data.get('genes', []):
                gene_metadata[gene['gene_id']] = gene  # Store full metadata for each gene ID
        else:
            print(f"Failed to fetch metadata for gene IDs: {batch}")

    return gene_metadata

# Load annotation data
def load_annotations(file_path):
    annot_df = pd.read_csv(file_path, sep='\t')
    return annot_df

# Main function to combine and save metadata
def main():
    # Load gene IDs from gene expression data
    gene_ids = load_gene_ids(gene_expression_file)
    print(f"Loaded {len(gene_ids)} unique gene IDs.")

    # Fetch metadata for gene IDs from NCBI
    gene_metadata = fetch_gene_metadata(gene_ids)
    print(f"Fetched metadata for {len(gene_metadata)} genes.")

    # Load annotation data
    annot_df = load_annotations(annotation_file)

    # Process and merge metadata
    metadata_records = []
    for gene_id, metadata in gene_metadata.items():
        annot_row = annot_df[annot_df['GeneID'] == gene_id]
        if not annot_row.empty:
            record = {
                'GeneID': gene_id,
                'Symbol': metadata.get('symbol', 'N/A'),
                'Description': metadata.get('description', 'N/A'),
                'Chromosome': metadata.get('chromosome', 'N/A'),
                'Annotation': annot_row.iloc[0].to_dict()  # Annotation details
            }
            metadata_records.append(record)

    # Convert to DataFrame and save
    output_df = pd.DataFrame(metadata_records)
    output_df.to_csv('gene_metadata_with_annotations.csv', index=False)
    print("Metadata with annotations saved to 'gene_metadata_with_annotations.csv'.")

if __name__ == "__main__":
    main()

  annotation_df = pd.read_csv(annotation_path, sep="\t")


FileNotFoundError: [Errno 2] No such file or directory: 'Hs.data'

In [2]:
# Read the TSV file into a DataFrame
tsv_file_path = '../../lifeedit_data/Human.GRCh38.p13.annot.tsv'  # Replace with the actual path to your TSV file
tsv_df = pd.read_csv(tsv_file_path, sep='\t')

# Display the first few rows of the DataFrame
print(tsv_df.head())

# Display the summary statistics of the DataFrame
print(tsv_df.describe())

      GeneID       Symbol                                 Description  \
0  100287102      DDX11L1  DEAD/H-box helicase 11 like 1 (pseudogene)   
1     653635       WASH7P           WASP family homolog 7, pseudogene   
2  102466751    MIR6859-1                             microRNA 6859-1   
3  107985730  MIR1302-2HG                         MIR1302-2 host gene   
4  100302278    MIR1302-2                             microRNA 1302-2   

                    Synonyms GeneType    EnsemblGeneID  Status        ChrAcc  \
0                        NaN   pseudo  ENSG00000290825  active  NC_000001.11   
1              FAM39F|WASH5P   pseudo              NaN  active  NC_000001.11   
2             hsa-mir-6859-1    ncRNA  ENSG00000278267  active  NC_000001.11   
3                        NaN    ncRNA              NaN  active  NC_000001.11   
4  MIRN1302-2|hsa-mir-1302-2    ncRNA  ENSG00000284332  active  NC_000001.11   

  ChrStart ChrStop Orientation  Length GOFunctionID GOProcessID GOComponentID  \

  tsv_df = pd.read_csv(tsv_file_path, sep='\t')
