# info

In [None]:
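# Setup notes (run these in a terminal, not in the notebook kernel):
#   1. Start the services:          docker compose up -d
#   2. Open the Hasura console:     http://localhost:8080/console
#   3. Connect to PostgreSQL:       psql -h localhost -p 5432 -U postgres -d postgres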

# Create data

In [None]:
import json
import pandas as pd

# Column order of the exported table.
COLUMNS = ['protein_id', 'position', 'annotation', 'annotation_type']

# One-letter codes of the 20 standard amino acids.
AMINO_ACIDS = frozenset('ACDEFGHIKLMNPQRSTVWY')

# Secondary-structure labels.
# NOTE(review): 'ß-bridge' / 'ß-strand' are spelled with U+00DF (ß), not the
# Greek beta; presumably this mirrors the labels in the JSON input — verify
# before "correcting" the spelling.
SECONDARY_STRUCTURES = frozenset([
    'IDR', 'PPII-helix', 'bend', 'turn', 'unassigned', 'ß-bridge',
    'ß-strand', 'α-helix', '3₁₀-helix', 'π-helix', 'loop',
])


def classify_annotation(annotation):
    """Return the annotation_type ('PTM', 'AA', 'sec' or 'domain') for a label.

    The checks run in a fixed order: labels starting with '[' are PTMs,
    one-letter amino-acid codes are 'AA', known secondary-structure labels
    are 'sec', and labels starting with 'IPR' are 'domain'.

    Raises:
        ValueError: if the label matches none of the known categories.
            Failing loudly avoids silently tagging the rows with whatever
            type the previously processed annotation happened to have.
    """
    if annotation.startswith('['):
        return 'PTM'
    if annotation in AMINO_ACIDS:
        return 'AA'
    if annotation in SECONDARY_STRUCTURES:
        return 'sec'
    if annotation.startswith('IPR'):
        return 'domain'
    raise ValueError(f"Unrecognised annotation {annotation!r}: cannot determine annotation_type")


def build_annotation_rows(annotations_by_protein):
    """Flatten {protein_id: {annotation: [positions]}} into one dict per position.

    Each returned dict has the keys listed in COLUMNS; the position is
    stored as a string.
    """
    rows = []
    for protein_id, annotations in annotations_by_protein.items():
        for annotation, positions in annotations.items():
            if not positions:
                # No rows to emit, so the label does not need to be classifiable.
                continue
            annotation_type = classify_annotation(annotation)
            for position in positions:
                rows.append({
                    'protein_id': protein_id,
                    'position': str(position),
                    'annotation': annotation,
                    'annotation_type': annotation_type
                })
    return rows


# A notebook kernel executes cells as "__main__", so this still runs when the
# cell is executed; the guard only keeps the file I/O out of plain imports.
if __name__ == "__main__":
    # Load your data
    with open('protein_id_annotation_position.json', 'r') as f:
        protein_id_annotation_position = json.load(f)

    # From protein_id_annotation_position.json (using actual annotation)
    rows = build_annotation_rows(protein_id_annotation_position)

    # Optional: Deduplicate. Passing `columns` keeps the frame well-formed
    # (and drop_duplicates working) even when no rows were produced.
    df = pd.DataFrame(rows, columns=COLUMNS).drop_duplicates(subset=COLUMNS)

    # Export to CSV
    df.to_csv('protmodcon.csv', index=False)

In [2]:
import csv
import psycopg2
from psycopg2.extras import execute_values
import logging

import os

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Connection settings. Each value can be overridden through the standard
# libpq environment variable so real credentials never have to be written
# into the notebook; the fallbacks are the previous hard-coded values.
DB_NAME = os.environ.get("PGDATABASE", "postgres")
DB_USER = os.environ.get("PGUSER", "postgres")
DB_PASSWORD = os.environ.get("PGPASSWORD", "postgrespassword")
DB_HOST = os.environ.get("PGHOST", "localhost")
CSV_FILE = "protmodcon.csv"  # Path to your CSV file

def load_csv_to_postgres(csv_file):
    """Bulk-load the annotation CSV into the PostgreSQL table `protmodcon`.

    Args:
        csv_file: Path to a UTF-8 CSV with the columns protein_id, position,
            annotation and annotation_type.

    The whole file is read into memory first, then sent in pages of 1000
    rows inside a single transaction. Rows whose
    (protein_id, position, annotation) already exist are skipped by the
    ON CONFLICT clause, so re-running the load does not fail on duplicates.

    Raises:
        KeyError: if the CSV lacks one of the expected columns.
        Any psycopg2 error raised while connecting or inserting; the
        transaction is rolled back and the connection closed before it
        propagates.
    """
    # Read CSV
    with open(csv_file, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        rows = [
            (row['protein_id'], row['position'], row['annotation'], row['annotation_type'])
            for row in reader
        ]

    logger.info(f"Read {len(rows)} rows from {csv_file}")

    # Bulk insert
    query = """
        INSERT INTO protmodcon (protein_id, position, annotation, annotation_type)
        VALUES %s
        ON CONFLICT (protein_id, position, annotation) DO NOTHING
    """

    # Connect to PostgreSQL
    conn = psycopg2.connect(
        dbname=DB_NAME,
        user=DB_USER,
        password=DB_PASSWORD,
        host=DB_HOST
    )
    try:
        # `with conn` commits on success and rolls back on error; it does NOT
        # close the connection, hence the explicit close() in `finally`.
        # The cursor context manager closes the cursor on exit.
        with conn, conn.cursor() as cur:
            execute_values(cur, query, rows, page_size=1000)
    finally:
        conn.close()

    # NOTE: this is the number of rows submitted; rows skipped by
    # ON CONFLICT DO NOTHING are included in the count.
    logger.info(f"Inserted {len(rows)} rows into PostgreSQL table protmodcon.")

# Run the load when this code is executed directly. Inside a Jupyter kernel
# `__name__` is "__main__" as well, so executing this cell triggers the load.
if __name__ == "__main__":
    load_csv_to_postgres(CSV_FILE)

2025-05-28 14:26:55,617 - INFO - Read 28210695 rows from protmodcon.csv
2025-05-28 14:37:21,425 - INFO - Inserted 28210695 rows into PostgreSQL table protmodcon.


# Parse database

In [1]:
import requests

# GraphQL endpoint of the local Hasura instance that all queries are POSTed to.
HASURA_URL = "http://localhost:8080/v1/graphql"

def fetch_protein_sequence(protein_id, annotation_type, timeout=60):
    """Fetch a protein's annotations of one type and join them into a string.

    Args:
        protein_id: Value matched against the `protein_id` column.
        annotation_type: Value matched against the `annotation_type` column
            (e.g. "AA" for the amino-acid sequence).
        timeout: Seconds to wait for the Hasura endpoint before giving up.

    Returns:
        The `annotation` values concatenated in ascending position order;
        an empty string when nothing matches.

    Raises:
        requests.HTTPError: if the endpoint answers with an HTTP error status.
        RuntimeError: if the GraphQL response carries an `errors` entry.
    """
    query = '''
    query ($protein_id: String!, $annotation_type: String!) {
      protmodcon(
        where: {
          protein_id: { _eq: $protein_id },
          annotation_type: { _eq: $annotation_type }
        }
      ) {
        position
        annotation
      }
    }
    '''
    variables = {
        "protein_id": protein_id,
        "annotation_type": annotation_type
    }
    headers = {"Content-Type": "application/json"}
    response = requests.post(
        HASURA_URL,
        json={"query": query, "variables": variables},
        headers=headers,
        timeout=timeout
    )
    response.raise_for_status()
    data = response.json()
    # A failed GraphQL query reports the problem in an "errors" entry; without
    # this check it would only surface as an opaque KeyError on "data".
    if data.get("errors"):
        raise RuntimeError(f"GraphQL query failed: {data['errors']}")
    records = data["data"]["protmodcon"]

    def position_key(record):
        # Positions may come back as text, so compare them numerically
        # ("10" must sort after "2"). Anything that is not an integer sorts
        # last; the stable sort keeps such records in server order.
        # NOTE(review): assumes positions are integer indices — confirm.
        try:
            return (0, int(record["position"]))
        except (TypeError, ValueError):
            return (1, 0)

    # The query does not request an ordering, so order explicitly here rather
    # than relying on whatever row order the database happens to return.
    ordered = sorted(records, key=position_key)
    # Extract and concatenate all annotation values
    sequence = ''.join(record["annotation"] for record in ordered)
    return sequence

# Example usage: fetch the "AA" annotations stored for protein P41227,
# joined into a single string, and print the result.
protein_sequence = fetch_protein_sequence("P41227", "AA")
print(protein_sequence)


MNIRNARPEDLMNMQHCNLLCLPENYQMKYYFYHGLSWPQLSYIAEDENGKIVGYVLAKMEEDPDDVPHGHITSLAVKRSHRRLGLAQKLMDQASRAMIENFNAKYVSLHVRKSNRAALHLYSNTLNFQISEVEPKYYADGEDAYAMKRDLTQMADELRRHLELKEKGRHVVLGAIENKVESKGNSPPSSGEACREEKGLAAEDSGGDSKDLSEVSETTESTDVKDSSEASDSAS
