# Goals

* Test connecting and updating a GCP Cloud SQL database

In [2]:
import os
import psycopg2
import pandas as pd
from pypika import Query, Table, Field, Column, Table

In [3]:
from dotenv import load_dotenv
load_dotenv()

True

# Connect

In [4]:
# Your GCP PostgreSQL connection parameters
# get home directory
host = os.path.join(os.path.expanduser("~"), "cloudsql", os.environ["GCP_SQL_DB_HOST"])

db_params = {
    'host': host,
    'database': os.environ["GCP_SQL_DB_NAME"],
    'user': os.environ["GCP_SQL_DB_USERNAME"],
    'password': os.environ["GCP_SQL_DB_PASSWORD"],
    'port': '5432',
    'connect_timeout': 10 
}

In [5]:
def get_db_connection():
    try:
        return psycopg2.connect(**db_params)
    except Exception as e:
        print(f"Error connecting to database: {e}")
        return None

conn = get_db_connection()    

In [28]:
# list tables in pypika
def list_tables_pypika():
    tables = Table('tables', schema='information_schema')
    query = Query.from_(tables).select('table_name').where(tables.table_schema == 'public')
    with conn.cursor() as cur:
        cur.execute(str(query))
        tables = cur.fetchall()
        return tables
list_tables_pypika()

[('screcounter_log',)]

# Create tables

In [29]:
def execute_query(stmt, conn):
    try:
        with conn.cursor() as cur:
            cur.execute(str(stmt))
            conn.commit() 
    except psycopg2.errors.DuplicateTable as e:
        print(f"Table already exists: {e}")

In [30]:
# SRX_metadata
stmt = Query \
    .create_table("srx_metadata") \
    .columns(
        Column("id", "SERIAL", nullable=False),
        Column("database", "VARCHAR(20)", nullable=False),
        Column("entrez_id", "INT", nullable=False),
        Column("srx_accession", "VARCHAR(20)", nullable=False),
        Column("is_illumina", "VARCHAR(10)", nullable=False),
        Column("is_single_cell", "VARCHAR(10)", nullable=False),
        Column("is_paired_end", "VARCHAR(10)", nullable=False),
        Column("is_10x", "VARCHAR(10)", nullable=False),
        Column("tech_10x", "VARCHAR(20)", nullable=False),
        Column("organism", "VARCHAR(60)", nullable=False),
        Column("processed", "VARCHAR(10)"),
    ) \
    .unique("database", "entrez_id") \
    .primary_key("id")

execute_query(stmt, conn)

In [31]:
# SRX_SRR
stmt = Query \
    .create_table("srx_srr") \
    .columns(
        Column("id", "SERIAL", nullable=False),
        Column("srx_accession", "VARCHAR(20)", nullable=False),
        Column("srr_accession", "VARCHAR(20)", nullable=False)
    ) \
    .unique("srx_accession", "srr_accession") \
    .primary_key("id")

execute_query(stmt, conn)

In [32]:
# scRecounter log
stmt = Query \
    .create_table("screcounter") \
    .columns(
        Column("id", "SERIAL", nullable=False),
        Column("sample_id", "VARCHAR(20)", nullable=False),
        Column("pipeline_version", "VARCHAR(10)", nullable=False),
        Column("run_id", "VARCHAR(30)", nullable=False),
        Column("task_name", "VARCHAR(20)", nullable=False),
        Column("task_exit_status", "VARCHAR(10)"),
        Column("log", "TEXT", nullable=False)
    ) \
    .primary_key("id")

execute_query(stmt, conn)

# Delete tables

In [None]:
for table in ["srx_metadata", "srx_srr", "screcounter"]:
    stmt = Query.drop_table(table)
    print(str(stmt))
    execute_query(stmt, conn)

DROP TABLE "srx_metadata"
DROP TABLE "srx_srr"
DROP TABLE "screcounter"


UndefinedTable: table "screcounter" does not exist


# Insert data

In [33]:
srx_metadata = Table("srx_metadata")

q = Query.into(srx_metadata) \
    .columns('database', 'entrez_id', 'srx_accession', 'is_illumina', 'is_single_cell', 'is_paired_end', 'is_10x', 'tech_10x', 'organism') \
    .insert('sra', 35087715, 'SRX25994842', 'yes', 'yes', 'yes', 'yes', '3_prime_gex', 'human')

execute_query(q, conn)

# Query data

In [8]:
srx_metadata = Table("srx_metadata")
stmt = Query \
    .from_(srx_metadata) \
    .select("*") 
pd.read_sql(str(stmt), conn)

  pd.read_sql(str(stmt), conn)


Unnamed: 0,id,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,is_10x,tech_10x,organism,processed
0,1,sra,35087715,SRX25994842,yes,yes,yes,yes,3_prime_gex,human,
1,2,sra,123456,test,unsure,unsure,unsure,unsure,other,other,


In [10]:
srx_srr = Table("srx_srr")
stmt = Query \
    .from_(srx_srr) \
    .select("*") 
pd.read_sql(str(stmt), conn)

  pd.read_sql(str(stmt), conn)


Unnamed: 0,id,srx_accession,srr_accession
0,1,SRX25994842,SRR30571763


In [None]:
srx_metadata = Table("srx_metadata")
stmt = Query \
    .from_(srx_metadata) \
    .select("*") \
    .where((srx_metadata.processed != "complete") | (srx_metadata.processed.isnull())) \
    .where(srx_metadata.database == "sra")
pd.read_sql(str(stmt), conn)

  pd.read_sql(str(stmt), conn)


Unnamed: 0,id,database,entrez_id,srx_accession,is_illumina,is_single_cell,is_paired_end,is_10x,tech_10x,organism,processed
0,2,sra,123456,test,unsure,unsure,unsure,unsure,other,other,
1,1,sra,35087715,SRX25994842,yes,yes,yes,yes,3_prime_gex,human,
