# Highlights

- This program only imports `psycopg2`, no `SQLAlchemy` is needed (although `pandas.read_sql_query()` gives a warning message).
- `psycopg2.connect(database= , user= , password= , host= , port= ).cursor().execute()`
- Wrap queries in `execute()`, use triple quotation marks.
- Always remember to commit using `psycopg2.connect().commit()`.
- Avoid joining text-based columns in `pandas`, because the usual escape syntax for special characters isn't honored; do it in SQL instead with `psycopg2.connect().cursor().execute(sql_query)`.
- If not using `SQLAlchemy` and only relying on `psycopg2`, `INSERT INTO table () VALUES (),()...;` is the way to upload a `pandas` dataframe to the SQL database (not recommended).
- `df.to_sql()` won't work without `SQLAlchemy` engine

In [1]:
import sys
import psycopg2
import getpass
import pandas as pd

In [3]:
# Report the interpreter and library versions this notebook was run with.
print(sys.version_info)
print(f'psycopg2 version: {psycopg2.__version__}')
print(f'pandas version: {pd.__version__}')

sys.version_info(major=3, minor=12, micro=2, releaselevel='final', serial=0)
psycopg2 version: 2.9.9 (dt dec pq3 ext lo64)
pandas version: 2.1.4


# Establish Connection

`psycopg2.connect(database= , user= , password= , host= , port=)`

In [4]:
# Connection settings for the PostgreSQL server.
# The database name and the password are prompted for with getpass so that
# they are neither echoed to the screen nor stored in the notebook.
# getpass.getpass() already returns a str, so it is assigned directly; the
# former f-string wrapper added nothing and its nested same-type quotes only
# parse on Python 3.12+.
database_name = getpass.getpass('database name: ')
host_name = 'localhost'
my_port = '5433'
username = 'postgres'
pass_word = getpass.getpass('password: ')

database name: ········
password: ········


In [26]:
# Connect to PostgreSQL. Only the first three characters of the database name
# are shown in the status message so the full name is not written to the
# notebook output.
masked_name = database_name[:3] + "***"
try:
    conn = psycopg2.connect(
        database=database_name,
        user=username,
        password=pass_word,
        host=host_name,
        port=my_port,
    )
except psycopg2.Error:
    # Catch database errors only: a bare `except:` would also swallow
    # KeyboardInterrupt and programming mistakes such as a NameError.
    # The exception text is deliberately not printed because it may contain
    # the full database name that the message above is masking.
    print(f"Database {masked_name} cannot be connected.")
else:
    print(f"Database {masked_name} connected.")

Database opm*** connected.


# Cursor

- `psycopg2.connect().cursor()` # create cursor
- `psycopg2.connect().cursor().execute(sql queries)` # put the query block in quotation marks
- `psycopg2.connect().commit()` # commit updates to the database

In [23]:
# Open a cursor to perform database operations
cur = conn.cursor()
# Several statements can be sent in a single execute() call. This script
# rebuilds the cm63_hq_mar2024_toa summary table (headcount and percentage per
# tenure type for AGYSUB 'CM63', loc '24'), adds a `time` column set to
# 2024-03-01, and exports the table to CSV with COPY ... TO.
try:
    cur.execute("""
DROP TABLE IF EXISTS cm63_hq_mar2024_toa;

CREATE TABLE cm63_hq_mar2024_toa AS
SELECT TOATYPT AS "tenure-type",TOAT AS "tenure", COUNT(TOA) AS headcount,
ROUND(100*CAST(COUNT(TOA) AS NUMERIC)/(SELECT COUNT(*) FROM FACT WHERE AGYSUB = 'CM63' AND loc = '24'),2) AS PCT
FROM FACT
JOIN APPOINTMENT USING(TOA)
WHERE AGYSUB = 'CM63' AND loc = '24'
GROUP BY TOATYPT,TOAT
ORDER BY TOATYPT DESC,TOAT, HEADCOUNT;

ALTER TABLE cm63_hq_mar2024_toa ADD COLUMN time DATE;

UPDATE cm63_hq_mar2024_toa SET time = '2024-03-01';

COPY cm63_hq_mar2024_toa TO '/Users/tiangeng/Public/data/cm63_hq_mar2024_toa.csv'
DELIMITER ',' CSV HEADER;
""")
except psycopg2.Error:
    # A failed statement leaves the transaction aborted, and the connection
    # rejects further commands until it is rolled back. Roll back here so the
    # connection stays usable without being re-established, then surface the
    # original error.
    conn.rollback()
    raise
else:
    # Make the changes to the database
    # Refresh tables in pgadmin, updates have been committed
    conn.commit()

In [24]:
# Read the cm63_hq_mar2024_toa table back into pandas over the psycopg2
# connection and display it.
toa_query = """
SELECT * FROM cm63_hq_mar2024_toa
"""
sql_cm63_hq_toa = pd.read_sql_query(toa_query, conn)
df_cm63_hq_toa = pd.DataFrame(sql_cm63_hq_toa)
df_cm63_hq_toa

  sql_cm63_hq_toa = pd.read_sql_query("""


Unnamed: 0,tenure-type,tenure,headcount,pct,time
0,Permanent,10-Competitive Service - Career,3676,80.63,2024-03-01
1,Permanent,15-Competitive Service - Career-Conditional,433,9.5,2024-03-01
2,Permanent,30-Excepted Service - Schedule A,214,4.69,2024-03-01
3,Permanent,35-Excepted Service - Schedule D,10,0.22,2024-03-01
4,Permanent,38-Excepted Service - Other,23,0.5,2024-03-01
5,Permanent,50-Senior Executive Service - Career,44,0.97,2024-03-01
6,Permanent,55-Senior Executive Service - Non-Career,2,0.04,2024-03-01
7,Non-permanent,20-Competitive Service,22,0.48,2024-03-01
8,Non-permanent,40-Excepted Service - Schedule A,127,2.79,2024-03-01
9,Non-permanent,45-Excepted Service - Schedule D,6,0.13,2024-03-01


# Upload `pd.DataFrame()` to sql Database Without `SQLAlchemy`

- Strongly recommend using `SQLAlchemy`
- Use the "native" way `INSERT INTO () VALUES (),()...();` if using only `psycopg2`
- Escaping special characters causes issues, so it's strongly recommended to work on text-based columns in `cursor.execute("""sql query""")` rather than in `pandas`

## Create Dataframe in `pandas`

In [36]:
# Per-agency summary: one row per (AGYSUBT, AGYSUB) with its row count in FACT
# (HEADCOUNT) and the rounded average of LOS (AVG_TENURE), largest headcount
# first.
# pd.read_sql_query() already returns a DataFrame, so the result is used
# directly instead of being re-wrapped in pd.DataFrame().
agency_headcount = pd.read_sql_query(
    """
        SELECT AGYSUBT, T1.AGYSUB, COUNT(T1.AGYSUB) AS HEADCOUNT, AVG_TENURE 
        FROM FACT T1
        JOIN AGENCY T2 ON T1.AGYSUB = T2.AGYSUB
        JOIN (SELECT AGYSUB, ROUND(AVG(LOS),2) AS AVG_TENURE FROM FACT GROUP BY AGYSUB) T3 ON T1.AGYSUB = T3.AGYSUB
        GROUP BY AGYSUBT, T1.AGYSUB, AVG_TENURE 
        ORDER BY HEADCOUNT DESC;
        """,
    conn,
)

  pd.read_sql_query(


In [179]:
# Work on an independent copy so the queried dataframe stays untouched
# (DataFrame.copy() is a deep copy by default).
agency_headcount_copy = agency_headcount.copy()

## Escaping Special Character is the Problem

In [180]:
# Workaround: drop every apostrophe from the agency names, because an
# apostrophe cannot be escaped in the hand-built .execute() script.
# Each affected name is printed before and after the change.
for idx, name in agency_headcount_copy['agysubt'].items():
    if "'" not in name:
        continue
    stripped = name.replace("'", "")
    print(idx, name)
    print(stripped)
    agency_headcount_copy.loc[idx, 'agysubt'] = stripped

167 DLOW-OFFICE OF WORKERS' COMPENSATION PROGRAMS
DLOW-OFFICE OF WORKERS COMPENSATION PROGRAMS
434 DLWB-WOMEN'S BUREAU
DLWB-WOMENS BUREAU
489 BH00-COMMISSION FOR THE PRESERVATION OF AMERICA'S HERITAGE ABROAD
BH00-COMMISSION FOR THE PRESERVATION OF AMERICAS HERITAGE ABROAD


In [174]:
# Sanity check: list any agency name that still contains an apostrophe
# (nothing is printed when none is left).
for idx, name in agency_headcount_copy['agysubt'].items():
    if "'" in name:
        print(idx, name)

## Convert Dataframe to a List of Tuples

In [41]:
import numpy as np

In [45]:
# Comma-separated column names, used as the column list of the INSERT.
cols = ','.join(agency_headcount.columns)
cols

'agysubt,agysub,headcount,avg_tenure'

In [175]:
# One plain tuple per dataframe row; show the first five as a preview.
agency_headcount_values_tuple = list(map(tuple, agency_headcount_copy.to_numpy()))
agency_headcount_values_tuple[:5]

[('VATA-VETERANS HEALTH ADMINISTRATION', 'VATA', 432908, 8.92),
 ('TR93-INTERNAL REVENUE SERVICE', 'TR93', 93937, 12.57),
 ('AF1M-AIR FORCE MATERIEL COMMAND', 'AF1M', 70084, 11.76),
 ('HSBD-CUSTOMS AND BORDER PROTECTION', 'HSBD', 65268, 13.88),
 ('HSBC-TRANSPORTATION SECURITY ADMINISTRATION', 'HSBC', 62788, 10.06)]

## `INSERT INTO` Values Row by Row

In [176]:
# Build the upload script: recreate the agency_headcount table, insert every
# row of the dataframe, and export the table to CSV.
#
# The VALUES rows are rendered by psycopg2 (cursor.mogrify) instead of by
# str() of the Python list. Python's repr switches to double quotes for any
# string that contains an apostrophe, and PostgreSQL reads a double-quoted
# token as an identifier, so such rows broke the statement. mogrify returns
# each row with its values quoted and escaped as SQL literals.
# NOTE(review): assumes the tuples hold plain Python str/int/float values that
# psycopg2 can adapt -- confirm if the dataframe dtypes change.
row_template = "(" + ", ".join(["%s"] * len(cols.split(","))) + ")"
with conn.cursor() as quoting_cur:
    # mogrify returns bytes in the connection's client encoding.
    client_codec = psycopg2.extensions.encodings[conn.encoding]
    values_sql = ", ".join(
        quoting_cur.mogrify(row_template, row).decode(client_codec)
        for row in agency_headcount_values_tuple
    )

query_insert_info = f"""
DROP TABLE IF EXISTS agency_headcount;

CREATE TABLE agency_headcount (
agysubt VARCHAR(100),
agysub VARCHAR(5),
headcount INTEGER,
avg_tenure NUMERIC(4,2)
);

INSERT INTO agency_headcount ({cols}) 
VALUES 
{values_sql};

COPY agency_headcount TO '/Users/tiangeng/Public/data/agency_headcount_202403.csv'
DELIMITER ',' CSV HEADER;
"""
# Preview only the head and the tail of the (long) script.
print(query_insert_info[:500])
print("\n...\n")
print(query_insert_info[-500:])


DROP TABLE IF EXISTS agency_headcount;

CREATE TABLE agency_headcount (
agysubt VARCHAR(100),
agysub VARCHAR(5),
headcount INTEGER,
avg_tenure NUMERIC(4,2)
);

INSERT INTO agency_headcount (agysubt,agysub,headcount,avg_tenure) 
VALUES 
('VATA-VETERANS HEALTH ADMINISTRATION', 'VATA', 432908, 8.92), ('TR93-INTERNAL REVENUE SERVICE', 'TR93', 93937, 12.57), ('AF1M-AIR FORCE MATERIEL COMMAND', 'AF1M', 70084, 11.76), ('HSBD-CUSTOMS AND BORDER PROTECTION', 'HSBD', 65268, 13.88), ('HSBC-TRANSPORTATION 

...

 1, 1.2), ('DA00-DELTA REGIONAL AUTHORITY', 'DA00', 1, 2.0), ('AG15-RURAL UTILITIES SERVICE', 'AG15', 1, 31.0), ('ARBA-U.S. ARMY INSTALLATION MANAGEMENT COMMAND', 'ARBA', 1, 13.3), ('OP00-Office of Pandemic Preparedness and Response Policy', 'OP00', 1, 0.6), ('HUVV-OFFICE OF DISASTER MANAGEMENT AND NATIONAL SECURITY', 'HUVV', 1, 15.9), ('AF3G-AIR FORCE ELEMENTS, NATO', 'AF3G', 1, 21.2);

COPY agency_headcount TO '/Users/tiangeng/Public/data/agency_headcount_202403.csv'
DELIMITER ',' CSV H

In [178]:
# Open a cursor to perform database operations
cur = conn.cursor()
try:
    # Multiple statements are ok in one execute() call
    cur.execute(query_insert_info)
except psycopg2.Error:
    # A failed statement leaves the transaction aborted, and the connection
    # rejects further commands until it is rolled back. Roll back here so the
    # connection stays usable, then surface the original error.
    conn.rollback()
    raise
else:
    # Make the changes to the database
    # Refresh tables in pgadmin, updates have been committed
    conn.commit()

In [177]:
# Discard the current transaction; run this when a previous execute() was
# aborted so the connection can be used again without reconnecting.
conn.rollback()

## `df.to_sql()` Won't Work Without ``SQLAlchemy``

In [39]:
# doesn't work without SQLAlchemy
agency_headcount.to_sql(name='agency_headcount',
                        con=conn,
                        if_exists='replace',
                        index=False)

  agency_headcount.to_sql(name=username,


DatabaseError: Execution failed on sql '
        SELECT
            name
        FROM
            sqlite_master
        WHERE
            type IN ('table', 'view')
            AND name=?;
        ': syntax error at or near ";"
LINE 8:             AND name=?;
                              ^


In [181]:
# Clean up: close the cursor first, then the connection it belongs to.
cur.close()
conn.close()