In [104]:
import warnings
warnings.filterwarnings("ignore")

from octopus.db import PostgresqlManager
from dotenv import load_dotenv

import matplotlib.pyplot as plt
from tqdm import tqdm

# Roadmap

1. Prepare links first
   - transactions (bio-ticker)
   - committee-assignment (bio-committee)
   - lobbying-on-bills (ticker-bill)
   - bill-assignments (bill-committee)
   - sponsors (bio-bill)
   - cosponsors (bio-bill
      - Since we have 115, 116, 117th committee assignments, let's cut-off from 2016-01-01 []
   - industry (ticker-NAICS) 

## Transactions 

In [105]:
# get transactions data (bioguide-ticker)
load_dotenv("/Users/syyun/Dropbox (MIT)/efd/.envlv", override=True)
pm = PostgresqlManager(dotenv_path="/Users/syyun/Dropbox (MIT)/efd/.envlv")
trans = pm.execute_sql(fetchall=True, sql=
                    """
                    with st as (
                    select distinct bioguide_id, ticker, trans_date  from "_sandbox_suyeol".senate_annual_4a saa
                        inner join  _sandbox_suyeol.senate_annual sa on sa.report_type_url = saa.report_url
                        inner join _sandbox_suyeol.senator_bioguide sb on sb.first_name = sa.first_name  and sb.last_name = sa.last_name 
                    where ticker is not null and trans_date is not null
                    )
                    , ht as (
                    select distinct bioguide_id, ticker, transaction_date as trans_date from "_sandbox_suyeol".house_fd_transactions hft 
                    	inner join "_sandbox_suyeol".house_docs_id hdi on hdi."DocID" =hft.docid 
                    	inner join "_sandbox_suyeol".house_fd_bio_brdige hfbb on hfbb.first_name = hdi."First" and hfbb.last_name =hdi."Last" 
					where ticker is not null and transaction_date is not null
					)
					, union_sh as (
					select * from st
					union
					select * from ht
					)
					select * from union_sh        
                    """)

In [106]:
import pandas as pd

# Create a DataFrame with the desired column names
trans_df = pd.DataFrame(trans, columns=['bioguide_id', 'ticker', 'transaction_date'])

# Convert transaction date to datetime
trans_df['transaction_date'] = pd.to_datetime(trans_df['transaction_date'])

# Cleanse ticker by splitting on whitespace, stripping, and taking the first part
trans_df['ticker'] = trans_df['ticker'].apply(lambda x: x.strip().split()[0].strip())

# Remove None values
trans_df = trans_df.dropna()

# Display the cleaned DataFrame
print(trans_df.shape)
trans_df.head(10)

(24675, 3)


Unnamed: 0,bioguide_id,ticker,transaction_date
0,W000802,VOD,2016-09-28
1,Y000062,YUM,2018-03-08
2,Y000062,MCD,2017-11-09
3,P000608,SYK,2015-06-23
4,C001101,UBS,2021-09-16
5,R000609,LLY,2019-02-19
6,F000462,TFC,2020-06-10
7,W000779,MAR,2020-06-05
8,C001066,DIS,2016-06-06
9,G000583,CHUY,2020-09-15


## Committee Assignments (bio-committee)

In [107]:
# get committee assignments
assign = pm.execute_sql(fetchall=True, sql=
                f"""
                select distinct committee_thomas_id, legislator_bioguide_id, congress_num  from relational___congress.committees__legislators cl
	                inner join relational___congress.legislators l on l.bioguide_id = cl.legislator_bioguide_id                """
                )

In [108]:
import pandas as pd

# Create a DataFrame with the desired column names
assign_df = pd.DataFrame(assign, columns=['committee_id', 'bioguide_id', 'congress_num'])

# Remove None values
assign_df = assign_df.dropna()

# Display the cleaned DataFrame
print(assign_df.shape)
assign_df.head(10)

(11698, 3)


Unnamed: 0,committee_id,bioguide_id,congress_num
0,SSBK08,K000393,115
1,HSBA13,P000616,116
2,SSBK12,B000944,115
3,HSAP04,R000609,115
4,SSFR,C000141,116
5,HSPW14,B001295,117
6,HSGO24,C001116,117
7,HSED02,S001206,116
8,SSEG07,L000577,117
9,HSPW,L000578,117


In [109]:
import pandas as pd

# Load your dataframe, assuming it is called df
assign_df['congress_num'] = pd.to_numeric(assign_df['congress_num']) # convert congress_num to numeric

# Define function to get start and end year of congress
def get_congress_years(congress_num):
    start_year = (congress_num - 115) * 2 + 2017
    end_year = start_year + 1
    return start_year, end_year

# Apply function to congress_num column to get year start and end dates
assign_df = assign_df.assign(congress_years=assign_df['congress_num'].apply(get_congress_years))

# Split the congress_years column into two separate columns
assign_df[['congress_year_start', 'congress_year_end']] = pd.DataFrame(assign_df['congress_years'].tolist(), index=assign_df.index)

assign_df['congress_year_start'] = pd.to_datetime(assign_df['congress_year_start'].astype(str) + '-01-03')
assign_df['congress_year_end'] = pd.to_datetime((assign_df['congress_year_end']+1).astype(str) + '-01-03')


# Drop the congress_years column
assign_df.drop('congress_years', axis=1, inplace=True)

assign_df.head(10)


Unnamed: 0,committee_id,bioguide_id,congress_num,congress_year_start,congress_year_end
0,SSBK08,K000393,115,2017-01-03,2019-01-03
1,HSBA13,P000616,116,2019-01-03,2021-01-03
2,SSBK12,B000944,115,2017-01-03,2019-01-03
3,HSAP04,R000609,115,2017-01-03,2019-01-03
4,SSFR,C000141,116,2019-01-03,2021-01-03
5,HSPW14,B001295,117,2021-01-03,2023-01-03
6,HSGO24,C001116,117,2021-01-03,2023-01-03
7,HSED02,S001206,116,2019-01-03,2021-01-03
8,SSEG07,L000577,117,2021-01-03,2023-01-03
9,HSPW,L000578,117,2021-01-03,2023-01-03


## Lobby on Bills (ticker-bill)

In [110]:
# ticker's lobbying on bills
load_dotenv("/Users/syyun/Dropbox (MIT)/efd/.envlv", override=True)
pm = PostgresqlManager(dotenv_path="/Users/syyun/Dropbox (MIT)/efd/.envlv")
lob = pm.execute_sql(fetchall=True, sql=
                f"""
                with lobby_on_bill as (
                    select distinct concat(bill_type, bill_number_chain, '-', congress_num) as bill_id, ticker, f.dt_posted  from link___lda__congress."_issue_paragraphs__bills" ipb 
                    inner join relational___lda.clientships c on c.clientship_id  = ipb.clientship_id 
                    inner join relational___lda.filings f on f.filing_uuid  = ipb.filing_uuid 
                    inner join "_sandbox_suyeol".client_ticker ct on ct.client_name = c.client_name 
                    where ticker != 'Not found' and ticker is not null
                    )
                    select * from lobby_on_bill
                """
                )

In [111]:
# Create the Pandas dataframe
lob_df = pd.DataFrame(lob, columns=['bill_id', 'ticker', 'datetime'])

# Convert datetime column to date only
lob_df['date'] = pd.to_datetime(lob_df['datetime']).dt.date

# Drop the datetime column
lob_df.drop('datetime', axis=1, inplace=True)

# Drop None values
lob_df = lob_df.dropna()

# Print the resulting dataframe
print(lob_df.shape)
lob_df.head(10)

(419224, 3)


Unnamed: 0,bill_id,ticker,date
0,hr1332-116,HCA,2019-10-18
1,hr3962-111,IVCRQ,2010-04-16
2,hr1-115,INVVY,2018-01-18
3,hr3590-111,SVNDY,2010-04-20
4,hr842-117,HLF,2021-10-19
5,hr2113-116,CSL,2019-10-17
6,s684-116,HMRK,2019-10-18
7,hr5497-112,PG,2012-10-19
8,hr767-108,RTX,2005-02-11
9,hr2471-111,PNW,2010-01-20


## Bill Assignments (bill-committee)

In [112]:
# bill-assignments (bill-committee)
ba = pm.execute_sql(fetchall=True, sql=
                f"""
select b.bill_id, committee_id, b.introduced_at  from relational___congress.bills__committees bc
      	inner join relational___congress.bills b on b.bill_id = bc.bill_id 
                """)

In [113]:
# Create the Pandas dataframe
ba_df = pd.DataFrame(ba, columns=['bill_id', 'committee_id', 'intro_date'])

# Convert datetime column to date only
ba_df['date'] = pd.to_datetime(ba_df['intro_date']).dt.date

# Drop the datetime column
ba_df.drop('intro_date', axis=1, inplace=True)

# Drop None values
ba_df = ba_df.dropna()

# Print the resulting dataframe
print(ba_df.shape)
ba_df.head(10)

(546703, 3)


Unnamed: 0,bill_id,committee_id,date
0,hres607-93,HSFA,1973-10-17
1,hres556-93,HSFA,1973-09-20
2,hres322-93,HSRU,1973-03-22
3,hres18-93,HSRU,1973-01-03
4,hres482-93,HSFA,1973-07-10
5,hres350-93,HSWM,1973-04-10
6,hres528-93,HSAS,1973-08-03
7,hres1408-93,HSFA,1974-10-02
8,hres226-93,HSRU,1973-02-20
9,hres1244-93,HSRU,1974-07-18


## Sponsors (bio-bill)

In [114]:
# bill-sponsor
bs = pm.execute_sql(fetchall=True, sql=
                f"""
      	select b.bill_id, legislator_bioguide_id, introduced_at from relational___congress.bills__legislators bl
      		inner join relational___congress.bills b on bl.bill_id = b.bill_id 
      	where relation = 'sponsor'
                """)

In [115]:
# Create the Pandas dataframe
bs_df = pd.DataFrame(bs, columns=['bill_id', 'bioguide_id', 'intro_date'])

# Convert datetime column to date only
bs_df['date'] = pd.to_datetime(bs_df['intro_date']).dt.date

# Drop the datetime column
bs_df.drop('intro_date', axis=1, inplace=True)

# Drop None values
bs_df = bs_df.dropna()

# Print the resulting dataframe
print(bs_df.shape)
bs_df.head(10)

(335837, 3)


Unnamed: 0,bill_id,bioguide_id,date
0,hconres100-107,D000533,2001-04-04
1,hconres100-100,R000053,1987-04-08
2,hconres100-101,Y000014,1989-04-18
3,hconres100-102,M000590,1991-03-19
4,hconres100-103,B000403,1993-05-12
5,hconres100-104,H000981,1995-09-06
6,hconres100-111,B001230,2009-04-21
7,hconres100-112,Q000024,2012-02-14
8,hconres100-113,F000455,2014-05-22
9,hconres100-114,R000580,2015-12-01


## Co-sponsors (bio-bill)

In [116]:
# bill-cosponsor
bcs = pm.execute_sql(fetchall=True, sql=
                f"""
      	select b.bill_id, legislator_bioguide_id, introduced_at from relational___congress.bills__legislators bl
      		inner join relational___congress.bills b on bl.bill_id = b.bill_id 
      	where relation = 'cosponsor'
                """)

In [117]:
# Create the Pandas dataframe
bcs_df = pd.DataFrame(bcs, columns=['bill_id', 'bioguide_id', 'intro_date'])

# Convert datetime column to date only
bcs_df['date'] = pd.to_datetime(bcs_df['intro_date']).dt.date

# Drop the datetime column
bcs_df.drop('intro_date', axis=1, inplace=True)

# Drop None values
bcs_df = bcs_df.dropna()

# Print the resulting dataframe
print(bcs_df.shape)
bcs_df.head(10)

(3634549, 3)


Unnamed: 0,bill_id,bioguide_id,date
0,hconres100-100,G000445,1987-04-08
1,hconres100-101,B000403,1989-04-18
2,hconres100-101,M000015,1989-04-18
3,hconres100-102,A000014,1991-03-19
4,hconres100-102,A000022,1991-03-19
5,hconres100-102,A000189,1991-03-19
6,hconres100-102,B000229,1991-03-19
7,hconres100-102,B000463,1991-03-19
8,hconres100-102,B000551,1991-03-19
9,hconres100-102,B000586,1991-03-19


## Ticker-NAICS

In [118]:
# ticker-naics
tn = pm.execute_sql(fetchall=True, sql=
                f"""
                with n1 as (
                select ticker, naics1 as naics  from "_sandbox_suyeol".ticker_naics tn 
                    inner join "_sandbox_suyeol".ticker_naics_url tnu on tnu.naics_url =tn.naics_url 
                where ticker is not null and naics1 is not null and naics1 != ''
                )
                , n2 as (
                select ticker, naics2 as naics from "_sandbox_suyeol".ticker_naics tn 
                    inner join "_sandbox_suyeol".ticker_naics_url tnu on tnu.naics_url =tn.naics_url 
                where ticker is not null and naics2 is not null and naics2 != ''
                )
                , z as (
                select ticker, naics as naics from "_sandbox_suyeol".ticker_naics_zoom tnz 
                where ticker is not null and naics is not null and naics != ''

                )
                select * from n1
                union
                select * from n2
                union
                select * from z
                """)

In [119]:
# Create the Pandas dataframe
tn_df = pd.DataFrame(tn, columns=['ticker', 'naics'])

# Drop None values
tn_df = tn_df.dropna()

# Sort by ticker asc
tn_df = tn_df.sort_values(by=['ticker'])

# Filter rows where the "ticker" column starts with a capital letter (A-Z)
tn_df = tn_df[tn_df["ticker"].str.match(r'^[A-Z]')]

# Replace any non-alphabetic characters in the "ticker" column with a period (.)
tn_df["ticker"] = tn_df["ticker"].str.replace(r'[^A-Za-z]', '.', regex=True)


# Print the resulting dataframe
print(tn_df.shape)
tn_df.head(10)

(4147, 2)


Unnamed: 0,ticker,naics
581,A,334516
438,A,513210
3036,AA,331318
1122,AA,331523
1056,AACFX,523940
2185,AAIC.B,541720
1459,AAIEX,52525
2621,AAIGF,52524
147,AAL,481111
3032,AAL,488119


## Generate Graph

In [120]:
from torch_geometric.data import HeteroData
data = HeteroData()

### Add Nodes

- Add Bio Nodes

In [121]:
import pandas as pd
import numpy as np

congressperson_nodes = set(trans_df["bioguide_id"]).union(assign_df["bioguide_id"]).union(
    bs_df["bioguide_id"]).union(bcs_df["bioguide_id"])

congressperson_nodes = sorted(list(congressperson_nodes))

# Create a dictionary mapping unique congresspeople to their one-hot encoded vectors
num_nodes = len(congressperson_nodes)
unique_congresspeople = {name: idx for idx, name in enumerate(congressperson_nodes)}
print(unique_congresspeople)
one_hot_vectors = np.eye(num_nodes, dtype=np.float32)

print(one_hot_vectors.shape)

# Add node-type attribute
congressperson_attributes = [
    (name, {"cgp_one_hot": one_hot_vectors[idx], "node_type": "congressperson"})
    for idx, name in enumerate(congressperson_nodes)
]
print(congressperson_attributes) # this is only for printing

# assign nodes
data['congressperson'].x = one_hot_vectors
print(data)

{'A000009': 0, 'A000014': 1, 'A000017': 2, 'A000018': 3, 'A000022': 4, 'A000031': 5, 'A000052': 6, 'A000055': 7, 'A000062': 8, 'A000069': 9, 'A000073': 10, 'A000076': 11, 'A000103': 12, 'A000109': 13, 'A000118': 14, 'A000121': 15, 'A000127': 16, 'A000139': 17, 'A000148': 18, 'A000170': 19, 'A000177': 20, 'A000189': 21, 'A000195': 22, 'A000202': 23, 'A000207': 24, 'A000208': 25, 'A000209': 26, 'A000210': 27, 'A000211': 28, 'A000212': 29, 'A000213': 30, 'A000214': 31, 'A000215': 32, 'A000216': 33, 'A000217': 34, 'A000219': 35, 'A000220': 36, 'A000221': 37, 'A000222': 38, 'A000224': 39, 'A000226': 40, 'A000329': 41, 'A000337': 42, 'A000355': 43, 'A000356': 44, 'A000357': 45, 'A000358': 46, 'A000359': 47, 'A000360': 48, 'A000361': 49, 'A000362': 50, 'A000363': 51, 'A000364': 52, 'A000365': 53, 'A000366': 54, 'A000367': 55, 'A000368': 56, 'A000369': 57, 'A000370': 58, 'A000371': 59, 'A000372': 60, 'A000373': 61, 'A000374': 62, 'A000375': 63, 'A000376': 64, 'A000377': 65, 'A000378': 66, 'B00

- Add Committee Nodes

In [122]:
import pandas as pd
import numpy as np

# Extract committee nodes
committee_nodes = set(assign_df["committee_id"]).union(ba_df["committee_id"])

# Remove None values and sort committee_nodes in ascending order
committee_nodes = sorted([node for node in committee_nodes if node is not None])

# Create a dictionary mapping unique committees to their one-hot encoded vectors
num_nodes = len(committee_nodes)
unique_committees = {name: idx for idx, name in enumerate(committee_nodes)}
print(unique_committees)
one_hot_vectors = np.eye(num_nodes, dtype=np.float32)

print(one_hot_vectors.shape)

# Add node-type attribute
committee_attributes = [
    (name, {"committee_one_hot": one_hot_vectors[idx], "node_type": "committee"})
    for idx, name in enumerate(committee_nodes)
]
# print(committee_attributes)  # this is only for printing

# Assign committee nodes
data['committee'].x = one_hot_vectors
print(data)

{'HHAH': 0, 'HLCQ': 1, 'HLET': 2, 'HLIG': 3, 'HLIG01': 4, 'HLIG02': 5, 'HLIG03': 6, 'HLIG04': 7, 'HLIG05': 8, 'HLIG06': 9, 'HLIG08': 10, 'HLIG10': 11, 'HLOC': 12, 'HSAG': 13, 'HSAG01': 14, 'HSAG02': 15, 'HSAG03': 16, 'HSAG04': 17, 'HSAG05': 18, 'HSAG06': 19, 'HSAG07': 20, 'HSAG08': 21, 'HSAG14': 22, 'HSAG15': 23, 'HSAG16': 24, 'HSAG20': 25, 'HSAG21': 26, 'HSAG22': 27, 'HSAG23': 28, 'HSAG24': 29, 'HSAG25': 30, 'HSAG26': 31, 'HSAG27': 32, 'HSAG28': 33, 'HSAG29': 34, 'HSAP': 35, 'HSAP01': 36, 'HSAP02': 37, 'HSAP03': 38, 'HSAP04': 39, 'HSAP05': 40, 'HSAP06': 41, 'HSAP07': 42, 'HSAP08': 43, 'HSAP09': 44, 'HSAP10': 45, 'HSAP11': 46, 'HSAP12': 47, 'HSAP13': 48, 'HSAP15': 49, 'HSAP18': 50, 'HSAP19': 51, 'HSAP20': 52, 'HSAP23': 53, 'HSAP24': 54, 'HSAS': 55, 'HSAS01': 56, 'HSAS02': 57, 'HSAS03': 58, 'HSAS04': 59, 'HSAS05': 60, 'HSAS06': 61, 'HSAS07': 62, 'HSAS20': 63, 'HSAS25': 64, 'HSAS26': 65, 'HSAS27': 66, 'HSAS28': 67, 'HSAS29': 68, 'HSAS30': 69, 'HSAS35': 70, 'HSAT': 71, 'HSBA': 72, 'HSBA01

- Add Company Nodes

In [123]:
import pandas as pd
import numpy as np

# Extract ticker nodes
ticker_nodes = set(trans_df["ticker"]).union(lob_df["ticker"]).union(tn_df["ticker"])

# Remove None values and sort ticker_nodes in ascending order
ticker_nodes = sorted([node for node in ticker_nodes if node is not None])

# Create a dictionary mapping unique tickers to their one-hot encoded vectors
num_nodes = len(ticker_nodes)
unique_tickers = {name: idx for idx, name in enumerate(ticker_nodes)}
print(unique_tickers)
one_hot_vectors = np.eye(num_nodes, dtype=np.float32)

print(one_hot_vectors.shape)

# Add node-type attribute
ticker_attributes = [
    (name, {"ticker_one_hot": one_hot_vectors[idx], "node_type": "ticker"})
    for idx, name in enumerate(ticker_nodes)
]
print(ticker_attributes)  # this is only for printing

# Assign ticker nodes
data['ticker'].x = one_hot_vectors
print(data)

{'000150': 0, '000660': 1, '005720': 2, '005930': 3, '028260': 4, '066570': 5, '137400': 6, '1821': 7, '1901': 8, '2010': 9, '2090': 10, '2454': 11, '2461': 12, '2498': 13, '3001': 14, '300532': 15, '3445': 16, '352940': 17, '3696': 18, '373220': 19, '3962': 20, '4107': 21, '4312': 22, '4390': 23, '4641': 24, '4813': 25, '4THDIM': 26, '524818': 27, '536868': 28, '538180': 29, '539255': 30, '540204': 31, '5947': 32, '601012': 33, '603028': 34, '6335': 35, '65I': 36, '6677': 37, '6701': 38, '688408': 39, '7701': 40, '7707': 41, '9450': 42, '9993': 43, '9IL': 44, 'A': 45, 'AA': 46, 'AAA': 47, 'AABB': 48, 'AACFX': 49, 'AAGIY': 50, 'AAI': 51, 'AAIC': 52, 'AAIC$B': 53, 'AAIC.B': 54, 'AAIEX': 55, 'AAIGF': 56, 'AAJ': 57, 'AAL': 58, 'AAM$A': 59, 'AAM.A': 60, 'AAN': 61, 'AAON': 62, 'AAP': 63, 'AAPL': 64, 'AAT': 65, 'AATC': 66, 'AAWW': 67, 'ABALX': 68, 'ABB': 69, 'ABBV': 70, 'ABC': 71, 'ABCB': 72, 'ABCIX': 73, 'ABCO': 74, 'ABEV': 75, 'ABEYX': 76, 'ABI': 77, 'ABM': 78, 'ABMD': 79, 'ABNB': 80, 'ABS

- Add Bill nodes

In [124]:
import pandas as pd
import numpy as np

# Define a date filter
date_filter = pd.Timestamp("2016-01-01")

# Filter bill nodes based on the date filter
filtered_lob_df = lob_df[lob_df["date"] > date_filter]
filtered_ba_df = ba_df[ba_df["date"] > date_filter]
filtered_bs_df = bs_df[bs_df["date"] > date_filter]
filtered_cbs_df = bcs_df[bcs_df["date"] > date_filter]

# Extract bill nodes
bill_nodes = set(filtered_lob_df["bill_id"]).union(filtered_ba_df["bill_id"]).union(filtered_bs_df["bill_id"]).union(filtered_cbs_df["bill_id"])

# Remove None values and sort bill_nodes in ascending order
bill_nodes = sorted([node for node in bill_nodes if node is not None])

# Create a dictionary mapping unique bills to their one-hot encoded vectors
num_nodes = len(bill_nodes)
unique_bills = {name: idx for idx, name in enumerate(bill_nodes)}
print(unique_bills)
one_hot_vectors = np.eye(num_nodes, dtype=np.float32)

print(one_hot_vectors.shape)

# Add node-type attribute
bill_attributes = [
    (name, {"bill_one_hot": one_hot_vectors[idx], "node_type": "bill"})
    for idx, name in enumerate(bill_nodes)
]
print(bill_attributes)  # this is only for printing

{'hconres1-115': 0, 'hconres1-116': 1, 'hconres1-117': 2, 'hconres10-115': 3, 'hconres10-116': 4, 'hconres10-117': 5, 'hconres100-115': 6, 'hconres100-116': 7, 'hconres101-115': 8, 'hconres101-116': 9, 'hconres102-115': 10, 'hconres102-116': 11, 'hconres103-115': 12, 'hconres103-116': 13, 'hconres104-115': 14, 'hconres104-116': 15, 'hconres105-114': 16, 'hconres105-115': 17, 'hconres105-116': 18, 'hconres106-114': 19, 'hconres106-115': 20, 'hconres106-116': 21, 'hconres107-114': 22, 'hconres107-115': 23, 'hconres107-116': 24, 'hconres108-114': 25, 'hconres108-115': 26, 'hconres108-116': 27, 'hconres109-114': 28, 'hconres109-115': 29, 'hconres109-116': 30, 'hconres11-115': 31, 'hconres11-116': 32, 'hconres11-117': 33, 'hconres110-114': 34, 'hconres110-115': 35, 'hconres110-116': 36, 'hconres111-114': 37, 'hconres111-115': 38, 'hconres111-116': 39, 'hconres112-114': 40, 'hconres112-115': 41, 'hconres112-116': 42, 'hconres113-114': 43, 'hconres113-115': 44, 'hconres113-116': 45, 'hconres1

In [125]:
# Assign bill nodes
data['bill'].x = one_hot_vectors
print(data)

HeteroData(
  [1mcongressperson[0m={ x=[2431, 2431] },
  [1mcommittee[0m={ x=[556, 556] },
  [1mticker[0m={ x=[4202, 4202] },
  [1mbill[0m={ x=[47767, 47767] }
)


- Add NAICS nodes

In [126]:
import pandas as pd
import numpy as np

# Extract ticker nodes
naics_nodes = set(tn_df["naics"])

# Remove None values and sort ticker_nodes in ascending order
naics_nodes = sorted([node for node in naics_nodes if node is not None])

# Create a dictionary mapping unique tickers to their one-hot encoded vectors
num_nodes = len(naics_nodes)
unique_naics = {name: idx for idx, name in enumerate(naics_nodes)}
print(unique_naics)
one_hot_vectors = np.eye(num_nodes, dtype=np.float32)

print(one_hot_vectors.shape)

# Add node-type attribute
naics_attributes = [
    (name, {"naics_one_hot": one_hot_vectors[idx], "node_type": "naics"})
    for idx, name in enumerate(naics_nodes)
]
# print(naics_attributes)  # this is only for printing

# Assign ticker nodes
data['naics'].x = one_hot_vectors
print(data)

{'10104': 0, '11111': 1, '11112': 2, '111199': 3, '111211': 4, '111219': 5, '111335': 6, '111422': 7, '112320': 8, '112340': 9, '113110': 10, '113210': 11, '114112': 12, '115112': 13, '115116': 14, '115310': 15, '211120': 16, '211130': 17, '21211': 18, '212111': 19, '212114': 20, '212115': 21, '21212': 22, '21213': 23, '212210': 24, '212220': 25, '212230': 26, '212290': 27, '212291': 28, '212312': 29, '212313': 30, '212323': 31, '212390': 32, '212391': 33, '213111': 34, '213112': 35, '221111': 36, '221112': 37, '221114': 38, '221115': 39, '221117': 40, '221118': 41, '221121': 42, '221122': 43, '221210': 44, '221310': 45, '221320': 46, '22221': 47, '23236': 48, '23237': 49, '23238': 50, '236115': 51, '236117': 52, '236118': 53, '236220': 54, '237110': 55, '237120': 56, '237130': 57, '237210': 58, '237310': 59, '237990': 60, '238120': 61, '238130': 62, '238210': 63, '238220': 64, '238290': 65, '28282': 66, '28283': 67, '311211': 68, '311221': 69, '311224': 70, '311225': 71, '311230': 72,

## Add Edges

- Reference date = 2016-01-01

In [127]:
# Reference date
ref_date = pd.Timestamp(2016, 1, 1)

- Add Trnasaction Edges (bio-ticker)

In [128]:
import torch

# Create a list of tuples representing the edges with their corresponding date attributes
edges_with_dates = [
    (unique_congresspeople[row['bioguide_id']], unique_tickers[row['ticker']], row['transaction_date'])
    for _, row in trans_df.iterrows()
]

# Calculate the elapsed days from the reference date
elapsed_days = np.array([(edge[2] - ref_date).days for edge in edges_with_dates], dtype=np.float32).reshape(-1, 1)
print(elapsed_days)

# Assign the edge_index for the corresponding edge type in the data object
data['congressperson', 'buy-sell', 'ticker'].edge_index = torch.tensor(
    [(edge[0], edge[1]) for edge in edges_with_dates],
    dtype=torch.long
).t().contiguous()

# Assign the edge_attr (elapsed_days attribute) for the corresponding edge type in the data object
data['congressperson', 'buy-sell', 'ticker'].edge_attr = torch.tensor(elapsed_days, dtype=torch.float32)

print(data)

[[ 271.]
 [ 797.]
 [ 678.]
 ...
 [1168.]
 [1466.]
 [1250.]]
HeteroData(
  [1mcongressperson[0m={ x=[2431, 2431] },
  [1mcommittee[0m={ x=[556, 556] },
  [1mticker[0m={ x=[4202, 4202] },
  [1mbill[0m={ x=[47767, 47767] },
  [1mnaics[0m={ x=[744, 744] },
  [1m(congressperson, buy-sell, ticker)[0m={
    edge_index=[2, 24675],
    edge_attr=[24675, 1]
  }
)


- Add Committee Assignment Edges (bio-committee)

In [129]:
# Create a list of tuples representing the edges with their corresponding start and end date attributes
edges_with_dates = [
    (unique_congresspeople[row['bioguide_id']], unique_committees[row['committee_id']], row['congress_year_start'], row['congress_year_end'])
    for _, row in assign_df.iterrows()
]

# Calculate the elapsed days from the reference date for start and end dates
elapsed_start_days = np.array([(edge[2] - ref_date).days for edge in edges_with_dates], dtype=np.float32).reshape(-1, 1)
elapsed_end_days = np.array([(edge[3] - ref_date).days for edge in edges_with_dates], dtype=np.float32).reshape(-1, 1)

# Concatenate elapsed_start_days and elapsed_end_days to form a 2D array
elapsed_days = np.concatenate((elapsed_start_days, elapsed_end_days), axis=1)

# Assign the edge_index for the corresponding edge type in the data object
data['congressperson', 'assignment', 'committee'].edge_index = torch.tensor(
    [(edge[0], edge[1]) for edge in edges_with_dates],
    dtype=torch.long
).t().contiguous()

# Assign the edge_attr (elapsed_days attribute) for the corresponding edge type in the data object
data['congressperson', 'assignment', 'committee'].edge_attr = torch.tensor(elapsed_days, dtype=torch.float32)

print(data)

HeteroData(
  [1mcongressperson[0m={ x=[2431, 2431] },
  [1mcommittee[0m={ x=[556, 556] },
  [1mticker[0m={ x=[4202, 4202] },
  [1mbill[0m={ x=[47767, 47767] },
  [1mnaics[0m={ x=[744, 744] },
  [1m(congressperson, buy-sell, ticker)[0m={
    edge_index=[2, 24675],
    edge_attr=[24675, 1]
  },
  [1m(congressperson, assignment, committee)[0m={
    edge_index=[2, 11698],
    edge_attr=[11698, 2]
  }
)


- Add Lobbying Edges (ticker-bill)

In [130]:
# Convert 'date' column to Pandas Timestamp type
lob_df['date'] = pd.to_datetime(lob_df['date'])

# Filter lob_df for dates after the reference date
filtered_lob_df = lob_df[lob_df["date"] > ref_date]

# Create edge_index for ticker lobbying on bill_id
filtered_lob_df["ticker_idx"] = filtered_lob_df["ticker"].map(unique_tickers)
filtered_lob_df["bill_idx"] = filtered_lob_df["bill_id"].map(unique_bills)

# Calculate the number of days between the 'date' column and the reference date
filtered_lob_df["elapsed_days"] = (filtered_lob_df["date"] - ref_date).dt.days

# Create edge attributes as a tensor
edge_attr = torch.tensor(filtered_lob_df["elapsed_days"].values, dtype=torch.float32).unsqueeze(-1)

# Assign edge_index and edge_attr to the data object
edge_index = torch.tensor(filtered_lob_df[["ticker_idx", "bill_idx"]].values, dtype=torch.long).T
data['ticker', 'lobbies_on', 'bill'].edge_index = edge_index
data['ticker', 'lobbies_on', 'bill'].edge_attr = edge_attr

print(data)

HeteroData(
  [1mcongressperson[0m={ x=[2431, 2431] },
  [1mcommittee[0m={ x=[556, 556] },
  [1mticker[0m={ x=[4202, 4202] },
  [1mbill[0m={ x=[47767, 47767] },
  [1mnaics[0m={ x=[744, 744] },
  [1m(congressperson, buy-sell, ticker)[0m={
    edge_index=[2, 24675],
    edge_attr=[24675, 1]
  },
  [1m(congressperson, assignment, committee)[0m={
    edge_index=[2, 11698],
    edge_attr=[11698, 2]
  },
  [1m(ticker, lobbies_on, bill)[0m={
    edge_index=[2, 148487],
    edge_attr=[148487, 1]
  }
)


- Add bill assignment edges (bill-committee)

In [131]:
# Convert 'date' column to Pandas Timestamp type
ba_df["date"] = pd.to_datetime(ba_df["date"])

# Filter ba_df for dates after the reference date
filtered_ba_df = ba_df[ba_df["date"] > ref_date]

# Map nodes to their corresponding indices
filtered_ba_df["bill_idx"] = filtered_ba_df["bill_id"].map(unique_bills)
filtered_ba_df["committee_idx"] = filtered_ba_df["committee_id"].map(unique_committees)

# Create edge_index and edge_attr
edge_index = filtered_ba_df[["bill_idx", "committee_idx"]].T.values
edge_attr = (filtered_ba_df["date"] - ref_date).dt.days.values.reshape(-1, 1)

# Assign edges and edge attributes
data['bill', 'assigned_to', 'committee'].edge_index = torch.tensor(edge_index, dtype=torch.long)
data['bill', 'assigned_to', 'committee'].edge_attr = torch.tensor(edge_attr, dtype=torch.float32)

print(data)

HeteroData(
  [1mcongressperson[0m={ x=[2431, 2431] },
  [1mcommittee[0m={ x=[556, 556] },
  [1mticker[0m={ x=[4202, 4202] },
  [1mbill[0m={ x=[47767, 47767] },
  [1mnaics[0m={ x=[744, 744] },
  [1m(congressperson, buy-sell, ticker)[0m={
    edge_index=[2, 24675],
    edge_attr=[24675, 1]
  },
  [1m(congressperson, assignment, committee)[0m={
    edge_index=[2, 11698],
    edge_attr=[11698, 2]
  },
  [1m(ticker, lobbies_on, bill)[0m={
    edge_index=[2, 148487],
    edge_attr=[148487, 1]
  },
  [1m(bill, assigned_to, committee)[0m={
    edge_index=[2, 75626],
    edge_attr=[75626, 1]
  }
)


- Add sponsor edges (bio-bill)

In [132]:
# Convert 'date' column to Pandas Timestamp type
bs_df["date"] = pd.to_datetime(bs_df["date"])

# Filter sponsor_df for dates after the reference date
filtered_sponsor_df = bs_df[bs_df["date"] > ref_date]

# Map nodes to their corresponding indices
filtered_sponsor_df["congressperson_idx"] = filtered_sponsor_df["bioguide_id"].map(unique_congresspeople)
filtered_sponsor_df["bill_idx"] = filtered_sponsor_df["bill_id"].map(unique_bills)

# Create edge_index and edge_attr
edge_index = filtered_sponsor_df[["congressperson_idx", "bill_idx"]].T.values
edge_attr = (filtered_sponsor_df["date"] - ref_date).dt.days.values.reshape(-1, 1)

# Assign edges and edge attributes
data['congressperson', 'sponsors', 'bill'].edge_index = torch.tensor(edge_index, dtype=torch.long)
data['congressperson', 'sponsors', 'bill'].edge_attr = torch.tensor(edge_attr, dtype=torch.float32)

print(data)

HeteroData(
  [1mcongressperson[0m={ x=[2431, 2431] },
  [1mcommittee[0m={ x=[556, 556] },
  [1mticker[0m={ x=[4202, 4202] },
  [1mbill[0m={ x=[47767, 47767] },
  [1mnaics[0m={ x=[744, 744] },
  [1m(congressperson, buy-sell, ticker)[0m={
    edge_index=[2, 24675],
    edge_attr=[24675, 1]
  },
  [1m(congressperson, assignment, committee)[0m={
    edge_index=[2, 11698],
    edge_attr=[11698, 2]
  },
  [1m(ticker, lobbies_on, bill)[0m={
    edge_index=[2, 148487],
    edge_attr=[148487, 1]
  },
  [1m(bill, assigned_to, committee)[0m={
    edge_index=[2, 75626],
    edge_attr=[75626, 1]
  },
  [1m(congressperson, sponsors, bill)[0m={
    edge_index=[2, 45513],
    edge_attr=[45513, 1]
  }
)


- Add cosponsor edges (bio-bill)

In [133]:
# Load data
cosponsor_df = bcs_df  # Assuming bcs_df is already loaded

# Convert 'date' column to Pandas Timestamp type
cosponsor_df["date"] = pd.to_datetime(cosponsor_df["date"])

# Filter cosponsor_df for dates after the reference date
filtered_cosponsor_df = cosponsor_df[cosponsor_df["date"] > ref_date]

# Map nodes to their corresponding indices
filtered_cosponsor_df["congressperson_idx"] = filtered_cosponsor_df["bioguide_id"].map(unique_congresspeople)
filtered_cosponsor_df["bill_idx"] = filtered_cosponsor_df["bill_id"].map(unique_bills)

# Create edge_index and edge_attr
edge_index = filtered_cosponsor_df[["congressperson_idx", "bill_idx"]].T.values
edge_attr = (filtered_cosponsor_df["date"] - ref_date).dt.days.values.reshape(-1, 1)

# Assign edges and edge attributes
data['congressperson', 'cosponsors', 'bill'].edge_index = torch.tensor(edge_index, dtype=torch.long)
data['congressperson', 'cosponsors', 'bill'].edge_attr = torch.tensor(edge_attr, dtype=torch.float32)

print(data)


HeteroData(
  [1mcongressperson[0m={ x=[2431, 2431] },
  [1mcommittee[0m={ x=[556, 556] },
  [1mticker[0m={ x=[4202, 4202] },
  [1mbill[0m={ x=[47767, 47767] },
  [1mnaics[0m={ x=[744, 744] },
  [1m(congressperson, buy-sell, ticker)[0m={
    edge_index=[2, 24675],
    edge_attr=[24675, 1]
  },
  [1m(congressperson, assignment, committee)[0m={
    edge_index=[2, 11698],
    edge_attr=[11698, 2]
  },
  [1m(ticker, lobbies_on, bill)[0m={
    edge_index=[2, 148487],
    edge_attr=[148487, 1]
  },
  [1m(bill, assigned_to, committee)[0m={
    edge_index=[2, 75626],
    edge_attr=[75626, 1]
  },
  [1m(congressperson, sponsors, bill)[0m={
    edge_index=[2, 45513],
    edge_attr=[45513, 1]
  },
  [1m(congressperson, cosponsors, bill)[0m={
    edge_index=[2, 561565],
    edge_attr=[561565, 1]
  }
)


- Add ticker-NAICS edges (ticker-NAICS)

In [134]:
# Map nodes to their corresponding indices
tn_df["ticker_idx"] = tn_df["ticker"].map(unique_tickers)
tn_df["naics_idx"] = tn_df["naics"].map(unique_naics)

# Create edge_index (no edge_attr in this case)
edge_index = tn_df[["ticker_idx", "naics_idx"]].T.values

# Assign edges to the data object (no edge attributes for this edge type)
data['ticker', 'classified', 'naics'].edge_index = torch.tensor(edge_index, dtype=torch.long)

print(data)

HeteroData(
  [1mcongressperson[0m={ x=[2431, 2431] },
  [1mcommittee[0m={ x=[556, 556] },
  [1mticker[0m={ x=[4202, 4202] },
  [1mbill[0m={ x=[47767, 47767] },
  [1mnaics[0m={ x=[744, 744] },
  [1m(congressperson, buy-sell, ticker)[0m={
    edge_index=[2, 24675],
    edge_attr=[24675, 1]
  },
  [1m(congressperson, assignment, committee)[0m={
    edge_index=[2, 11698],
    edge_attr=[11698, 2]
  },
  [1m(ticker, lobbies_on, bill)[0m={
    edge_index=[2, 148487],
    edge_attr=[148487, 1]
  },
  [1m(bill, assigned_to, committee)[0m={
    edge_index=[2, 75626],
    edge_attr=[75626, 1]
  },
  [1m(congressperson, sponsors, bill)[0m={
    edge_index=[2, 45513],
    edge_attr=[45513, 1]
  },
  [1m(congressperson, cosponsors, bill)[0m={
    edge_index=[2, 561565],
    edge_attr=[561565, 1]
  },
  [1m(ticker, classified, naics)[0m={ edge_index=[2, 4147] }
)


In [135]:
import torch_geometric.transforms as T
data = T.ToUndirected()(data)

# PICKLE!

In [136]:
import pickle

# Define the data to be pickled
data_to_pickle = {
    "hetero_graph": data,
    "unique_tickers": unique_tickers,
    "unique_congresspeople": unique_congresspeople,
    "unique_committees": unique_committees,
    "unique_bills": unique_bills,
    "unique_naics": unique_naics
}

# Specify the file name for the pickle file
pickle_file = "hetero_graph_data.pkl"

# Open the file in binary write mode and pickle the data
with open(pickle_file, "wb") as f:
    pickle.dump(data_to_pickle, f)

print(f"Data has been pickled to {pickle_file}")


Data has been pickled to hetero_graph_data.pkl


# Train

In [137]:
data

HeteroData(
  [1mcongressperson[0m={ x=[2431, 2431] },
  [1mcommittee[0m={ x=[556, 556] },
  [1mticker[0m={ x=[4202, 4202] },
  [1mbill[0m={ x=[47767, 47767] },
  [1mnaics[0m={ x=[744, 744] },
  [1m(congressperson, buy-sell, ticker)[0m={
    edge_index=[2, 24675],
    edge_attr=[24675, 1]
  },
  [1m(congressperson, assignment, committee)[0m={
    edge_index=[2, 11698],
    edge_attr=[11698, 2]
  },
  [1m(ticker, lobbies_on, bill)[0m={
    edge_index=[2, 148487],
    edge_attr=[148487, 1]
  },
  [1m(bill, assigned_to, committee)[0m={
    edge_index=[2, 75626],
    edge_attr=[75626, 1]
  },
  [1m(congressperson, sponsors, bill)[0m={
    edge_index=[2, 45513],
    edge_attr=[45513, 1]
  },
  [1m(congressperson, cosponsors, bill)[0m={
    edge_index=[2, 561565],
    edge_attr=[561565, 1]
  },
  [1m(ticker, classified, naics)[0m={ edge_index=[2, 4147] },
  [1m(ticker, rev_buy-sell, congressperson)[0m={
    edge_index=[2, 24675],
    edge_attr=[24675, 1]
  },
  [1m

## Make Train/Valid/Test Set

In [138]:
# For this, we first split the set of edges into
# training (80%), validation (10%), and testing edges (10%).
# Across the training edges, we use 70% of edges for message passing,
# and 30% of edges for supervision.
# We further want to generate fixed negative edges for evaluation with a ratio of 2:1.
# Negative edges during training will be generated on-the-fly.
# We can leverage the `RandomLinkSplit()` transform for this from PyG:

transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,
    edge_types=("congressperson", "buy-sell", "ticker"),
    rev_edge_types=("ticker", "rev_buy-sell", "congressperson"), 
)
train_data, val_data, test_data = transform(data)

In [139]:
train_data

HeteroData(
  [1mcongressperson[0m={ x=[2431, 2431] },
  [1mcommittee[0m={ x=[556, 556] },
  [1mticker[0m={ x=[4202, 4202] },
  [1mbill[0m={ x=[47767, 47767] },
  [1mnaics[0m={ x=[744, 744] },
  [1m(congressperson, buy-sell, ticker)[0m={
    edge_index=[2, 13819],
    edge_attr=[13819, 1],
    edge_label=[5922],
    edge_label_index=[2, 5922]
  },
  [1m(congressperson, assignment, committee)[0m={
    edge_index=[2, 11698],
    edge_attr=[11698, 2]
  },
  [1m(ticker, lobbies_on, bill)[0m={
    edge_index=[2, 148487],
    edge_attr=[148487, 1]
  },
  [1m(bill, assigned_to, committee)[0m={
    edge_index=[2, 75626],
    edge_attr=[75626, 1]
  },
  [1m(congressperson, sponsors, bill)[0m={
    edge_index=[2, 45513],
    edge_attr=[45513, 1]
  },
  [1m(congressperson, cosponsors, bill)[0m={
    edge_index=[2, 561565],
    edge_attr=[561565, 1]
  },
  [1m(ticker, classified, naics)[0m={ edge_index=[2, 4147] },
  [1m(ticker, rev_buy-sell, congressperson)[0m={
    edge

In [140]:
# In the first hop, we sample at most 20 neighbors.
# In the second hop, we sample at most 10 neighbors.
# In addition, during training, we want to sample negative edges on-the-fly with
# a ratio of 2:1.
# We can make use of the `loader.LinkNeighborLoader` from PyG:
from torch_geometric.loader import LinkNeighborLoader

# Define seed edges:
edge_label_index = train_data["congressperson", "buy-sell", "ticker"].edge_label_index
print(edge_label_index)
edge_label = train_data["congressperson", "buy-sell", "ticker"].edge_label

train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20, 10],
    neg_sampling_ratio=2.0,
    edge_label_index=(("congressperson", "buy-sell", "ticker"), edge_label_index),
    edge_label=edge_label,
    batch_size=128,
    shuffle=True,
)

tensor([[1809,  788,  899,  ...,  799,  899, 2333],
        [2742, 1724, 3300,  ..., 1942, 2335, 2810]])


In [None]:
# In the first hop, we sample at most 20 neighbors.
# In the second hop, we sample at most 10 neighbors.
# In addition, during training, we want to sample negative edges on-the-fly with
# a ratio of 2:1.
# We can make use of the `loader.LinkNeighborLoader` from PyG:
from torch_geometric.loader import LinkNeighborLoader

# Define seed edges:
edge_label_index = train_data["congressperson", "buy-sell", "ticker"].edge_label_index
edge_label = train_data["congressperson", "buy-sell", "ticker"].edge_label
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20, 10],
    neg_sampling_ratio=2.0,
    edge_label_index=(("congressperson", "buy-sell", "ticker"), edge_label_index),
    edge_label=edge_label,
    batch_size=128,
    shuffle=True,
)