### Prepare neo4j writer

In [38]:
from neo4j import GraphDatabase

class Neo4jService(object):

    def __init__(self, uri, user, password):
        self._driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        self._driver.close()
        
    def add_nodes_legis(self, tx, nodes):
        query = (
            "UNWIND $nodes AS node "
            "CREATE (n:Legislator {name: node.name, bioguide: node.bioguide})" 
        )
        tx.run(query, nodes=nodes)
    
    def add_nodes_ticker(self, tx ,nodes):
        query = (
            "UNWIND $nodes AS node "
            "CREATE (n:Ticker {ticker: node.ticker, name: node.name})" 
        )
        tx.run(query, nodes=nodes)
    
    def add_nodes_committee(self, tx, nodes):
        query = (
            "UNWIND $nodes AS node "
            "CREATE (n:Committee {ticker: node.committee, name: node.name})" 
        )
        tx.run(query, nodes=nodes)

    def add_nodes_naics(self, tx, nodes):
        query = (
            "UNWIND $nodes AS node "
            "CREATE (n:NAICS {ticker: node.naics, desc: node.desc})" 
        )
        tx.run(query, nodes=nodes)    

    def add_nodes_bills(self, tx, nodes):
        query = (
            "UNWIND $nodes AS node "
            "CREATE (n:Bill {id: node.id, short_title: node.short_title, official_title: node.official_title, summary: node.summary_text})" 
        )
        tx.run(query, nodes=nodes) 

    def add_buy_sell_relationships_with_dates(self, tx, relationships):
        query = (
            "UNWIND $relationships AS rel "
            "MATCH (l:Legislator {bioguide: rel.legis_bioguide}), (t:Ticker {ticker: rel.ticker}) "
            "CREATE (l)-[:BUY_SELL {start_date: rel.start_date, end_date: rel.end_date}]->(t)"
        )
        tx.run(query, relationships=relationships)

# Example Usage

uri = "bolt://localhost:7687"
user = "neo4j"
password = "1dbstntk"

neo4j = Neo4jService(uri, user, password)

In [39]:
driver = GraphDatabase.driver(uri, auth=(user, password))

### Read Heterograph data we used for the paper

In [33]:
import pickle
from tqdm import tqdm

import torch
from torch_geometric.loader import LinkNeighborLoader

from torch.nn import functional as F

import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

import torch_geometric.transforms as T

import csv

# Set the random seed for PyTorch, NumPy, and random
seed = 2328466898069313329
torch.manual_seed(seed)

# Print the random seed
print(f"Random seed: {torch.initial_seed()}")

with open('../data/hetero_graph_data.pkl', "rb") as f:
    loaded_data = pickle.load(f)

# Extract the data from the loaded dictionary
data = loaded_data["hetero_graph"]

Random seed: 2328466898069313329


In [38]:
loaded_data.keys()

dict_keys(['hetero_graph', 'unique_tickers', 'unique_congresspeople', 'unique_committees', 'unique_bills', 'unique_naics'])

In [35]:
# Instantiate connection to lv
import warnings
warnings.filterwarnings("ignore")

from octopus.db import PostgresqlManager
from dotenv import load_dotenv

# get transactions data (bioguide-ticker)
load_dotenv("/Users/syyun/Dropbox (MIT)/efd/.envlv", override=True)
pm = PostgresqlManager(dotenv_path="/Users/syyun/Dropbox (MIT)/efd/.envlv")

### Add Congresspeople Node

- Type: Congresspeople
- Name : Congressmen/women's name

In [5]:
politician_name_bioguides = pm.execute_sql(fetchall=True, sql=
                f"""
select first_name, last_name, bioguide_id  from relational___congress.legislators l 
                """)

In [6]:
# Create a dictionary from politician_name_bioguides for faster lookup
bioguide_dict = {bioguide: first_name + ' ' + last_name 
                 for first_name, last_name, bioguide in politician_name_bioguides}


In [21]:
congresspeople_nodes_data = [{"type": "Congresspeople", "bioguide": bioguide, "name": bioguide_dict[bioguide]} for bioguide in list(loaded_data['unique_congresspeople'].keys())]

In [None]:
with neo4j._driver.session() as session:
    session.write_transaction(neo4j.add_nodes_legis, congresspeople_nodes_data)

neo4j.close()

### Add Ticker Nodes
- Type: Ticker
- Name: Company name

In [8]:
ticker_names = pm.execute_sql(fetchall=True, sql="""
            WITH st AS (
    SELECT 
        DISTINCT sb.bioguide_id, 
        saa.ticker, 
        saa.trans_date,
        saa.asset_name,
        ROW_NUMBER() OVER(PARTITION BY saa.ticker ORDER BY LENGTH(saa.asset_name) DESC) AS rn
    FROM 
        "_sandbox_suyeol".senate_annual_4a saa
        INNER JOIN "_sandbox_suyeol".senate_annual sa ON sa.report_type_url = saa.report_url
        INNER JOIN "_sandbox_suyeol".senator_bioguide sb ON sb.first_name = sa.first_name AND sb.last_name = sa.last_name 
    WHERE 
        saa.ticker IS NOT NULL AND saa.trans_date IS NOT NULL
),
ht AS (
    SELECT 
        DISTINCT hfbb.bioguide_id, 
        hft.ticker, 
        hft.transaction_date AS trans_date,
        hft.asset_name,
        ROW_NUMBER() OVER(PARTITION BY hft.ticker ORDER BY LENGTH(hft.asset_name) DESC) AS rn
    FROM 
        "_sandbox_suyeol".house_fd_transactions hft 
        INNER JOIN "_sandbox_suyeol".house_docs_id hdi ON hdi."DocID" = hft.docid 
        INNER JOIN "_sandbox_suyeol".house_fd_bio_brdige hfbb ON hfbb.first_name = hdi."First" AND hfbb.last_name = hdi."Last" 
    WHERE 
        hft.ticker IS NOT NULL AND hft.transaction_date IS NOT NULL
),
union_sh AS (
    SELECT bioguide_id, ticker, trans_date, asset_name FROM st WHERE rn = 1
    UNION
    SELECT bioguide_id, ticker, trans_date, asset_name FROM ht WHERE rn = 1
)
SELECT distinct ticker, asset_name  FROM union_sh;
"""
)

In [11]:
# Create a dictionary from politician_name_bioguides for faster lookup
ticker_dict = {ticker: company_name 
                 for ticker, company_name in ticker_names}


In [31]:
ticker_nodes_data = [
    {"ticker": ticker, "name": ticker_dict.get(ticker)}
    for ticker in list(loaded_data['unique_tickers'].keys())
]

In [35]:
with neo4j._driver.session() as session:
    session.write_transaction(neo4j.add_nodes_ticker, ticker_nodes_data)

neo4j.close()

### Add Committee Nodes
- Type: Committee
- Name: Committee name

In [40]:
committee_names = pm.execute_sql(fetchall=True, sql="""
select distinct thomas_id, representative_name from relational___congress.committees
order by thomas_id desc
                                 """)

In [42]:
# Create a dictionary from politician_name_bioguides for faster lookup
committee_dict = {committee: committee_name 
                 for committee, committee_name in committee_names}


In [45]:
committee_nodes_data = [
    {"committee": committee, "name": committee_dict.get(committee)}
    for committee in list(loaded_data['unique_committees'].keys())
]

In [48]:
with neo4j._driver.session() as session:
    session.write_transaction(neo4j.add_nodes_committee, committee_nodes_data)

neo4j.close()

### Add NAICS code Nodes
- type: NAICS
- name: NAICS code name

In [49]:
loaded_data['unique_naics'].keys()

dict_keys(['10104', '11111', '11112', '111199', '111211', '111219', '111335', '111422', '112320', '112340', '113110', '113210', '114112', '115112', '115116', '115310', '211120', '211130', '21211', '212111', '212114', '212115', '21212', '21213', '212210', '212220', '212230', '212290', '212291', '212312', '212313', '212323', '212390', '212391', '213111', '213112', '221111', '221112', '221114', '221115', '221117', '221118', '221121', '221122', '221210', '221310', '221320', '22221', '23236', '23237', '23238', '236115', '236117', '236118', '236220', '237110', '237120', '237130', '237210', '237310', '237990', '238120', '238130', '238210', '238220', '238290', '28282', '28283', '311211', '311221', '311224', '311225', '311230', '311351', '311412', '311421', '311422', '311511', '311513', '311514', '311520', '311611', '311612', '311615', '311812', '311813', '311821', '311824', '311919', '311920', '311930', '311941', '311942', '311999', '312111', '312120', '312130', '312140', '312230', '31311', '3

In [50]:
naics_desc = pm.execute_sql(fetchall=True, sql="""
with n1 as (
select ticker, naics1 as naics, naics1_desc as naics_desc  from "_sandbox_suyeol".ticker_naics tn 
    inner join "_sandbox_suyeol".ticker_naics_url tnu on tnu.naics_url =tn.naics_url 
where ticker is not null and naics1 is not null and naics1 != ''
)
, n2 as (
select ticker, naics2 as naics, naics2_desc as naics_desc  from "_sandbox_suyeol".ticker_naics tn 
    inner join "_sandbox_suyeol".ticker_naics_url tnu on tnu.naics_url =tn.naics_url 
where ticker is not null and naics2 is not null and naics2 != ''
)
, z as (
select ticker, naics, null as naics_desc from "_sandbox_suyeol".ticker_naics_zoom tnz 
where ticker is not null and naics is not null and naics != ''
)
, combined AS (
    SELECT * FROM n1
    UNION
    SELECT * FROM n2
    UNION
    SELECT * FROM z
)
, ranked AS (
    SELECT 
        naics, 
        naics_desc,
        ROW_NUMBER() OVER(PARTITION BY naics ORDER BY CASE WHEN naics_desc IS NULL THEN 1 ELSE 0 END, naics_desc) as rnk
    FROM combined
)
SELECT 
    naics, 
    naics_desc
FROM ranked
WHERE rnk = 1
ORDER BY naics DESC;
"""
)

In [54]:
# Create a dictionary from politician_name_bioguides for faster lookup
naics_dict = {naics: desc 
                 for naics, desc in naics_desc}
# naics_dict

In [60]:
naics_nodes_data = [
    {"naics": naics, "desc": naics_dict.get(naics)}
    for naics in list(loaded_data['unique_naics'].keys())
]

In [61]:
with neo4j._driver.session() as session:
    session.write_transaction(neo4j.add_nodes_naics, naics_nodes_data)

neo4j.close()

### Add Bill Nodes

In [62]:
loaded_data['unique_bills'].keys()

dict_keys(['hconres1-115', 'hconres1-116', 'hconres1-117', 'hconres10-115', 'hconres10-116', 'hconres10-117', 'hconres100-115', 'hconres100-116', 'hconres101-115', 'hconres101-116', 'hconres102-115', 'hconres102-116', 'hconres103-115', 'hconres103-116', 'hconres104-115', 'hconres104-116', 'hconres105-114', 'hconres105-115', 'hconres105-116', 'hconres106-114', 'hconres106-115', 'hconres106-116', 'hconres107-114', 'hconres107-115', 'hconres107-116', 'hconres108-114', 'hconres108-115', 'hconres108-116', 'hconres109-114', 'hconres109-115', 'hconres109-116', 'hconres11-115', 'hconres11-116', 'hconres11-117', 'hconres110-114', 'hconres110-115', 'hconres110-116', 'hconres111-114', 'hconres111-115', 'hconres111-116', 'hconres112-114', 'hconres112-115', 'hconres112-116', 'hconres113-114', 'hconres113-115', 'hconres113-116', 'hconres114-114', 'hconres114-115', 'hconres114-116', 'hconres115-114', 'hconres115-115', 'hconres115-116', 'hconres116-114', 'hconres116-115', 'hconres116-116', 'hconres117

In [63]:
bill_info = pm.execute_sql(fetchall=True, sql="""
select distinct bill_id, official_title, short_title, summary_text  from relational___congress.bills b ;
"""
)

In [66]:
bill_dict = {bill_id: {"official_title": official_title, "short_title": short_title, "summary_text": summary_text}
             for bill_id, official_title, short_title, summary_text in bill_info}

In [70]:
bill_nodes_data = [
    {"id": bill_id, 
     "official_title": bill_dict.get(bill_id, {}).get('official_title', None), 
     "short_title": bill_dict.get(bill_id, {}).get('short_title', None),
     "summary_text": bill_dict.get(bill_id, {}).get('summary_text', None)}
    for bill_id in list(loaded_data['unique_bills'].keys())
]

In [76]:
with neo4j._driver.session() as session:
    session.write_transaction(neo4j.add_nodes_bills, bill_nodes_data)

neo4j.close()

### Add Congressperson-buy/sell-Ticker Edges

In [18]:
rel = loaded_data['hetero_graph']['congressperson', 'buy-sell', 'ticker']['edge_index']

In [19]:
dates_tensor = loaded_data['hetero_graph']['congressperson', 'buy-sell', 'ticker']['edge_attr']

In [20]:
relationships_int = [(rel[0, i].item(), rel[1, i].item()) for i in range(rel.shape[1])]

In [21]:
# Combine relationship pairs and dates
relationship_data = [
    {"leg_id": leg_id, "tick_id": tick_id, "dates": dates_tensor[i]}
    for i, (leg_id, tick_id) in enumerate(relationships_int)
]

In [23]:
# Helper Function to get key by value
def get_key_by_value(dictionary, value):
    return next((key for key, val in dictionary.items() if val == value), None)

In [25]:
import pandas as pd
ref_date = pd.Timestamp(2016, 1, 1)

In [26]:
def get_dates_from_tensor(tensor_data, ref_date):
    import pandas as pd
    """
    Convert tensor data to actual dates based on a reference date.
    
    Parameters:
    - tensor_data (torch.Tensor): A tensor with two elements representing the elapsed days.
    - ref_date (pd.Timestamp): Reference date.
    
    Returns:
    - tuple of str: Start date and end date in "YYYY-MM-DD" format.
    """
    if not isinstance(tensor_data, torch.Tensor) or len(tensor_data) != 2:
        raise ValueError("Input tensor_data must be a torch.Tensor with exactly two elements.")
    
    start_date = ref_date + pd.Timedelta(days=float(tensor_data[0]))
    end_date = ref_date + pd.Timedelta(days=float(tensor_data[1]))
    
    return (start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d'))


In [27]:
# Convert to identifiers and date strings
relationships = [
    {
        "legis_bioguide": get_key_by_value(loaded_data['unique_congresspeople'], rel["leg_id"]),
        "ticker": get_key_by_value(loaded_data['unique_tickers'], rel["tick_id"]),
        "start_date": get_dates_from_tensor(rel["dates"], ref_date)[0],
        "end_date": get_dates_from_tensor(rel["dates"], ref_date)[1]
    }
    for rel in relationship_data
]

In [31]:
# Example Usage
with neo4j._driver.session() as session:
    session.write_transaction(neo4j.add_buy_sell_relationships_with_dates, relationships)

  session.write_transaction(neo4j.add_buy_sell_relationships_with_dates, relationships)


In [12]:
len(relationships)

24675

### Add Committee-Assignemnt Edges

In [36]:
# get committee assignments
assign = pm.execute_sql(fetchall=True, sql=
                f"""
                select distinct committee_thomas_id, legislator_bioguide_id, congress_num  from relational___congress.committees__legislators cl
	                inner join relational___congress.legislators l on l.bioguide_id = cl.legislator_bioguide_id                """
                )

Failed to write data to connection IPv4Address(('localhost', 7687)) (ResolvedIPv4Address(('127.0.0.1', 7687)))


In [40]:
# assign

In [56]:
def get_all_legislator_bioguide_ids(tx):
    query = "MATCH (n:Legislator) RETURN collect(n.bioguide) AS bioguide_ids"
    result = tx.run(query)
    
    # Since we're collecting all ids into a list, there should be only one record
    record = result.single()  

    if record:
        return record['bioguide_ids']
    else:
        return []

try:
    with driver.session() as session:
        bioguide_ids = session.read_transaction(get_all_legislator_bioguide_ids)
        print(bioguide_ids)
except Exception as e:
    print(f"An error occurred: {str(e)}")
finally:
    driver.close()

print(len(bioguide_ids))

['A000009', 'A000014', 'A000017', 'A000018', 'A000022', 'A000031', 'A000052', 'A000055', 'A000062', 'A000069', 'A000073', 'A000076', 'A000103', 'A000109', 'A000118', 'A000121', 'A000127', 'A000139', 'A000148', 'A000170', 'A000177', 'A000189', 'A000195', 'A000202', 'A000207', 'A000208', 'A000209', 'A000210', 'A000211', 'A000212', 'A000213', 'A000214', 'A000215', 'A000216', 'A000217', 'A000219', 'A000220', 'A000221', 'A000222', 'A000224', 'A000226', 'A000329', 'A000337', 'A000355', 'A000356', 'A000357', 'A000358', 'A000359', 'A000360', 'A000361', 'A000362', 'A000363', 'A000364', 'A000365', 'A000366', 'A000367', 'A000368', 'A000369', 'A000370', 'A000371', 'A000372', 'A000373', 'A000374', 'A000375', 'A000376', 'A000377', 'A000378', 'B000008', 'B000013', 'B000024', 'B000025', 'B000028', 'B000029', 'B000037', 'B000047', 'B000063', 'B000069', 'B000072', 'B000078', 'B000081', 'B000083', 'B000104', 'B000134', 'B000151', 'B000153', 'B000160', 'B000169', 'B000177', 'B000178', 'B000179', 'B000200'

In [59]:
# Define function to get start and end year of congress
def get_congress_years(congress_num):
    start_year = (congress_num - 115) * 2 + 2017
    end_year = start_year + 1
    return start_year, end_year

In [60]:
len(assign)

31227

In [None]:
# Define function to get start and end year of congress
def get_congress_years(congress_num):
    start_year = (congress_num - 115) * 2 + 2017
    end_year = start_year + 1
    return start_year, end_year

def create_committee_assignments(tx, assignments):
    query = (
        "MATCH (c:Committee {ticker: $committee_ticker}) "
        "MATCH (l:Legislator {bioguide: $bioguide_id}) "
        "MERGE (l)-[:COMMITTEE_ASSIGNMENT {congress_num: $congress_num, start_year: $start_year, end_year: $end_year}]->(c)"
    )
    
    for assignment in assignments:
        committee_ticker, bioguide_id, congress_num = assignment
        start_year, end_year = get_congress_years(congress_num)
        tx.run(query, committee_ticker=committee_ticker, bioguide_id=bioguide_id, congress_num=congress_num, start_year=start_year, end_year=end_year)


In [None]:
# Use a try/except block to handle potential errors and ensure resources are released
try:
    with driver.session() as session:
        session.write_transaction(create_committee_assignments, assign)
except Exception as e:
    print(f"An error occurred: {str(e)}")
finally:
    driver.close()

In [None]:
unique_congresspeople = {name: idx for idx, name in enumerate(congressperson_nodes)}

In [86]:


# Create a list of tuples representing the edges with their corresponding start and end date attributes
edges_with_dates = [
    (unique_congresspeople[row['bioguide_id']], unique_committees[row['committee_id']], row['congress_year_start'], row['congress_year_end'])
    for _, row in assign_df.iterrows()
]

# Calculate the elapsed days from the reference date for start and end dates
elapsed_start_days = np.array([(edge[2] - ref_date).days for edge in edges_with_dates], dtype=np.float32).reshape(-1, 1)
elapsed_end_days = np.array([(edge[3] - ref_date).days for edge in edges_with_dates], dtype=np.float32).reshape(-1, 1)

# Concatenate elapsed_start_days and elapsed_end_days to form a 2D array
elapsed_days = np.concatenate((elapsed_start_days, elapsed_end_days), axis=1)

# Assign the edge_index for the corresponding edge type in the data object
data['congressperson', 'assignment', 'committee'].edge_index = torch.tensor(
    [(edge[0], edge[1]) for edge in edges_with_dates],
    dtype=torch.long
).t().contiguous()

# Assign the edge_attr (elapsed_days attribute) for the corresponding edge type in the data object
data['congressperson', 'assignment', 'committee'].edge_attr = torch.tensor(elapsed_days, dtype=torch.float32)

print(data)

In [87]:
# loaded_data['unique_congresspeople']

torch.Size([2, 24675])

In [22]:
import pandas as pd
ref_date = pd.Timestamp(2016, 1, 1)

In [13]:
hg = loaded_data['hetero_graph']

In [27]:
hg['congressperson', 'buy-sell', 'ticker']['edge_attr']

tensor([[ 271.,  271.],
        [ 797.,  797.],
        [ 678.,  678.],
        ...,
        [1168., 1168.],
        [1466., 1466.],
        [1250., 1250.]])

In [28]:
hg['congressperson', 'buy-sell', 'ticker']['edge_attr'][0]

tensor([271., 271.])

In [29]:
import pandas as pd
import torch

# Given data
tensor_data = torch.tensor([271., 271.])
ref_date = pd.Timestamp(2016, 1, 1)

# Convert the tensor data to dates
start_date = ref_date + pd.Timedelta(days=float(tensor_data[0]))
end_date = ref_date + pd.Timedelta(days=float(tensor_data[1]))

# Display the results
print(f"Start Date: {start_date.strftime('%Y-%m-%d')}")
print(f"End Date: {end_date.strftime('%Y-%m-%d')}")


Start Date: 2016-09-28
End Date: 2016-09-28


In [30]:
import pandas as pd
import torch

def get_dates_from_tensor(tensor_data, ref_date):
    """
    Convert tensor data to actual dates based on a reference date.
    
    Parameters:
    - tensor_data (torch.Tensor): A tensor with two elements representing the elapsed days.
    - ref_date (pd.Timestamp): Reference date.
    
    Returns:
    - tuple of str: Start date and end date in "YYYY-MM-DD" format.
    """
    if not isinstance(tensor_data, torch.Tensor) or len(tensor_data) != 2:
        raise ValueError("Input tensor_data must be a torch.Tensor with exactly two elements.")
    
    start_date = ref_date + pd.Timedelta(days=float(tensor_data[0]))
    end_date = ref_date + pd.Timedelta(days=float(tensor_data[1]))
    
    return (start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d'))

# Example usage:

# Given data
tensor_data = torch.tensor([271., 280.])
ref_date = pd.Timestamp(2016, 1, 1)

# Get and print dates
start_date, end_date = get_dates_from_tensor(tensor_data, ref_date)
print(f"Start Date: {start_date}")
print(f"End Date: {end_date}")

Start Date: 2016-09-28
End Date: 2016-10-07
