In [1]:
# Insider transaction ZIP data processing

# Description:
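# Loads multiple SEC Form 4 filing ZIP archives (source: https://www.sec.gov/data-research/sec-markets-data/insider-transactions-data-sets),
# extracts and processes the .tsv files inside each archive, and filters insider transactions down to open-market purchases
# (transaction code "P") of common stock priced at $5 or more per share, made by individual insiders: directors, officers,
# ten-percent owners, or anyone reporting a job title. It filters out investment entities such as funds, LPs and trusts,
# both as reporting owners (by name keyword) and as issuers (issuer names containing "fund" or "trust").
# It cleans the data by removing invalid records (those with neither a qualifying relationship nor a title).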
# The processed results are compiled into a dataframe and automatically saved to two CSV files for inspection, backup and potential upload to a database
# or machine-learning pipeline (e.g., BigQuery).
# The CSV 'notebook1_insider_data.csv' is the aggregated valid insider data for further analysis.
# The CSV 'notebook1_filtered_out_entities.csv' is the filtered-out entities.

import zipfile, os
import pandas as pd
import requests
import re

# Output schema shared by final_df and filtered_entities.
# renaming_dict maps each source column to its display name, listed in output order.
renaming_dict = dict([
    ("RPTOWNERNAME", "Insider Name"),
    ("RPTOWNER_TITLE", "Insider Title"),
    ("Insider Role", "Insider Role"),
    ("ISSUERNAME", "Issuer"),
    ("ISSUERTRADINGSYMBOL", "Ticker"),
    ("ISSUERCIK", "CIK Code"),
    ("PERIOD_OF_REPORT", "Period of Report"),
    ("TRANS_DATE", "Transaction Date"),
    ("SECURITY_TITLE", "Security"),
    ("TRANS_CODE", "Transaction Code"),
    ("TRANS_SHARES", "Shares"),
    ("TRANS_PRICEPERSHARE", "Price per Share"),
    ("SHRS_OWND_FOLWNG_TRANS", "Shares After"),
    ("DIRECT_INDIRECT_OWNERSHIP", "Ownership Type"),
])

# Columns kept in the output: every renamed column in the order above, followed by
# ACCESSION_NUMBER, which has no entry in renaming_dict and so keeps its original name.
selected_columns = [*renaming_dict, "ACCESSION_NUMBER"]


# # Upload multiple zip files
# # Use widget button at bottom to choose file(s)
# uploaded = files.upload()  # Select 2+ .zip files # Disabled. Uncomment to enable.

# Folder holding the previously downloaded SEC ZIPs (saved locally in the Colab session).
local_zip_dir = "sec_zips"  # Update as needed.

# Names of the .zip entries in that folder, sorted by file name.
all_files = sorted(
    entry for entry in os.listdir(local_zip_dir) if entry.endswith(".zip")
)

print(f"Found {len(all_files)} zip files")

merged_all = []
# Entity-owned rows removed from each archive; combined into filtered_out_df after the loop
# so the later export covers every archive rather than only the last one processed.
filtered_out_all = []

# Name fragments treated as marking a reporting owner as an investment entity rather than a person.
entity_keywords = [
    "LLC", "L L C", "L.L.C.", "LP", "L P", "L.P.", "LTD", "INC", "TRUST", "CORP",
    "FOUNDATION", "COMPANY", "CO", "CO.", "PARTNERS", "ADVISORS", "ADVISORY",
    "CAPITAL", "INVESTMENT", "INVESTMENTS", "HOLDINGS", "MGMT", "MANAGEMENT", "FUND",
    "GROUP", "VENTURES", "BIOVENTURES", "INVESTORS", "EQUITY", "LIFE INSURANCE", "GP",
    "FAMILY", "PBC", "SDN BHD", "GMBH"
]

# Case-insensitive regex matching any keyword that is not preceded by a word character
# and is followed by a non-word character or the end of the name.
# Built once here because it does not depend on the archive being processed.
pattern = "(?i)" + "|".join(
    r"(?<!\w)" + re.escape(k) + r"(?=\W|$)"
    for k in entity_keywords
)

# SUBMISSION.tsv columns joined onto transactions to identify the issuer
# (ISSUERCIK is carried so the issuer can later be mapped to a SIC code).
issuer_columns = ["ACCESSION_NUMBER", "ISSUERNAME", "ISSUERTRADINGSYMBOL", "PERIOD_OF_REPORT", "ISSUERCIK"]


def get_role(row):
    """Derive a readable insider role for one REPORTINGOWNER row.

    The flag columns ISOFFICER, ISDIRECTOR, ISTENPERCENTOWNER and ISOTHER are checked
    in that order against the string "true"; the first match decides the role. A flag
    column that is absent from the row simply does not match. If no flag matches, a
    non-null RPTOWNER_RELATIONSHIP is returned stripped and title-cased.

    Args:
        row: a row of the reporting-owner table (anything supporting ``.get``).

    Returns:
        The role label as a string, or None when no role information is available.
    """
    flag_labels = (
        ("ISOFFICER", "Officer"),
        ("ISDIRECTOR", "Director"),
        ("ISTENPERCENTOWNER", "10% Owner"),
        ("ISOTHER", "Other Insider"),
    )
    for flag, label in flag_labels:
        if row.get(flag) == "true":
            return label

    relationship = row.get("RPTOWNER_RELATIONSHIP")
    if pd.notna(relationship):
        return relationship.strip().title()
    return None


# for zip_filename in uploaded.keys(): # disabled to use all_files
for zip_filename in all_files:
    print(f"Processing: {zip_filename}")
    zip_path = os.path.join(local_zip_dir, zip_filename)
    # Strip only the trailing extension to name the extraction folder.
    folder_name = os.path.splitext(zip_filename)[0]
    extract_path = f"/content/{folder_name}"

    # Extract; an unreadable archive is reported and skipped rather than aborting the run.
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    except Exception as e:
        print(f"Skipping {zip_filename} due to extraction error: {e}")
        continue

    # Load .tsv files; an archive missing any of the three tables is reported and skipped.
    try:
        nonderiv = pd.read_csv(os.path.join(extract_path, "NONDERIV_TRANS.tsv"), sep="\t", low_memory=False)  # "False" suppresses Dtype warning
        report = pd.read_csv(os.path.join(extract_path, "REPORTINGOWNER.tsv"), sep="\t")
        submission = pd.read_csv(os.path.join(extract_path, "SUBMISSION.tsv"), sep="\t")
    except Exception as e:
        print(f"Skipping {zip_filename} due to load error: {e}")
        continue

    # Add derived insider role (useful when the insider title is NaN).
    report["Insider Role"] = report.apply(get_role, axis=1)

    # Filter for common stock purchases priced at $5 or more per share
    # (the price floor reduces microstructure noise).
    # Can modify the "TRANS_CODE" test to include Sales ("S").
    # The price is coerced to numeric so a column loaded as text cannot break the comparison;
    # unparseable prices become NaN and are excluded.
    price_per_share = pd.to_numeric(nonderiv["TRANS_PRICEPERSHARE"], errors="coerce")
    is_purchase = (
        (nonderiv["SECURITY_TITLE"].str.lower() == "common stock")
        & (nonderiv["TRANS_CODE"] == "P")
        & (price_per_share >= 5)
    )
    filtered = nonderiv[is_purchase].copy()

    # Join with REPORTINGOWNER.tsv before filtering out entities or invalid roles.
    # NOTE(review): a filing listing several reporting owners yields one row per owner
    # for each transaction — confirm this duplication is intended.
    filtered = filtered.merge(
        report[["ACCESSION_NUMBER", "RPTOWNERNAME", "RPTOWNER_TITLE", "RPTOWNER_RELATIONSHIP", "Insider Role"]],
        on="ACCESSION_NUMBER", how="left"
    )

    # Save copy before entity filtering (keeps the owner names in their original case).
    before_entity_filter = filtered.copy()

    # Flag entity filers once and reuse the mask, so the kept rows and the
    # filtered-out rows are exact complements of each other.
    filtered["RPTOWNERNAME"] = filtered["RPTOWNERNAME"].str.upper()
    is_entity = filtered["RPTOWNERNAME"].str.contains(pattern, case=False, na=False, regex=True)

    # Keep the rows being filtered out (for later review), with issuer info attached
    # so they line up with the final output format.
    entity_rows = before_entity_filter[is_entity].merge(
        submission[issuer_columns],
        on="ACCESSION_NUMBER", how="left"
    )
    if not entity_rows.empty:
        filtered_out_all.append(entity_rows)

    # Remove rows where the insider name matches any entity keyword (e.g., LLC, INC, TRUST).
    # .copy() gives an independent frame so the column assignment below is not
    # a write into a slice of another DataFrame.
    filtered = filtered[~is_entity].copy()

    # Keep only valid insiders: relationship mentions director, officer or
    # ten-percent owner, or a job title is present.
    filtered["RPTOWNER_RELATIONSHIP"] = filtered["RPTOWNER_RELATIONSHIP"].str.upper()
    filtered = filtered[
        filtered["RPTOWNER_RELATIONSHIP"].str.contains("DIRECTOR|OFFICER|TENPERCENTOWNER", na=False) |
        filtered["RPTOWNER_TITLE"].notna()
    ]

    # Merge with submission to get equity issuer info.
    filtered = filtered.merge(
        submission[issuer_columns],
        on="ACCESSION_NUMBER", how="left"
    )

    # Filter out equity issuers that look like investment funds: any issuer name
    # containing "fund" or "trust" (case-insensitive substring match).
    filtered = filtered[
        ~filtered["ISSUERNAME"].str.contains("FUND", case=False, na=False) &
        ~filtered["ISSUERNAME"].str.contains("trust", case=False, na=False)
    ]

    # Select and rename output columns using global definitions.
    final = filtered[selected_columns].rename(columns=renaming_dict)

    # Append cleaned data to master list.
    merged_all.append(final)

# Combine the filtered-out entity rows from every archive. An empty frame with the
# expected columns is used when nothing was filtered out, so later cells can still
# select selected_columns from filtered_out_df.
if filtered_out_all:
    filtered_out_df = pd.concat(filtered_out_all, ignore_index=True)
else:
    filtered_out_df = pd.DataFrame(columns=selected_columns)

# Stitch the per-archive results together, persist them, and show a sample.
if not merged_all:
    # Nothing was appended: every archive was skipped or none were found.
    print("No valid purchase data found in uploaded zip files.")
else:
    final_df = pd.concat(merged_all, ignore_index=True)

    # Write the aggregated purchases to disk (row index omitted).
    final_df.to_csv("notebook1_insider_data.csv", index=False)
    print("Saved merged data to notebook1_insider_data.csv")

    # Show every column of the first ten rows.
    print("Preview of merged data:")
    pd.set_option("display.max_columns", None)
    display(final_df.head(10))

Found 77 zip files
Processing: 2006q1_form345.zip
Processing: 2006q2_form345.zip
Processing: 2006q3_form345.zip
Processing: 2006q4_form345.zip
Processing: 2007q1_form345.zip
Processing: 2007q2_form345.zip
Processing: 2007q3_form345.zip
Processing: 2007q4_form345.zip
Processing: 2008q1_form345.zip
Processing: 2008q2_form345.zip
Processing: 2008q3_form345.zip
Processing: 2008q4_form345.zip
Processing: 2009q1_form345.zip
Processing: 2009q2_form345.zip
Processing: 2009q3_form345.zip
Processing: 2009q4_form345.zip
Processing: 2010q1_form345.zip
Processing: 2010q2_form345.zip
Processing: 2010q3_form345.zip
Processing: 2010q4_form345.zip
Processing: 2011q1_form345.zip
Processing: 2011q2_form345.zip
Processing: 2011q3_form345.zip
Processing: 2011q4_form345.zip
Processing: 2012q1_form345.zip
Processing: 2012q2_form345.zip
Processing: 2012q3_form345.zip
Processing: 2012q4_form345.zip
Processing: 2013q1_form345.zip
Processing: 2013q2_form345.zip
Processing: 2013q3_form345.zip
Processing: 2013q4_f

Unnamed: 0,Insider Name,Insider Title,Insider Role,Issuer,Ticker,CIK Code,Period of Report,Transaction Date,Security,Transaction Code,Shares,Price per Share,Shares After,Ownership Type,ACCESSION_NUMBER
0,BARNHOLT EDWARD W,,Director,ADOBE SYSTEMS INC,ADBE,796343,30-MAR-2006,30-MAR-2006,Common Stock,P,5000.0,35.61,5000.0,D,0001179110-06-007524
1,STEGMANN THOMAS,Chief Clinical Officer,"Director,Officer,Tenpercentowner","CardioVascular BioTherapeutics, Inc.",CVBT,1303497,27-MAR-2006,27-MAR-2006,Common Stock,P,7750.0,7.34,30015500.0,D,0001303497-06-000011
2,GONZALEZ PLACIDO,,Director,EUROBANCSHARES INC,EUBK,1164554,31-OCT-2005,31-OCT-2005,Common Stock,P,100000.0,10.4,1757796.0,D,0000899078-06-000306
3,GONZALEZ PLACIDO,,Director,EUROBANCSHARES INC,EUBK,1164554,31-OCT-2005,02-NOV-2005,Common Stock,P,21000.0,11.64,1790996.0,D,0000899078-06-000306
4,GONZALEZ PLACIDO,,Director,EUROBANCSHARES INC,EUBK,1164554,31-OCT-2005,01-NOV-2005,Common Stock,P,12200.0,10.51,1769996.0,D,0000899078-06-000306
5,GONZALEZ PLACIDO,,Director,EUROBANCSHARES INC,EUBK,1164554,28-APR-2005,28-APR-2005,Common Stock,P,10300.0,15.45,1657796.0,D,0000899078-06-000305
6,MONTANO DANIEL C,"Chairman, President, CEO","Director,Officer,Tenpercentowner","CardioVascular BioTherapeutics, Inc.",CVBT,1303497,30-MAR-2006,30-MAR-2006,Common Stock,P,2250.0,7.7,7750.0,D,0001303497-06-000010
7,STEGMANN THOMAS,Chief Clinical Officer,"Director,Officer,Tenpercentowner","CardioVascular BioTherapeutics, Inc.",CVBT,1303497,30-MAR-2006,30-MAR-2006,Common Stock,P,2250.0,7.7,30007750.0,D,0001303497-06-000009
8,JACOBS JOHN W,Chief Scientific Officer,"Director,Officer","CardioVascular BioTherapeutics, Inc.",CVBT,1303497,30-MAR-2006,30-MAR-2006,Common Stock,P,5000.0,7.85,405000.0,D,0001303497-06-000008
9,MULCAHY JOHN R,,Director,PEAPACK GLADSTONE FINANCIAL CORP,PGC,1050743,29-MAR-2006,29-MAR-2006,Common Stock,P,100.0,25.2,22587.57,D,0001261683-06-000020


In [None]:
# Save to Google Drive (shared folder)
#final_df.to_csv("/content/drive/MyDrive/593 - Insider Trading Milestone I Project/notebook1_insider_data.csv", index=False)

# # Download merged csv file to local machine (Optional, uncomment to use)
# files.download("notebook1_insider_data.csv") # Downloads from the Colab session to local Downloads folder. Change download folder as needed.


In [2]:
# Export the filtered-out entity rows with the same columns and display names
# as the main output, for record keeping and future use.
# filtered_out_df is whatever the processing cell above left in the session.
filtered_entities = (
    filtered_out_df
    .loc[:, selected_columns]
    .rename(columns=renaming_dict)
)

# Save to local drive (row index omitted).
filtered_entities.to_csv("notebook1_filtered_out_entities.csv", index=False)
