In [None]:
# Description:
# Loads multiple SEC Form 4 filing ZIP archives (source:https://www.sec.gov/data-research/sec-markets-data/insider-transactions-data-sets),
# extracts and processes .tsv files inside each archive, and filters insider transactions to focus on open-market purchases
# by individual insiders (excluding investment entities such as funds, LPs and trusts).  It identifies transactions involving corporate
# officers, and cleans the data by removing invalid records (those with missing roles).  The processed results are compiled into a
# dataframe and automatically saved to .csv for backup and potential upload to a database or machine-learning pipeline (e.g., BigQuery).

import re
import zipfile, os

import pandas as pd
from google.colab import files

# Upload multiple zip files
# Use widget button at bottom to choose file(s)
uploaded = files.upload()  # Select 2+ .zip files

# One cleaned DataFrame per archive that was processed successfully
merged_all = []

for zip_filename in uploaded.keys():
    print(f"Processing: {zip_filename}")
    # Drop only the trailing extension (whatever its case) so the extraction
    # folder never ends up with the same name as the uploaded archive itself.
    folder_name = os.path.splitext(zip_filename)[0]
    extract_path = f"/content/{folder_name}"

    # Extract the archive and load the three .tsv tables used below.
    # Extraction sits inside the try so that a corrupt or non-zip upload skips
    # just that file instead of aborting the whole batch.
    try:
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            zip_ref.extractall(extract_path)

        # low_memory=False makes pandas infer each column's dtype from the whole
        # file at once, which avoids the mixed-type DtypeWarning on large tables.
        nonderiv = pd.read_csv(os.path.join(extract_path, "NONDERIV_TRANS.tsv"), sep="\t", low_memory=False)
        report = pd.read_csv(os.path.join(extract_path, "REPORTINGOWNER.tsv"), sep="\t", low_memory=False)
        submission = pd.read_csv(os.path.join(extract_path, "SUBMISSION.tsv"), sep="\t", low_memory=False)
    except Exception as e:
        # Deliberately best-effort: report the problem and move on to the next archive
        print(f"Skipping {zip_filename} due to load error: {e}")
        continue

    # Add derived insider role (in case 'Insider Title' is NaN)
    # Default is 'RPTOWNER_RELATIONSHIP'
    def get_role(row):
        """Derive a display role for one reporting-owner record.

        The relationship flag columns are checked in priority order (officer,
        director, ten-percent owner, other) and the first flag that is set wins.
        When no flag is set, the RPTOWNER_RELATIONSHIP value is returned,
        stripped and title-cased.

        Args:
            row: Mapping-like record (pandas Series or dict) that supports
                ``.get``.

        Returns:
            The role label, or None when no flag is set and
            RPTOWNER_RELATIONSHIP is missing or blank.
        """
        def flag_is_set(value):
            # pandas.read_csv can parse a true/false column into real booleans,
            # so accept those as well as the text forms "true" and "1".
            if isinstance(value, str):
                return value.strip().lower() in ("true", "1")
            if value is None or pd.isna(value):
                return False
            return bool(value)

        # Order matters: the first matching flag decides the role.
        flag_labels = (
            ("ISOFFICER", "Officer"),
            ("ISDIRECTOR", "Director"),
            ("ISTENPERCENTOWNER", "10% Owner"),
            ("ISOTHER", "Other Insider"),
        )
        for column, label in flag_labels:
            if flag_is_set(row.get(column)):
                return label

        relationship = row.get("RPTOWNER_RELATIONSHIP")
        if pd.notna(relationship):
            # str() guards against a non-text value that has no .strip()
            cleaned = str(relationship).strip().title()
            # A blank relationship carries no role information
            return cleaned or None
        return None

    # Attach the derived role to every reporting-owner row
    report["Insider Role"] = report.apply(get_role, axis=1)

    # Keep common-stock rows with transaction code "P" only
    # (widen the TRANS_CODE test to also accept "S" if sales are wanted)
    is_common_stock = nonderiv["SECURITY_TITLE"].str.lower().eq("common stock")
    is_purchase = nonderiv["TRANS_CODE"].eq("P")
    filtered = nonderiv[is_common_stock & is_purchase]

    # Bring in reporting-owner details before any entity / role filtering
    # (left join keyed on the filing's accession number)
    owner_columns = [
        "ACCESSION_NUMBER",
        "RPTOWNERNAME",
        "RPTOWNER_TITLE",
        "RPTOWNER_RELATIONSHIP",
        "Insider Role",
    ]
    filtered = pd.merge(filtered, report[owner_columns], on="ACCESSION_NUMBER", how="left")

    # Filter out entity filers (Investment entities that are not officers or directors)
    filtered["RPTOWNERNAME"] = filtered["RPTOWNERNAME"].str.upper()
    entity_keywords = [
        "LLC", "LP", "L.P.", "L P", "LTD", "INC", "INCORPORATED", "TRUST",
        "CORP", "CORPORATION", "FOUNDATION", "COMPANY", "CO.",
    ]
    # Match each keyword literally (re.escape keeps "." from acting as a wildcard)
    # and only as a standalone token (no letter or digit on either side), so that
    # personal names such as SCOTT, COHEN, RALPH or VINCENT are not mistaken for
    # "CO.", "LP" or "INC". The group is non-capturing to avoid pandas'
    # match-group warning.
    entity_pattern = (
        r"(?<![A-Z0-9])(?:"
        + "|".join(re.escape(keyword) for keyword in entity_keywords)
        + r")(?![A-Z0-9])"
    )
    filtered = filtered[~filtered["RPTOWNERNAME"].str.contains(entity_pattern, regex=True, na=False)]


    # Retain a row when its relationship names a director, officer or
    # ten-percent owner, or when it carries any job title at all
    filtered["RPTOWNER_RELATIONSHIP"] = filtered["RPTOWNER_RELATIONSHIP"].str.upper()
    has_insider_relationship = filtered["RPTOWNER_RELATIONSHIP"].str.contains(
        "DIRECTOR|OFFICER|TENPERCENTOWNER", na=False
    )
    has_job_title = filtered["RPTOWNER_TITLE"].notna()
    filtered = filtered[has_insider_relationship | has_job_title]


    # Attach issuer details from the submission table
    # (left join keyed on the accession number)
    issuer_columns = ["ACCESSION_NUMBER", "ISSUERNAME", "ISSUERTRADINGSYMBOL", "PERIOD_OF_REPORT"]
    filtered = pd.merge(filtered, submission[issuer_columns], on="ACCESSION_NUMBER", how="left")

    # Drop issuers whose name contains "fund" or "trust" (case-insensitive);
    # rows with a missing issuer name are kept
    issuer_is_fund = filtered["ISSUERNAME"].str.contains("FUND", case=False, na=False)
    issuer_is_trust = filtered["ISSUERNAME"].str.contains("trust", case=False, na=False)
    filtered = filtered[~(issuer_is_fund | issuer_is_trust)]



    # Output layout: source column -> readable header, listed in final column order.
    # The same mapping drives both the column selection and the rename.
    output_columns = {
        "RPTOWNERNAME": "Insider Name",
        "RPTOWNER_TITLE": "Insider Title",
        "Insider Role": "Insider Role",
        "ISSUERNAME": "Issuer",
        "ISSUERTRADINGSYMBOL": "Ticker",
        "PERIOD_OF_REPORT": "Period of Report",
        "TRANS_DATE": "Transaction Date",
        "SECURITY_TITLE": "Security",
        "TRANS_CODE": "Transaction Code",
        "TRANS_SHARES": "Shares",
        "TRANS_PRICEPERSHARE": "Price per Share",
        "SHRS_OWND_FOLWNG_TRANS": "Shares After",
        "DIRECT_INDIRECT_OWNERSHIP": "Ownership Type",
        "ACCESSION_NUMBER": "ACCESSION_NUMBER",
    }
    final = filtered[list(output_columns)].rename(columns=output_columns)

    # Collect this archive's cleaned rows for the final concat
    merged_all.append(final)

# Stack the per-archive results into one DataFrame, persist it, and show a sample
if not merged_all:
    print("No valid purchase data found in uploaded zip files.")
else:
    final_df = pd.concat(merged_all, ignore_index=True)
    final_df.to_csv("all_common_stock_purchases.csv", index=False)
    print("Saved merged data to all_common_stock_purchases.csv")

    # Show the first rows with every column visible
    print("Preview of merged data:")
    pd.set_option('display.max_columns', None)
    display(final_df.head(10))

Saving 2024q1_form345.zip to 2024q1_form345.zip
Saving 2024q2_form345.zip to 2024q2_form345.zip
Saving 2024q3_form345.zip to 2024q3_form345 (1).zip
Saving 2024q4_form345.zip to 2024q4_form345 (1).zip
Saving 2025q1_form345.zip to 2025q1_form345 (1).zip
Processing: 2024q1_form345.zip


  nonderiv = pd.read_csv(os.path.join(extract_path, "NONDERIV_TRANS.tsv"), sep="\t")


Processing: 2024q2_form345.zip


  nonderiv = pd.read_csv(os.path.join(extract_path, "NONDERIV_TRANS.tsv"), sep="\t")


Processing: 2024q3_form345 (1).zip


  nonderiv = pd.read_csv(os.path.join(extract_path, "NONDERIV_TRANS.tsv"), sep="\t")


Processing: 2024q4_form345 (1).zip


  nonderiv = pd.read_csv(os.path.join(extract_path, "NONDERIV_TRANS.tsv"), sep="\t")


Processing: 2025q1_form345 (1).zip


  nonderiv = pd.read_csv(os.path.join(extract_path, "NONDERIV_TRANS.tsv"), sep="\t")


Saved merged data to all_common_stock_purchases.csv
Preview of merged data:


Unnamed: 0,Insider Name,Insider Title,Insider Role,Issuer,Ticker,Period of Report,Transaction Date,Security,Transaction Code,Shares,Price per Share,Shares After,Ownership Type,ACCESSION_NUMBER
0,CHANG RAYMOND NOBU,CEO and Chairman,"Director,Officer,Tenpercentowner",Agrify Corp,AGFY,29-MAR-2024,28-FEB-2024,Common Stock,P,1578947.0,0.38,1578947.0,I,0001213900-24-028125
1,CHANG RAYMOND NOBU,CEO and Chairman,"Director,Officer,Tenpercentowner",Agrify Corp,AGFY,29-MAR-2024,28-FEB-2024,Common Stock,P,263157.0,0.38,1717051.0,I,0001213900-24-028125
2,SHAH RAJEEV M.,,"Director,Tenpercentowner","Nkarta, Inc.",NKTX,27-MAR-2024,27-MAR-2024,Common Stock,P,3000000.0,10.0,10050818.0,I,0001415889-24-009447
3,KOLCHINSKY PETER,,"Director,Tenpercentowner","Nkarta, Inc.",NKTX,27-MAR-2024,27-MAR-2024,Common Stock,P,3000000.0,10.0,10050818.0,I,0001415889-24-009447
4,SHENSKY EDWARD,,Director,Sow Good Inc.,SOWG,28-MAR-2024,28-MAR-2024,Common Stock,P,13794.0,7.25,41348.0,D,0001437749-24-010117
5,BERMAN BRAD,,Director,Sow Good Inc.,SOWG,28-MAR-2024,28-MAR-2024,Common Stock,P,30000.0,7.25,252935.0,D,0001437749-24-010116
6,GOLDFARB IRA,Executive Chairman,"Director,Officer,Tenpercentowner",Sow Good Inc.,SOWG,28-MAR-2024,28-MAR-2024,Common Stock,P,17242.0,7.25,309357.0,D,0001437749-24-010115
7,GOLDFARB CLAUDIA,CEO,"Director,Officer,Tenpercentowner",Sow Good Inc.,SOWG,28-MAR-2024,28-MAR-2024,Common Stock,P,17242.0,7.25,285990.0,D,0001437749-24-010114
8,GOULD INVESTORS L P,,Tenpercentowner,BRT Apartments Corp.,BRT,27-MAR-2024,25-MAR-2024,Common Stock,P,2300.0,16.0,3550172.88,D,0001214659-24-005629
9,GOULD INVESTORS L P,,Tenpercentowner,BRT Apartments Corp.,BRT,27-MAR-2024,25-MAR-2024,Common Stock,P,1000.0,16.09,3551172.88,D,0001214659-24-005629


In [None]:
# Send the merged CSV written by the previous cell to the browser as a download
merged_csv_name = "all_common_stock_purchases.csv"
files.download(merged_csv_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>