In [1]:
%pip install PyGithub

Collecting PyGithub
  Downloading pygithub-2.8.1-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from PyGithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Downloading pygithub-2.8.1-py3-none-any.whl (432 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m432.7/432.7 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynacl, PyGithub
Successfully installed PyGithub-2.8.1 pynacl-1.5.0


In [2]:
import os
import time
from datetime import datetime
from getpass import getpass

import pandas as pd
from github import Github, Auth, BadCredentialsException

# SECURITY: never embed a personal access token in the notebook source.
# The token is read from the GITHUB_PAT environment variable, falling back to
# an interactive prompt so the secret is never written to disk with the notebook.
# Any token that was previously saved in this cell must be treated as leaked
# and revoked in GitHub's settings.
GITHUB_PAT = os.environ.get('GITHUB_PAT') or getpass('GitHub personal access token: ')
if not GITHUB_PAT:
    raise ValueError('A GitHub personal access token is required (set GITHUB_PAT).')


def _nested_field(payload, key, field):
    """Return ``payload[key][field]``, or None when ``key`` is absent or its value is empty/None."""
    section = payload.get(key)
    return section.get(field) if section else None


def _build_event_record(event, repo):
    """Flatten one repository event into a dict of scalar columns.

    Parameters:
        event: event object yielded by ``repo.get_events()``; its ``raw_data``,
            ``id``, ``type``, ``created_at`` and ``actor`` attributes are read.
        repo: repository object the event was fetched from; ``name`` and
            ``full_name`` are read.

    Returns:
        dict with one entry per output column. Fields that do not apply to the
        event type are None.
    """
    # `or {}` / `or []` also covers keys that are present but null, which would
    # otherwise raise and cause the whole event to be dropped by the caller.
    payload = event.raw_data.get('payload') or {}
    commits = payload.get('commits') or []

    # PR details come either from a pull-request payload directly, or from an
    # issue payload whose issue carries a 'pull_request' entry.
    pull_request = payload.get('pull_request')
    issue = payload.get('issue') or {}
    if pull_request:
        pr_source = pull_request
    elif issue.get('pull_request'):
        pr_source = issue
    else:
        pr_source = {}

    actor = event.actor
    created_at = event.created_at

    return {
        'event_id': event.id,
        'event_type': event.type,
        'timestamp': created_at.isoformat() if created_at else None,
        'repo_name': repo.name,
        'repo_full_name': repo.full_name,
        'user_id': actor.id if actor else None,
        'user_name': actor.login if actor else None,
        'user_type': actor.type if actor else None,
        'payload_action': payload.get('action'),
        'commit_count': len(commits),
        'ref': payload.get('ref'),
        'ref_type': payload.get('ref_type'),
        'pr_title': pr_source.get('title'),
        'pr_id': pr_source.get('id'),
        # Messages are truncated to 100 characters; None when there are no commits.
        'commit_messages': [(commit.get('message') or '')[:100] for commit in commits] or None,
        'size': payload.get('size'),
        'forkee_name': _nested_field(payload, 'forkee', 'full_name'),
        'release_name': _nested_field(payload, 'release', 'name'),
        'comment_body': _nested_field(payload, 'comment', 'body'),
    }


# Authenticate with GitHub
auth = Auth.Token(GITHUB_PAT)
g = Github(auth=auth)
user = g.get_user()

# The user object is lazy: reading an attribute forces a request, so an invalid
# token fails here with a clear message instead of midway through the loop.
try:
    _ = user.login
except BadCredentialsException as exc:
    raise RuntimeError('GitHub rejected the token; check the GITHUB_PAT value.') from exc

repos = user.get_repos()

all_events_data = []
# Counted while iterating so the summary does not have to walk the paginated
# repository list a second time.
repos_processed = 0

print("Fetching events from repositories...")

for repo in repos:
    repos_processed += 1
    print(f"Processing: {repo.name}")

    try:
        # Get events for the repository
        for event in repo.get_events():
            try:
                all_events_data.append(_build_event_record(event, repo))
            except Exception as e:
                # Best effort: one malformed event must not abort the whole run.
                print(f"Error processing event {event.id}: {e}")
                continue

        # Add a small delay to avoid rate limiting
        time.sleep(0.1)

    except Exception as e:
        # Best effort: report the failure for this repository and move on.
        print(f"Error accessing events for repository {repo.name}: {e}")
        continue

# Create DataFrame
df = pd.DataFrame(all_events_data)

# Convert timestamp to datetime
if not df.empty and 'timestamp' in df.columns:
    df['timestamp'] = pd.to_datetime(df['timestamp'])

# Save to CSV
if not df.empty:
    # Both output files share one run timestamp so they can be paired later.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = f"github_events_{timestamp}.csv"
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    print(f"Data saved to {csv_filename}")

    # Save to JSON
    json_filename = f"github_events_{timestamp}.json"
    df.to_json(json_filename, orient='records', indent=2)
    print(f"Data saved to {json_filename}")

    # Print summary
    print("\nSummary:")
    print(f"Total events collected: {len(df)}")
    print(f"Total repositories processed: {repos_processed}")
    print(f"Unique event types: {df['event_type'].nunique()}")

    # Display sample data
    print("\nSample data:")
    print(df.head())

else:
    print("No events were collected.")

Fetching events from repositories...
Processing: test_project
Processing: Slack-and-Zendesk-Integration-Customer-Name-Standardization
Processing: .github
Processing: cyber-range
Processing: hateable
Processing: _management
Processing: _onboarding
Processing: _onboarding_data
Data saved to github_events_20250903_093605.csv
Data saved to github_events_20250903_093605.json

Summary:
Total events collected: 303
Total repositories processed: 8
Unique event types: 8

Sample data:
      event_id   event_type                 timestamp         repo_name  \
0  53063648635    PushEvent 2025-08-07 09:22:10+00:00           .github   
1  53300127149    PushEvent 2025-08-13 07:27:36+00:00          hateable   
2  52897474058  CreateEvent 2025-08-04 05:20:33+00:00       _onboarding   
3  53761037515  GollumEvent 2025-08-24 14:53:01+00:00  _onboarding_data   
4  53760813802  GollumEvent 2025-08-24 14:38:05+00:00  _onboarding_data   

               repo_full_name    user_id           user_name user_type