# Ruby Method Data Exploration

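This notebook explores the Ruby method data extracted to `../output/methods.json`: it loads the methods into a DataFrame, plots the method-length and per-repository distributions, and filters methods by length to prepare the data for graph neural network (GNN) training.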

In [None]:
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '..', 'src'))

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from data_processing import load_methods_json, methods_to_dataframe, filter_methods_by_length

## Load and Explore Data

In [None]:
# Read the extracted Ruby methods and turn them into a DataFrame.
# The path is relative, so it is resolved against the notebook's working directory.
methods_path = '../output/methods.json'
methods = load_methods_json(methods_path)
df = methods_to_dataframe(methods)

# Quick sanity check: how many rows were loaded and which columns exist.
print(f"Loaded {len(df)} Ruby methods")
print(f"Columns: {list(df.columns)}")

# Bare last expression so the notebook renders the first rows as a table.
df.head()

## Data Statistics

In [None]:
# Per-method line count: the number of '\n'-separated pieces in the raw source
# (newline count + 1). Stored on df as the 'line_count' column.
# NOTE(review): a source ending in '\n' is counted one line longer than it
# looks - confirm whether raw_source carries trailing newlines.
df['line_count'] = df['raw_source'].apply(lambda src: src.count('\n') + 1)

# One figure with two side-by-side panels.
fig, (ax_lengths, ax_repos) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: histogram of method lengths.
ax_lengths.hist(df['line_count'], bins=50, alpha=0.7)
ax_lengths.set_xlabel('Lines of Code')
ax_lengths.set_ylabel('Number of Methods')
ax_lengths.set_title('Distribution of Method Lengths')

# Right panel: how many methods each repository contributes.
repo_counts = df['repo_name'].value_counts()
ax_repos.bar(repo_counts.index, repo_counts.values)
ax_repos.set_xlabel('Repository')
ax_repos.set_ylabel('Number of Methods')
ax_repos.set_title('Methods per Repository')
ax_repos.tick_params(axis='x', labelrotation=45)  # tilt the repository names

fig.tight_layout()
plt.show()

# Numeric summary of the line-count column.
print("Method length statistics:")
print(df['line_count'].describe())

## Filter Data for Training

In [None]:
# Select training candidates by length, passing min_lines=5 and max_lines=50
# to the project's filter helper.
filtered_df = filter_methods_by_length(df, min_lines=5, max_lines=50)

# Report how many methods were kept and how many were dropped.
print(f"After filtering: {len(filtered_df)} methods")
print(f"Filtered out {len(df) - len(filtered_df)} methods")

# Preview the first three remaining methods, capping each at 200 characters.
print("\nExample Ruby methods:")
for idx, row in filtered_df.head(3).iterrows():
    print(f"\n--- Method from {row['repo_name']} ---")
    source = row['raw_source']
    if len(source) > 200:
        # Long source: show a 200-character prefix followed by an ellipsis.
        print(source[:200] + "...")
    else:
        print(source)