# CodeSearchNet Data Exploration

This notebook explores the CodeSearchNet data used to train the code summarization model: loading a sample, cleaning it, and inspecting length distributions and summary vocabulary.

In [None]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Add the project root (the parent of the current working directory) to the
# import path so that `src` modules can be imported. The membership check stops
# a re-run of this cell from appending the same entry a second time.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data_utils import load_codesearchnet_dataset, preprocess_data

## 1. Load Data
We load a sample of the CodeSearchNet dataset using the project's data loader.

In [None]:
# Value passed as the loader's `limit` argument (presumably a cap on the number
# of rows returned -- confirm in src.data_utils). Kept as a named constant so
# the sample size is easy to find and change.
SAMPLE_LIMIT = 5000

# Fetch the training split through the project's loader, report how many rows
# came back, and preview the first few.
df = load_codesearchnet_dataset(split="train", limit=SAMPLE_LIMIT)
print(f"Loaded {len(df)} samples")
df.head()

## 2. Preprocessing & Cleaning
We apply the same noise reduction and normalization steps used in training.

In [None]:
# Run the project's preprocessing step on the loaded sample.
# NOTE: `df` is rebound to the result, so the uncleaned frame is no longer
# reachable after this cell, and re-running the cell feeds already-processed
# data back into `preprocess_data`.
df = preprocess_data(df)
# Report how many rows are left after preprocessing.
print(f"Remaining samples after cleaning: {len(df)}")

## 3. Length Distribution
Let's look at the distribution of code and summary lengths (in characters).

In [None]:
# Attach character-length columns for the code and summary text. `assign`
# builds a new frame rather than writing into `df` in place, which sidesteps
# pandas' SettingWithCopyWarning should the preprocessing step have returned a
# filtered slice (cannot be confirmed from this notebook).
df = df.assign(
    code_len=df['code'].str.len(),
    summary_len=df['summary'].str.len(),
)

# One histogram panel per length column: (column, title, bar colour).
panels = [
    ('code_len', 'Code Length Distribution', 'skyblue'),
    ('summary_len', 'Summary Length Distribution', 'salmon'),
]

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Draw both panels with identical binning and axis labelling.
for ax, (column, title, color) in zip(axes, panels):
    sns.histplot(df[column], bins=50, ax=ax, color=color)
    ax.set_title(title)
    ax.set_xlabel('Characters')

plt.show()

## 4. Vocabulary Analysis
We take a quick look at the 20 most common words in the summaries (case-insensitive).

In [None]:
from collections import Counter
import re

# A token is a run of word characters and/or apostrophes.
token_pattern = re.compile(r"[\w']+")

# Drop missing summaries and coerce the rest to str so that a NaN or other
# non-string value cannot break tokenization.
summaries = df['summary'].dropna().astype(str)

# Tokenize each lower-cased summary on its own instead of joining everything
# into one large string first. The tokens are the same either way, because the
# joining space can never be part of a match.
words = [
    token
    for text in summaries
    for token in token_pattern.findall(text.lower())
]
counter = Counter(words)

# Keep the 20 most frequent tokens as a small frame for plotting.
common_words = counter.most_common(20)
cw_df = pd.DataFrame(common_words, columns=['word', 'count'])

fig, ax = plt.subplots(figsize=(12, 6))
# NOTE(review): seaborn >= 0.13 emits a FutureWarning for `palette` without
# `hue`; the documented replacement is `hue='word', legend=False`, which older
# seaborn releases do not accept. Confirm the pinned seaborn version before
# switching.
sns.barplot(x='count', y='word', data=cw_df, palette='viridis', ax=ax)
ax.set_title('Top 20 Most Common Words in Summaries')
plt.show()