# DataScience-Instruct-500K — Sample Viewer

This notebook displays **one sample** from every file under the `reasoning/`, `interation/`, and `RL/` subdirectories, followed by a summary table of per-file sizes and record counts.

In [10]:
import json
import os
import glob
import pprint
import pandas as pd

# Relative path to the root directory of the dataset files.
DATA_ROOT = "../data/DataScience-Instruct-500K"

# A value of None removes the pandas display limit for that option, so
# long cell contents and wide frames are rendered without truncation.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

In [11]:
def load_one_sample_json(filepath):
    """Return the first sample stored in a JSON file.

    The whole file is parsed with ``json.load`` (cost scales with file size);
    only the first element is handed back.

    Args:
        filepath: Path to a JSON file. It is read as UTF-8.

    Returns:
        The first element when the top-level JSON value is a non-empty list;
        otherwise the parsed top-level value unchanged (this includes an
        empty list, which is returned as ``[]``).
    """
    # Pin the encoding: without it, open() falls back to the platform default,
    # which is not UTF-8 everywhere and can mis-decode or reject non-ASCII text.
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    if isinstance(data, list) and data:
        return data[0]
    return data

def load_one_sample_parquet(filepath):
    """Return the first row of a parquet file as a dict.

    The whole file is read with ``pd.read_parquet``; only the first row is
    converted and returned.

    Args:
        filepath: Path to a parquet file.

    Returns:
        A dict mapping column name to the first row's value, or an empty
        dict when the file contains no rows.
    """
    df = pd.read_parquet(filepath)
    # iloc[0] raises IndexError on a zero-row frame; report "no sample"
    # instead so one empty file does not abort a loop over many files.
    if len(df) == 0:
        return {}
    return df.iloc[0].to_dict()

def display_sample(sample, filepath):
    """Write one sample to stdout beneath a banner naming its source file.

    Args:
        sample: The object to pretty-print.
        filepath: Path the sample was loaded from; its base name and the
            full path are both shown in the banner.
    """
    rule = "=" * 80
    banner = (
        rule,
        f"FILE: {os.path.basename(filepath)}",
        f"PATH: {filepath}",
        rule,
    )
    for line in banner:
        print(line)
    # width/depth bound the dump: lines wrap at 120 columns and nesting
    # below four levels is elided by pprint.
    pprint.pprint(sample, width=120, depth=4)
    print()

---
## 1. Reasoning

One sample from each file in `reasoning/`.

In [12]:
# Collect the JSON files sitting directly inside reasoning/, sorted by path.
reasoning_dir = os.path.join(DATA_ROOT, "reasoning")
reasoning_pattern = os.path.join(reasoning_dir, "*.json")
reasoning_files = sorted(glob.glob(reasoning_pattern))

print(f"Found {len(reasoning_files)} files in reasoning/")
for name in map(os.path.basename, reasoning_files):
    print(f"  - {name}")

Found 15 files in reasoning/
  - SKGInstruct_199989.json
  - TableGPT_29448.json
  - TableQA_distillation_39301.json
  - TableQA_original_35357.json
  - TableQA_refinement_39301.json
  - code_20000.json
  - dscode_16338.json
  - file_any_2520.json
  - file_csv_3007.json
  - file_database_3833.json
  - file_xlsx_3663.json
  - instruction_following_20000.json
  - math_20000.json
  - other_19998.json
  - science_20000.json


In [13]:
# Show the first sample of each reasoning file, in listing order.
for filepath in reasoning_files:
    display_sample(load_one_sample_json(filepath), filepath)

FILE: SKGInstruct_199989.json
PATH: ../data/DataScience-Instruct-500K/reasoning/SKGInstruct_199989.json
{'evaluation': {'ability': 'Reasoning', 'difficulty': 3, 'quality': 5},
 'id': 512069,
 'input_tokens': 151,
 'messages': [{'content': 'Write your answer to the question based on your reasoning given the information in the '
                          'following table table:\n'
                          '\n'
                          'col : stem | leaf  row 1 : 3 | 3, 3, 3, 5, 5 row 2 : 4 | 6 row 3 : 5 | 4, 5, 7, 8 row 4 : 6 '
                          '| 7, 8 row 5 : 7 | 2, 3, 7, 9 row 6 : 8 | 6, 8, 9\n'
                          '\n'
                          '\n'
                          'question:\n'
                          '\n'
                          "The members of the local garden club tallied the number of plants in each person's garden. "
                          'How many gardens have at least 47 plants?',
               'role': 'user'},
              {'content': '<An

---
## 2. Interaction (`interation/`)

One sample from each file in `interation/`.

In [14]:
# Collect the JSON files sitting directly inside interation/, sorted by path.
interation_dir = os.path.join(DATA_ROOT, "interation")
interation_pattern = os.path.join(interation_dir, "*.json")
interation_files = sorted(glob.glob(interation_pattern))

print(f"Found {len(interation_files)} files in interation/")
for name in map(os.path.basename, interation_files):
    print(f"  - {name}")

Found 12 files in interation/
  - data_analysis_3936.json
  - data_cleaning_1616.json
  - data_insight_1062.json
  - data_pipeline_3601.json
  - data_preparation_3311.json
  - research_data_analysis_1339.json
  - research_data_insight_1351.json
  - research_data_preparation_488.json
  - research_database_818.json
  - research_other_3505.json
  - research_report_generation_4327.json
  - research_xlsx_848.json


In [15]:
# Show the first sample of each interation file, in listing order.
for filepath in interation_files:
    display_sample(load_one_sample_json(filepath), filepath)

FILE: data_analysis_3936.json
PATH: ../data/DataScience-Instruct-500K/interation/data_analysis_3936.json
{'evaluation': {'ability': ['Data Preparation',
                            'Data Analysis',
                            'Data Visualization',
                            'Data Modeling',
                            'Data Insight',
                            'Code Generation',
                            'Reasoning',
                            'Numerical Calculation',
                            'Instruction Following',
                            'Structural Understanding'],
                'difficulty': 5,
                'quality': 5},
 'id': 4,
 'input_tokens': 299,
 'messages': [{'content': '# Instruction\n'
                          "Integrate the 'all_star' and 'fielding' tables via player_id and year to categorize players "
                          "as all-stars (present in 'all_star') or non-all-stars (present in 'fielding' but not "
                          "'all_star'

---
## 3. RL

One sample from each file in `RL/` (parquet files, excluding `data.zip`).

In [None]:
# Collect the parquet files sitting directly inside RL/, sorted by path.
rl_dir = os.path.join(DATA_ROOT, "RL")
rl_pattern = os.path.join(rl_dir, "*.parquet")
rl_files = sorted(glob.glob(rl_pattern))

print(f"Found {len(rl_files)} parquet files in RL/")
for name in map(os.path.basename, rl_files):
    print(f"  - {name}")

In [None]:
# Show the first row of each RL parquet file, in listing order.
for filepath in rl_files:
    display_sample(load_one_sample_parquet(filepath), filepath)

---
## 4. Summary Table

Quick overview of the size and record count of every file across all three subdirectories, plus the total number of records.

In [None]:
def _count_json_records(path):
    """Return ``len()`` of the top-level JSON value stored at ``path``."""
    # The context manager closes the handle as soon as parsing finishes, and
    # the explicit encoding avoids depending on the platform default.
    with open(path, "r", encoding="utf-8") as fh:
        return len(json.load(fh))


def _count_parquet_records(path):
    """Return the number of rows in the parquet file at ``path``."""
    return len(pd.read_parquet(path))


# Fixed column order; also guarantees the columns exist when no files were
# found, so the 'Records' total below still works on an empty table.
summary_columns = ["Subdirectory", "File", "Size (MB)", "Records"]
summary_rows = []

# Each entry: (label shown in the table, files to scan, record counter).
for subdir, files, loader in [
    ("reasoning", reasoning_files, _count_json_records),
    ("interation", interation_files, _count_json_records),
    ("RL", rl_files, _count_parquet_records),
]:
    for fp in files:
        fname = os.path.basename(fp)
        size_mb = os.path.getsize(fp) / (1024 * 1024)
        n_records = loader(fp)
        summary_rows.append({
            "Subdirectory": subdir,
            "File": fname,
            "Size (MB)": round(size_mb, 1),
            "Records": n_records,
        })

summary_df = pd.DataFrame(summary_rows, columns=summary_columns)
display(summary_df)
print(f"\nTotal records: {summary_df['Records'].sum():,}")