## Analysis of Data Loader

This notebook evaluates the distrubtion of question type within training, validation and test datasets of each dataloader.

In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
!pip install seaborn



In [2]:
# Load libraries

from transformers import DistilBertModel, DistilBertForMaskedLM, DistilBertConfig, \
            DistilBertTokenizerFast, AutoTokenizer, BertModel, BertForMaskedLM, BertTokenizerFast, BertConfig
from torch import nn
from pathlib import Path
import torch
import pandas as pd
from typing import Optional
from tqdm.auto import tqdm
from torch.optim import AdamW, RMSprop
import numpy as np
from prettytable import PrettyTable

import sys
sys.path.append('/content/drive/MyDrive/distilBERT_SQuAD2_w266Project')
from qa_model import QuestionDistilBERT, SimpleQuestionDistilBERT, ReuseQuestionDistilBERT, Dataset, test_model
from util import eval_test_set, count_parameters, print_test_set_incorrect_predictions, \
                 analyze_test_set_performance, eval_test_set_by_category
from my_distilbert import QADataset

# Load tokenizer


In [3]:
# Load DistilBERT tokenizer, use uncased (lowercase) vocabulary

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Question Analyzer
Determine the distribiution of question types for each dataset

In [4]:
def analyze_question_distribution(data_loader, tokenizer):
    """
    Analyzes the distribution of question types in a dataset.

    Args:
        data_loader: DataLoader containing the dataset
        tokenizer: Tokenizer for decoding input IDs

    Returns:
        dict: Summary statistics of question type distribution
    """
    category_counts = {}

    def get_question_type(text):
        """Extract the question word from the input text"""
        question_words = ['who', 'what', 'when', 'where', 'why', 'how', 'which']
        words = text.lower().split()

        for word in words:
            if word in question_words:
                return word
        return 'other'

    total_samples = 0
    for batch in tqdm(data_loader):
        input_ids = batch['input_ids']

        # Process each sample in the batch
        for input_i in input_ids:
            # Decode the input text
            text = tokenizer.decode(input_i, skip_special_tokens=True)

            # Get question type
            q_type = get_question_type(text)

            # Update counts
            if q_type not in category_counts:
                category_counts[q_type] = 0
            category_counts[q_type] += 1
            total_samples += 1

    # Calculate summary statistics
    summary = {}
    for q_type, count in category_counts.items():
        summary[q_type] = {
            'count': count,
            'percentage': (count / total_samples) * 100
        }

    # Create formatted output
    pt = PrettyTable()
    pt.field_names = ['Question Type', 'Count', '% of Total']

    for q_type, stats in summary.items():
        pt.add_row([
            q_type,
            stats['count'],
            f"{stats['percentage']:.1f}%"
        ])

    print(f"\nTotal samples analyzed: {total_samples}")
    print(pt)
    return summary

# Load data_2 (Traditional split) -- Fixed-Length Trunc, 384 tokens dataloader

In [6]:
# Get paths for all SQuAD dataset text files in training directory -- data_2, Traditional Split --  Fixed-Length Trunc, 384 tokens
squad_paths_2 = [str(x) for x in Path('/content/drive/MyDrive/distilBERT_SQuAD2_w266Project/data_2/training_squad').glob('**/*.txt')]

# Create training dataset using only SQuAD data, DataLoader with batch size of 8
dataset_2_QA = QADataset(squad_paths = [str(x) for x in Path('/content/drive/MyDrive/distilBERT_SQuAD2_w266Project/data_2/training_squad').glob('**/*.txt')],
                  natural_question_paths=None,
                  hotpotqa_paths=None, tokenizer=tokenizer)
loader_2_QA = torch.utils.data.DataLoader(dataset_2_QA, batch_size=8)
print(f"Approximate Training Dataset Size: {len(dataset_2_QA)}")

## load the validation dataset -- used to be labeled as "test", test_dataset changed to val_dataset, test_loader changed to val_loader
val_dataset_2_QA = QADataset(squad_paths = [str(x) for x in Path('/content/drive/MyDrive/distilBERT_SQuAD2_w266Project/data_2/validation_squad').glob('**/*.txt')],
                       natural_question_paths=None,
                       hotpotqa_paths = None, tokenizer=tokenizer)
val_loader_2_QA = torch.utils.data.DataLoader(val_dataset_2_QA, batch_size=4)
print(f"Approximate Validation Dataset Size: {len(val_dataset_2_QA)}")

## load the test dataset -- test_dataset and test_loader should not be used during training
test_dataset_2_QA = QADataset(squad_paths = [str(x) for x in Path('/content/drive/MyDrive/distilBERT_SQuAD2_w266Project/data_2/test_squad').glob('**/*.txt')],
                       natural_question_paths=None,
                       hotpotqa_paths = None, tokenizer=tokenizer)
test_loader_2_QA = torch.utils.data.DataLoader(test_dataset_2_QA, batch_size=4)
print(f"Approximate Test Dataset Size: {len(test_dataset_2_QA)}")

Loaded 113506 total samples
Approximate Training Dataset Size: 113506
Loaded 14181 total samples
Approximate Validation Dataset Size: 14181
Loaded 14190 total samples
Approximate Test Dataset Size: 14190


In [7]:
# Analyze all three splits for the QADataset -- 384 tokens
print("Analyzing training set...")
train_dist = analyze_question_distribution(loader_2_QA, tokenizer)

print("\nAnalyzing validation set...")
val_dist = analyze_question_distribution(val_loader_2_QA, tokenizer)

print("\nAnalyzing test set...")
test_dist = analyze_question_distribution(test_loader_2_QA, tokenizer)

Analyzing training set...


  0%|          | 0/14189 [00:00<?, ?it/s]


Total samples analyzed: 113506
+---------------+-------+------------+
| Question Type | Count | % of Total |
+---------------+-------+------------+
|      when     |  7603 |    6.7%    |
|     where     |  4699 |    4.1%    |
|      who      | 11832 |   10.4%    |
|      how      | 11677 |   10.3%    |
|      what     | 65250 |   57.5%    |
|     which     |  8435 |    7.4%    |
|     other     |  2330 |    2.1%    |
|      why      |  1680 |    1.5%    |
+---------------+-------+------------+

Analyzing validation set...


  0%|          | 0/3546 [00:00<?, ?it/s]


Total samples analyzed: 14181
+---------------+-------+------------+
| Question Type | Count | % of Total |
+---------------+-------+------------+
|      what     |  8123 |   57.3%    |
|      how      |  1484 |   10.5%    |
|      why      |  255  |    1.8%    |
|      who      |  1477 |   10.4%    |
|     other     |  324  |    2.3%    |
|     which     |  1021 |    7.2%    |
|      when     |  925  |    6.5%    |
|     where     |  572  |    4.0%    |
+---------------+-------+------------+

Analyzing test set...


  0%|          | 0/3548 [00:00<?, ?it/s]


Total samples analyzed: 14190
+---------------+-------+------------+
| Question Type | Count | % of Total |
+---------------+-------+------------+
|      what     |  8188 |   57.7%    |
|      when     |  958  |    6.8%    |
|      how      |  1422 |   10.0%    |
|      who      |  1469 |   10.4%    |
|      why      |  179  |    1.3%    |
|     other     |  283  |    2.0%    |
|     where     |  627  |    4.4%    |
|     which     |  1064 |    7.5%    |
+---------------+-------+------------+


# Load data_2 (Traditional split) -- Variable-Length Trunc, 512 tokens dataloader

In [9]:
# Get paths for all SQuAD dataset text files in training directory -- data_2, Traditional Split -- Variable-Length Trunc, 512 tokens
squad_paths_2 = [str(x) for x in Path('/content/drive/MyDrive/distilBERT_SQuAD2_w266Project/data_2/training_squad').glob('**/*.txt')]

# Create training dataset using only SQuAD data, DataLoader with batch size of 8
dataset_2 = Dataset(squad_paths = [str(x) for x in Path('/content/drive/MyDrive/distilBERT_SQuAD2_w266Project/data_2/training_squad').glob('**/*.txt')],
                  natural_question_paths=None,
                  hotpotqa_paths=None, tokenizer=tokenizer)
loader_2 = torch.utils.data.DataLoader(dataset_2, batch_size=8)
print(f"Approximate Training Dataset Size: {len(dataset_2)}")

## load the validation dataset -- used to be labeled as "test", test_dataset changed to val_dataset, test_loader changed to val_loader
val_dataset_2 = Dataset(squad_paths = [str(x) for x in Path('/content/drive/MyDrive/distilBERT_SQuAD2_w266Project/data_2/validation_squad').glob('**/*.txt')],
                       natural_question_paths=None,
                       hotpotqa_paths = None, tokenizer=tokenizer)
val_loader_2 = torch.utils.data.DataLoader(val_dataset_2, batch_size=4)
print(f"Approximate Validation Dataset Size: {len(val_dataset_2)}")

## load the test dataset -- test_dataset and test_loader should not be used during training
test_dataset_2 = Dataset(squad_paths = [str(x) for x in Path('/content/drive/MyDrive/distilBERT_SQuAD2_w266Project/data_2/test_squad').glob('**/*.txt')],
                       natural_question_paths=None,
                       hotpotqa_paths = None, tokenizer=tokenizer)
test_loader_2 = torch.utils.data.DataLoader(test_dataset_2, batch_size=4)
print(f"Approximate Test Dataset Size: {len(test_dataset_2)}")

Approximate Training Dataset Size: 113000
Approximate Validation Dataset Size: 14000
Approximate Test Dataset Size: 14000


In [10]:
# Analyze all three splits for the Dataset -- 512 tokens
print("Analyzing training set...")
train_dist = analyze_question_distribution(loader_2, tokenizer)

print("\nAnalyzing validation set...")
val_dist = analyze_question_distribution(val_loader_2, tokenizer)

print("\nAnalyzing test set...")
test_dist = analyze_question_distribution(test_loader_2, tokenizer)

Analyzing training set...


  0%|          | 0/14125 [00:00<?, ?it/s]


Total samples analyzed: 113000
+---------------+-------+------------+
| Question Type | Count | % of Total |
+---------------+-------+------------+
|      when     |  7570 |    6.7%    |
|     where     |  4679 |    4.1%    |
|      who      | 11786 |   10.4%    |
|      how      | 11616 |   10.3%    |
|      what     | 64957 |   57.5%    |
|     which     |  8399 |    7.4%    |
|     other     |  2320 |    2.1%    |
|      why      |  1673 |    1.5%    |
+---------------+-------+------------+

Analyzing validation set...


  0%|          | 0/3500 [00:00<?, ?it/s]


Total samples analyzed: 14000
+---------------+-------+------------+
| Question Type | Count | % of Total |
+---------------+-------+------------+
|      what     |  8019 |   57.3%    |
|      how      |  1469 |   10.5%    |
|      why      |  251  |    1.8%    |
|      who      |  1458 |   10.4%    |
|     other     |  316  |    2.3%    |
|     which     |  1011 |    7.2%    |
|      when     |  911  |    6.5%    |
|     where     |  565  |    4.0%    |
+---------------+-------+------------+

Analyzing test set...


  0%|          | 0/3500 [00:00<?, ?it/s]


Total samples analyzed: 14000
+---------------+-------+------------+
| Question Type | Count | % of Total |
+---------------+-------+------------+
|      what     |  8084 |   57.7%    |
|      when     |  946  |    6.8%    |
|      how      |  1400 |   10.0%    |
|      who      |  1453 |   10.4%    |
|      why      |  175  |    1.2%    |
|     other     |  278  |    2.0%    |
|     where     |  619  |    4.4%    |
|     which     |  1045 |    7.5%    |
+---------------+-------+------------+


In [16]:
import pandas as pd
import numpy as np
from scipy import stats

# Fixed Length Data
fixed_length = {
    'train': {
        'total': 113506,
        'when': 7603,
        'where': 4699,
        'who': 11832,
        'how': 11677,
        'what': 65250,
        'which': 8435,
        'other': 2330,
        'why': 1680
    },
    'val': {
        'total': 14181,
        'when': 925,
        'where': 572,
        'who': 1477,
        'how': 1484,
        'what': 8123,
        'which': 1021,
        'other': 324,
        'why': 255
    },
    'test': {
        'total': 14190,
        'when': 958,
        'where': 627,
        'who': 1469,
        'how': 1422,
        'what': 8188,
        'which': 1064,
        'other': 283,
        'why': 179
    }
}

# Variable Length Data
variable_length = {
    'train': {
        'total': 113000,
        'when': 7570,
        'where': 4679,
        'who': 11786,
        'how': 11616,
        'what': 64957,
        'which': 8399,
        'other': 2320,
        'why': 1673
    },
    'val': {
        'total': 14000,
        'when': 911,
        'where': 565,
        'who': 1458,
        'how': 1469,
        'what': 8019,
        'which': 1011,
        'other': 316,
        'why': 251
    },
    'test': {
        'total': 14000,
        'when': 946,
        'where': 619,
        'who': 1453,
        'how': 1400,
        'what': 8084,
        'which': 1045,
        'other': 278,
        'why': 175
    }
}

def analyze_differences(split):
    """Perform statistical analysis for a given split"""
    fixed = fixed_length[split]
    variable = variable_length[split]

    # Create contingency table
    question_types = ['when', 'where', 'who', 'how', 'what', 'which', 'other', 'why']
    observed = np.array([
        [fixed[qt] for qt in question_types],
        [variable[qt] for qt in question_types]
    ])

    # Perform chi-square test
    chi2, p_value, dof, expected = stats.chi2_contingency(observed)

    # Calculate standardized residuals
    expected = np.array(expected)
    residuals = (observed - expected) / np.sqrt(expected)

    return {
        'chi2': chi2,
        'p_value': p_value,
        'dof': dof,
        'residuals': pd.DataFrame(
            residuals,
            index=['fixed', 'variable'],
            columns=question_types
        )
    }

# Analyze each split
results = {}
for split in ['train', 'val', 'test']:
    results[split] = analyze_differences(split)
    print(f"\nResults for {split} split:")
    print(f"Chi-square statistic: {results[split]['chi2']:.4f}")
    print(f"p-value: {results[split]['p_value']:.4e}")
    print(f"Degrees of freedom: {results[split]['dof']}")
    print("\nStandardized residuals (>1.96 or <-1.96 indicates significant difference):")
    print(results[split]['residuals'].round(3))

    # Print significant differences
    residuals = results[split]['residuals']
    significant = np.abs(residuals) > 1.96
    if significant.any().any():
        print("\nSignificant differences found in:")
        for col in residuals.columns:
            if significant.loc['fixed', col] or significant.loc['variable', col]:
                fixed_count = fixed_length[split][col]
                variable_count = variable_length[split][col]
                fixed_pct = (fixed_count / fixed_length[split]['total']) * 100
                variable_pct = (variable_count / variable_length[split]['total']) * 100
                print(f"* {col}:")
                print(f"  - Fixed: {fixed_count} ({fixed_pct:.1f}%)")
                print(f"  - Variable: {variable_count} ({variable_pct:.1f}%)")
                print(f"  - Residual: {residuals.loc['fixed', col]:.3f}")


Results for train split:
Chi-square statistic: 0.0058
p-value: 1.0000e+00
Degrees of freedom: 7

Standardized residuals (>1.96 or <-1.96 indicates significant difference):
           when  where    who    how   what  which  other    why
fixed    -0.005 -0.007 -0.031  0.041  0.004 -0.009 -0.004 -0.006
variable  0.005  0.007  0.031 -0.042 -0.004  0.009  0.004  0.006

Results for val split:
Chi-square statistic: 0.0374
p-value: 1.0000e+00
Degrees of freedom: 7

Standardized residuals (>1.96 or <-1.96 indicates significant difference):
           when  where    who    how   what  which  other    why
fixed     0.036 -0.006  0.002 -0.051  0.002 -0.048  0.108  0.024
variable -0.037  0.006 -0.002  0.052 -0.002  0.048 -0.109 -0.024

Results for test split:
Chi-square statistic: 0.0312
p-value: 1.0000e+00
Degrees of freedom: 7

Standardized residuals (>1.96 or <-1.96 indicates significant difference):
           when  where    who   how   what  which  other    why
fixed    -0.013 -0.008 -0.048 