In [None]:
!pip install PyPDF2 pandas pypdf2


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


**Extract Text from PDF**

For the purpose of validation, we need to extract the raw text from the PDF and convert it into a structured form (e.g., tables, sections). Here's how you can extract text from a PDF using PyPDF2.

In [None]:
from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_path):
    """
    Read a PDF and return the text of all its pages as a single string.

    Page texts are concatenated in page order with no separator between pages.

    :param pdf_path: Location of the PDF, passed straight to ``PdfReader``.
    :return: The combined text of every page.
    """
    document = PdfReader(pdf_path)
    return "".join(page.extract_text() for page in document.pages)

# Example usage
pdf_path = '/content/test_data_validation_pdf.pdf'
pdf_text = extract_text_from_pdf(pdf_path)
print(pdf_text)


Test PDF for Data Validation

Section 1: Descriptions (Raw Text Validation)
This section includes general information about a fictional product. The product was launched in 2023, with
an expected growth rate of 10% per annum. Further insights include a focus on developing AI technologies
by 2025.

Section 2: Table (Tabular Data Validation)
Name Date of Birth Email
John Doe 1980-05-12 john.doe@email.com
Jane Smith 1990-07-23 jane.smith.email.com
Mike Brown 1975-11-05 mike.brown@email.com

Section 3: Regular Expression Validation
This section includes examples of text that should match specific patterns.
Contact: (555)-123-4567. Expected date: 12-12-2024.
Please validate the date formats and phone number patterns.


For a structured PDF, such as a requirements document, we might want to check whether certain fields have consistent values (e.g., all requirement IDs are unique or all dates are in the correct format).

In [None]:
import re

def check_for_sections(text, section_names):
    """
    Check if specific sections (e.g., "Requirements", "Specifications") exist in the PDF text.

    Each name is matched case-insensitively as a *literal* substring. Names are
    escaped before being handed to the regex engine, so titles containing regex
    metacharacters such as "(", ")", "+" or "." are matched verbatim instead of
    raising ``re.error`` or matching unintended text.

    :param text: The extracted text from the PDF.
    :param section_names: List of sections to look for.
    :return: None. The outcome is printed: either the comma-separated list of
             missing sections or a confirmation that all are present.
    """
    missing_sections = [
        section
        for section in section_names
        # re.escape turns the title into a literal pattern.
        if re.search(re.escape(section), text, re.IGNORECASE) is None
    ]

    if missing_sections:
        print(f"Error: The following sections are missing: {', '.join(missing_sections)}")
    else:
        print("All required sections are present.")

# Check for key sections in the PDF
sections_to_validate = ["Requirements", "Section 2: Table", "10%"]
check_for_sections(pdf_text, sections_to_validate)


Error: The following sections are missing: Requirements


In [None]:
import pandas as pd

def validate_consistency_of_ids(text, id_pattern=r"\bSection \d+\b"):
    """
    Validate the uniqueness of IDs in the text.

    Every match of ``id_pattern`` is treated as one ID occurrence; the IDs are
    considered unique when no match text occurs more than once.

    :param text: Extracted text from the PDF.
    :param id_pattern: Regular expression identifying an ID. The default matches
                       headings of the form "Section <number>"; pass e.g.
                       r"\\bREQ-\\d+\\b" to validate requirement IDs instead.
    :return: None. Prints whether duplicates were found.
    """
    ids = re.findall(id_pattern, text)
    # A set drops repeats, so a size difference means at least one duplicate.
    if len(ids) != len(set(ids)):
        print("Error: Duplicate IDs found.")
    else:
        print("All IDs are unique.")

# Validate consistency of IDs
validate_consistency_of_ids(pdf_text)


All IDs are unique.


If the PDF contains tables that need to be extracted, validated, and processed further, we need to use a more advanced method. For example, we could use libraries like tabula-py to extract tabular data.

In [None]:
!pip install tabula-py


Collecting tabula-py
  Downloading tabula_py-2.9.3-py3-none-any.whl.metadata (7.6 kB)
Downloading tabula_py-2.9.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m85.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tabula-py
Successfully installed tabula-py-2.9.3


In [None]:
import tabula

def extract_tables_from_pdf(pdf_path):
    """
    Pull every table out of a PDF with tabula.

    All pages are scanned and multiple tables are requested.

    :param pdf_path: Path to the PDF file.
    :return: Whatever ``tabula.read_pdf`` returns for these options
             (a list of DataFrames, one per detected table).
    """
    read_options = {"pages": 'all', "multiple_tables": True}
    return tabula.read_pdf(pdf_path, **read_options)

# Extract tables from the PDF
tables = extract_tables_from_pdf(pdf_path)

# Show first table
if tables:
    print(tables[0].head())
else:
    print("No tables found in the PDF.")


Sep 10, 2024 7:47:48 AM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider <init>
Sep 10, 2024 7:47:48 AM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider <init>
Sep 10, 2024 7:47:48 AM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 10, 2024 7:47:48 AM org.apache.pdfbox.pdmodel.font.PDType1Font <init>
Sep 10, 2024 7:47:48 AM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode
Sep 10, 2024 7:47:49 AM org.apache.pdfbox.rendering.Type1Glyph2D getPathForCharacterCode



         Name Date of Birth                 Email
0    John Doe    1980-05-12    john.doe@email.com
1  Jane Smith    1990-07-23  jane.smith.email.com
2  Mike Brown    1975-11-05  mike.brown@email.com


Once the data is extracted, you can run more comprehensive data validation tests, such as:

Missing value checks: Ensure required fields are not empty.
Data type validation: Validate that the extracted values conform to expected data types.
Outlier detection: Use statistical methods to find anomalies in the data.

In [None]:
import re
from PyPDF2 import PdfReader

# Step 1: Extract Text Data from the PDF
def extract_pdf_text(pdf_file):
    """Return the text of every page of ``pdf_file``, concatenated in page order."""
    page_texts = []
    for page in PdfReader(pdf_file).pages:
        page_texts.append(page.extract_text())
    # No separator is inserted between pages.
    return "".join(page_texts)

# Step 2: Raw Text Validation
def validate_raw_text(text):
    """
    Check that the free-text section mentions the expected facts.

    :param text: Extracted PDF text.
    :return: A list of issue messages when something is missing, otherwise the
             string "Raw text validation passed.".
    """
    # (substring that must be present, message reported when it is absent)
    required_mentions = (
        ("2023", "Year 2023 not mentioned in the text."),
        ("10%", "Growth rate of 10% not mentioned in the text."),
    )
    problems = [message for needle, message in required_mentions if needle not in text]

    if problems:
        return problems
    return "Raw text validation passed."

# Step 3: Tabular Data Validation (Check Email and Date of Birth)
def validate_tabular_data(text):
    """
    Validate the e-mail addresses and dates of birth found in the table text.

    E-mail candidates come from two sources:

    1. every token that contains an "@" (anywhere in the text), and
    2. the last cell of each table row, i.e. the token that follows a
       YYYY-MM-DD date at the end of a line.

    Source 2 is what catches addresses that are missing the "@" altogether
    (e.g. "jane.smith.email.com"); scanning only for "@" tokens can never see
    them. This is a line-based heuristic: any line ending in
    "<date> <token>" is treated as a table row.

    :param text: Extracted PDF text.
    :return: A list of issue messages, or the string
             "Tabular data validation passed." when nothing is wrong.
    """
    issues = []
    email_shape = r"[^@]+@[^@]+\.[^@]+"

    # Source 1: tokens that at least look like they were meant to be addresses.
    at_tokens = re.findall(r'[\w\.-]+@[\w\.-]+', text)
    invalid_emails = [email for email in at_tokens if not re.match(email_shape, email)]

    # Source 2: the e-mail column of table rows (token after the date of birth).
    row_cells = re.findall(r'\d{4}-\d{2}-\d{2}[ \t]+(\S+)\s*$', text, re.MULTILINE)
    for cell in row_cells:
        # Skip cells already reported through source 1 to avoid duplicates.
        if not re.match(email_shape, cell) and cell not in invalid_emails:
            invalid_emails.append(cell)

    if invalid_emails:
        issues.append(f"Invalid email addresses found: {invalid_emails}")

    # Validate dates in format YYYY-MM-DD
    dates = re.findall(r'\d{4}-\d{2}-\d{2}', text)
    if not dates:
        issues.append("No valid dates of birth found in the table.")

    return issues if issues else "Tabular data validation passed."

# Step 4: Regular Expression Validation (Check phone number and date format)
def validate_regex_patterns(text):
    """
    Confirm that the text contains at least one phone number and one date in
    the expected formats.

    :param text: Extracted PDF text.
    :return: A list of issue messages for each pattern with no match, or the
             string "Regex pattern validation passed." when both are found.
    """
    # (pattern that must occur at least once, message reported when it does not)
    expectations = (
        (r'\(\d{3}\)-\d{3}-\d{4}', "No valid phone numbers found in the format (555)-123-4567."),
        (r'\d{2}-\d{2}-\d{4}', "No valid expected dates found in the format dd-mm-yyyy."),
    )
    failures = [message for pattern, message in expectations if re.search(pattern, text) is None]
    return failures or "Regex pattern validation passed."

# Main function to validate the PDF
def validate_pdf(pdf_file):
    """
    Extract the text of ``pdf_file`` and print the result of each validation stage.

    The stages run in a fixed order (raw text, tabular data, regex patterns);
    each prints a heading followed by whatever its validator returns.

    :param pdf_file: Path of the PDF to validate.
    :return: None.
    """
    text = extract_pdf_text(pdf_file)

    stages = (
        ("\nRaw Text Validation:", validate_raw_text),
        ("\nTabular Data Validation:", validate_tabular_data),
        ("\nRegex Pattern Validation:", validate_regex_patterns),
    )
    for heading, validator in stages:
        print(heading)
        print(validator(text))


# Run the validation on the provided PDF
pdf_file_path = "/content/test_data_validation_pdf.pdf"  # Replace with actual PDF path
validate_pdf(pdf_file_path)



Raw Text Validation:
Raw text validation passed.

Tabular Data Validation:
Tabular data validation passed.

Regex Pattern Validation:
Regex pattern validation passed.


Let's implement image data validation using a small image dataset. I'll use a subset of the **CIFAR-10** dataset, which is available through the `torchvision` package. We'll load a few images, apply basic validation (like file type, size, and color mode checks), and show how to implement this in code.

### Steps:
1. Install required libraries.
2. Download a subset of the CIFAR-10 dataset.
3. Apply the validation functions for file types, dimensions, and color modes.

### 1. Install Required Libraries

```bash
!pip install pillow
!pip install torchvision
```

### 2. Download and Load the CIFAR-10 Dataset

```python
import os
from torchvision import datasets, transforms
from torchvision.utils import save_image

# Create a directory to store the images
os.makedirs('cifar10_images', exist_ok=True)

# Define the transform to convert images to tensor and save them
transform = transforms.Compose([transforms.ToTensor()])

# Download and load the CIFAR-10 training dataset (the full set is downloaded; only the first 10 images are saved below)
dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)

# Save the first 10 images from the dataset
for i in range(10):
    image, label = dataset[i]
    save_image(image, f'cifar10_images/image_{i}.png')
```

### 3. Implement Image Validation Functions

We'll use the `Pillow` library to handle image operations and run validations similar to what we outlined before.

```python
from PIL import Image
import os
import numpy as np

# Define the path to your image dataset folder
IMAGE_DIR = "cifar10_images"

# Step 1: File Type Validation
def validate_file_types(image_dir, valid_extensions=['.jpg', '.png']):
    invalid_files = []
    for filename in os.listdir(image_dir):
        ext = os.path.splitext(filename)[-1].lower()
        if ext not in valid_extensions:
            invalid_files.append(filename)
    if invalid_files:
        print(f"Invalid file types found: {invalid_files}")
    else:
        print("All files have valid image extensions.")

# Step 2: Corrupted File Detection
def check_corrupted_images(image_dir):
    corrupted_files = []
    for filename in os.listdir(image_dir):
        file_path = os.path.join(image_dir, filename)
        try:
            with Image.open(file_path) as img:
                img.verify()  # Check if image can be opened
        except (IOError, SyntaxError) as e:
            corrupted_files.append(filename)
    if corrupted_files:
        print(f"Corrupted files found: {corrupted_files}")
    else:
        print("No corrupted files found.")

# Step 3: Dimension Check (exact width x height match; aspect ratio is not checked separately)
def validate_image_dimensions(image_dir, expected_size=(32, 32)):
    incorrect_size_files = []
    for filename in os.listdir(image_dir):
        file_path = os.path.join(image_dir, filename)
        try:
            with Image.open(file_path) as img:
                if img.size != expected_size:
                    incorrect_size_files.append((filename, img.size))
        except Exception as e:
            continue  # Skip files that can't be opened
    if incorrect_size_files:
        print(f"Images with incorrect dimensions found: {incorrect_size_files}")
    else:
        print(f"All images have the expected dimensions of {expected_size}.")

# Step 4: Color Mode Verification (RGB or Grayscale)
def validate_color_mode(image_dir, expected_mode='RGB'):
    incorrect_mode_files = []
    for filename in os.listdir(image_dir):
        file_path = os.path.join(image_dir, filename)
        try:
            with Image.open(file_path) as img:
                if img.mode != expected_mode:
                    incorrect_mode_files.append((filename, img.mode))
        except Exception as e:
            continue
    if incorrect_mode_files:
        print(f"Images with incorrect color mode found: {incorrect_mode_files}")
    else:
        print(f"All images have the expected color mode: {expected_mode}.")

# Step 5: Outlier Detection (by file size)
def detect_outliers_by_size(image_dir, threshold_factor=2):
    sizes = []
    for filename in os.listdir(image_dir):
        file_path = os.path.join(image_dir, filename)
        sizes.append(os.path.getsize(file_path))
    
    avg_size = np.mean(sizes)
    std_dev = np.std(sizes)
    
    outliers = [(filename, size) for filename, size in zip(os.listdir(image_dir), sizes) if abs(size - avg_size) > threshold_factor * std_dev]
    if outliers:
        print(f"Outliers found based on size: {outliers}")
    else:
        print("No outliers detected based on file size.")

# Main function to perform all validation steps
def validate_image_dataset(image_dir):
    print("Starting image dataset validation...\n")
    
    # Step 1: Validate file types
    print("Validating file types...")
    validate_file_types(image_dir)
    
    # Step 2: Check for corrupted files
    print("\nChecking for corrupted files...")
    check_corrupted_images(image_dir)
    
    # Step 3: Validate image dimensions
    print("\nValidating image dimensions...")
    validate_image_dimensions(image_dir, expected_size=(32, 32))
    
    # Step 4: Validate color mode
    print("\nValidating image color mode...")
    validate_color_mode(image_dir, expected_mode='RGB')
    
    # Step 5: Detect outliers by file size
    print("\nDetecting outliers by file size...")
    detect_outliers_by_size(image_dir)

# Run the validation on your image dataset
validate_image_dataset(IMAGE_DIR)
```

### Explanation:

1. **File Type Validation**:
   - Checks if all files in the dataset have valid image extensions like `.jpg` or `.png`.

2. **Corrupted File Detection**:
   - Opens each image to check if it can be opened or if it's corrupted.

3. **Dimension Check**:
   - Validates that all images have the correct dimensions (32x32 in the case of CIFAR-10).

4. **Color Mode Verification**:
   - Ensures all images are in RGB mode.

5. **Outlier Detection by File Size**:
   - Detects images whose file sizes deviate significantly from the average.

---


This code provides a comprehensive validation for a small image dataset like CIFAR-10, ensuring that the data is clean and ready for use in AI model training.

In [None]:
!pip install pillow
!pip install torchvision




In [None]:
import os
from torchvision import datasets, transforms
from torchvision.utils import save_image

# Create a directory to store the images
os.makedirs('cifar10_images', exist_ok=True)

# Define the transform to convert images to tensor and save them
transform = transforms.Compose([transforms.ToTensor()])

# Download and load the CIFAR-10 training dataset (the full set is downloaded; only the first 10 images are saved below)
dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)

# Save the first 10 images from the dataset
for i in range(10):
    image, label = dataset[i]
    save_image(image, f'cifar10_images/image_{i}.png')


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:10<00:00, 16041015.11it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data


In [None]:
from PIL import Image
import os
import numpy as np

# Define the path to your image dataset folder
IMAGE_DIR = "cifar10_images"

# Step 1: File Type Validation
def validate_file_types(image_dir, valid_extensions=['.jpg', '.png']):
    """
    Report directory entries whose extension is not an accepted image extension.

    The extension of each entry is lower-cased before it is compared against
    ``valid_extensions``.

    :param image_dir: Directory whose entries are inspected.
    :param valid_extensions: Accepted extensions, including the leading dot.
    :return: None. Prints the offending names or a success message.
    """
    def has_allowed_suffix(name):
        _, suffix = os.path.splitext(name)
        return suffix.lower() in valid_extensions

    rejected = [name for name in os.listdir(image_dir) if not has_allowed_suffix(name)]
    if not rejected:
        print("All files have valid image extensions.")
        return
    print(f"Invalid file types found: {rejected}")

# Step 2: Corrupted File Detection
def check_corrupted_images(image_dir):
    """
    Report directory entries that Pillow cannot open and verify.

    An entry counts as corrupted when ``Image.open`` or ``verify()`` raises
    ``IOError`` or ``SyntaxError``.

    :param image_dir: Directory whose entries are inspected.
    :return: None. Prints the corrupted names or a success message.
    """
    def passes_verify(path):
        try:
            with Image.open(path) as handle:
                handle.verify()
        except (IOError, SyntaxError):
            return False
        return True

    broken = [
        name
        for name in os.listdir(image_dir)
        if not passes_verify(os.path.join(image_dir, name))
    ]
    print(f"Corrupted files found: {broken}" if broken else "No corrupted files found.")

# Step 3: Dimension Check (exact width x height match; aspect ratio is not checked separately)
def validate_image_dimensions(image_dir, expected_size=(32, 32)):
    """
    Report images whose (width, height) differs from ``expected_size``.

    Entries that cannot be opened are skipped silently; they are not reported
    here.

    :param image_dir: Directory whose entries are inspected.
    :param expected_size: Required ``(width, height)`` tuple.
    :return: None. Prints ``(filename, size)`` pairs for mismatches or a
             success message.
    """
    mismatches = []
    for name in os.listdir(image_dir):
        try:
            with Image.open(os.path.join(image_dir, name)) as picture:
                if picture.size == expected_size:
                    continue
                mismatches.append((name, picture.size))
        except Exception:
            # Unreadable entry: leave it to the corruption check.
            continue

    if not mismatches:
        print(f"All images have the expected dimensions of {expected_size}.")
    else:
        print(f"Images with incorrect dimensions found: {mismatches}")

# Step 4: Color Mode Verification (RGB or Grayscale)
def validate_color_mode(image_dir, expected_mode='RGB'):
    """
    Report images whose Pillow mode differs from ``expected_mode``.

    Entries that cannot be opened are skipped silently.

    :param image_dir: Directory whose entries are inspected.
    :param expected_mode: Required Pillow mode string (e.g. 'RGB', 'L').
    :return: None. Prints ``(filename, mode)`` pairs for mismatches or a
             success message.
    """
    wrong_mode = []
    for name in os.listdir(image_dir):
        try:
            with Image.open(os.path.join(image_dir, name)) as picture:
                if picture.mode == expected_mode:
                    continue
                wrong_mode.append((name, picture.mode))
        except Exception:
            # Unreadable entry: leave it to the corruption check.
            continue

    if not wrong_mode:
        print(f"All images have the expected color mode: {expected_mode}.")
    else:
        print(f"Images with incorrect color mode found: {wrong_mode}")

# Step 5: Outlier Detection (by file size)
def detect_outliers_by_size(image_dir, threshold_factor=2):
    """
    Report files whose size on disk deviates strongly from the directory average.

    A file is an outlier when its size differs from the mean size by more than
    ``threshold_factor`` population standard deviations.

    The directory is listed exactly once and each name is kept together with
    its size, so names and sizes cannot get misaligned. Only regular files are
    measured (sub-directories are ignored), and an empty directory is reported
    as having no outliers without triggering numpy "mean of empty slice"
    warnings.

    :param image_dir: Directory whose files are measured.
    :param threshold_factor: Number of standard deviations beyond which a file
                             is flagged.
    :return: None. Prints ``(filename, size_in_bytes)`` pairs for outliers or a
             success message.
    """
    entries = []
    for filename in os.listdir(image_dir):
        file_path = os.path.join(image_dir, filename)
        if os.path.isfile(file_path):
            entries.append((filename, os.path.getsize(file_path)))

    if not entries:
        # Nothing to compare: avoid np.mean/np.std on an empty sequence.
        print("No outliers detected based on file size.")
        return

    sizes = [size for _, size in entries]
    avg_size = np.mean(sizes)
    std_dev = np.std(sizes)

    outliers = [
        (filename, size)
        for filename, size in entries
        if abs(size - avg_size) > threshold_factor * std_dev
    ]
    if outliers:
        print(f"Outliers found based on size: {outliers}")
    else:
        print("No outliers detected based on file size.")

# Main function to perform all validation steps
def validate_image_dataset(image_dir):
    """
    Run every image validation step on ``image_dir`` and print a banner per step.

    Steps, in order: file types, corrupted files, dimensions (32x32),
    color mode (RGB), file-size outliers.

    :param image_dir: Directory containing the image dataset.
    :return: None.
    """
    # (banner printed before the step, check to run, extra keyword arguments)
    pipeline = (
        ("Validating file types...", validate_file_types, {}),
        ("\nChecking for corrupted files...", check_corrupted_images, {}),
        ("\nValidating image dimensions...", validate_image_dimensions, {"expected_size": (32, 32)}),
        ("\nValidating image color mode...", validate_color_mode, {"expected_mode": 'RGB'}),
        ("\nDetecting outliers by file size...", detect_outliers_by_size, {}),
    )

    print("Starting image dataset validation...\n")
    for banner, check, options in pipeline:
        print(banner)
        check(image_dir, **options)

# Run the validation on your image dataset
validate_image_dataset(IMAGE_DIR)


Starting image dataset validation...

Validating file types...
All files have valid image extensions.

Checking for corrupted files...
No corrupted files found.

Validating image dimensions...
All images have the expected dimensions of (32, 32).

Validating image color mode...
All images have the expected color mode: RGB.

Detecting outliers by file size...
No outliers detected based on file size.


In [None]:
from PIL import Image
import os
import numpy as np
from torchvision.utils import save_image
import random
from collections import Counter

# Simulating Negative Test Cases
def simulate_negative_tests(image_dir):
    """
    Populate ``image_dir`` with fixtures that each violate one validation rule.

    Files written (existing files with the same name are overwritten):

    * ``corrupted_image.jpg``       - plain text behind an image extension.
    * ``invalid_image.txt``         - plain text with a non-image (.txt) extension.
    * ``wrong_dimension_image.png`` - 100x100 RGB image (expected size is 32x32).
    * ``grayscale_image.png``       - 32x32 image in mode 'L' instead of RGB.
    * ``large_image.png``           - 1000x1000 RGB image, meant as a file-size outlier.

    :param image_dir: Existing directory that receives the fixtures.
    :return: None.
    """
    def target(name):
        return os.path.join(image_dir, name)

    # Text payloads masquerading as dataset members.
    text_fixtures = (
        ('corrupted_image.jpg', "This is not a valid image file!"),
        ('invalid_image.txt', "This is a text file with an image extension!"),
    )
    for name, payload in text_fixtures:
        with open(target(name), 'w') as fh:
            fh.write(payload)

    # Real images that break the size / mode / file-size expectations.
    image_fixtures = (
        ('wrong_dimension_image.png', 'RGB', (100, 100), 'red'),
        ('grayscale_image.png', 'L', (32, 32), 128),
        ('large_image.png', 'RGB', (1000, 1000), 'blue'),
    )
    for name, mode, size, fill in image_fixtures:
        Image.new(mode, size, color=fill).save(target(name))


# Create negative test images
simulate_negative_tests('cifar10_images')

# Validation functions (same checks as in the previous cell, repeated here so this cell is self-contained)
def validate_file_types(image_dir, valid_extensions=['.jpg', '.png']):
    """
    Print the entries of ``image_dir`` whose lower-cased extension is not listed
    in ``valid_extensions``, or a confirmation when every entry is accepted.

    :param image_dir: Directory whose entries are inspected.
    :param valid_extensions: Accepted extensions, including the leading dot.
    :return: None.
    """
    names = os.listdir(image_dir)
    suffixes = [os.path.splitext(name)[1].lower() for name in names]
    offenders = [name for name, suffix in zip(names, suffixes) if suffix not in valid_extensions]

    if offenders:
        print(f"Invalid file types found: {offenders}")
        return
    print("All files have valid image extensions.")


def check_corrupted_images(image_dir):
    """
    Print the entries of ``image_dir`` for which ``Image.open`` or ``verify()``
    raises ``IOError`` or ``SyntaxError``, or a confirmation when none do.

    :param image_dir: Directory whose entries are inspected.
    :return: None.
    """
    flagged = []
    for entry in os.listdir(image_dir):
        intact = True
        try:
            with Image.open(os.path.join(image_dir, entry)) as candidate:
                candidate.verify()  # raises when Pillow rejects the file
        except (IOError, SyntaxError):
            intact = False
        if not intact:
            flagged.append(entry)

    if not flagged:
        print("No corrupted files found.")
    else:
        print(f"Corrupted files found: {flagged}")


def validate_image_dimensions(image_dir, expected_size=(32, 32)):
    """
    Print ``(filename, size)`` for every image whose size differs from
    ``expected_size``, or a confirmation when all match.

    Entries that raise while being opened are skipped without being reported.

    :param image_dir: Directory whose entries are inspected.
    :param expected_size: Required ``(width, height)`` tuple.
    :return: None.
    """
    off_size = []
    for entry in os.listdir(image_dir):
        location = os.path.join(image_dir, entry)
        try:
            with Image.open(location) as opened:
                matches = not (opened.size != expected_size)
                if not matches:
                    off_size.append((entry, opened.size))
        except Exception:
            pass  # unreadable entries are ignored by this check

    if off_size:
        print(f"Images with incorrect dimensions found: {off_size}")
        return
    print(f"All images have the expected dimensions of {expected_size}.")


def validate_color_mode(image_dir, expected_mode='RGB'):
    """
    Print ``(filename, mode)`` for every image whose Pillow mode differs from
    ``expected_mode``, or a confirmation when all match.

    Entries that raise while being opened are skipped without being reported.

    :param image_dir: Directory whose entries are inspected.
    :param expected_mode: Required Pillow mode string.
    :return: None.
    """
    off_mode = []
    for entry in os.listdir(image_dir):
        location = os.path.join(image_dir, entry)
        try:
            with Image.open(location) as opened:
                matches = not (opened.mode != expected_mode)
                if not matches:
                    off_mode.append((entry, opened.mode))
        except Exception:
            pass  # unreadable entries are ignored by this check

    if off_mode:
        print(f"Images with incorrect color mode found: {off_mode}")
        return
    print(f"All images have the expected color mode: {expected_mode}.")


def detect_outliers_by_size(image_dir, threshold_factor=2):
    """
    Report files whose size on disk deviates strongly from the directory average.

    A file is an outlier when its size differs from the mean size by more than
    ``threshold_factor`` population standard deviations.

    The directory is listed exactly once and each name is kept together with
    its size, so names and sizes cannot get misaligned. Only regular files are
    measured (sub-directories are ignored), and an empty directory is reported
    as having no outliers without triggering numpy "mean of empty slice"
    warnings.

    :param image_dir: Directory whose files are measured.
    :param threshold_factor: Number of standard deviations beyond which a file
                             is flagged.
    :return: None. Prints ``(filename, size_in_bytes)`` pairs for outliers or a
             success message.
    """
    entries = []
    for filename in os.listdir(image_dir):
        file_path = os.path.join(image_dir, filename)
        if os.path.isfile(file_path):
            entries.append((filename, os.path.getsize(file_path)))

    if not entries:
        # Nothing to compare: avoid np.mean/np.std on an empty sequence.
        print("No outliers detected based on file size.")
        return

    sizes = [size for _, size in entries]
    avg_size = np.mean(sizes)
    std_dev = np.std(sizes)

    outliers = [
        (filename, size)
        for filename, size in entries
        if abs(size - avg_size) > threshold_factor * std_dev
    ]
    if outliers:
        print(f"Outliers found based on size: {outliers}")
    else:
        print("No outliers detected based on file size.")


# Main function to perform all validation steps, including negative tests
def validate_image_dataset(image_dir):
    """
    Print a start banner, then run each image check on ``image_dir`` in turn,
    announcing every step before it runs.

    Order: file types, corrupted files, dimensions (32x32), color mode (RGB),
    file-size outliers.

    :param image_dir: Directory containing the image dataset.
    :return: None.
    """
    def announce_and_run(banner, check, **options):
        print(banner)
        check(image_dir, **options)

    print("Starting image dataset validation...\n")
    announce_and_run("Validating file types...", validate_file_types)
    announce_and_run("\nChecking for corrupted files...", check_corrupted_images)
    announce_and_run("\nValidating image dimensions...", validate_image_dimensions, expected_size=(32, 32))
    announce_and_run("\nValidating image color mode...", validate_color_mode, expected_mode='RGB')
    announce_and_run("\nDetecting outliers by file size...", detect_outliers_by_size)


# Run the validation on the dataset with negative test cases
validate_image_dataset('cifar10_images')


Starting image dataset validation...

Validating file types...
Invalid file types found: ['invalid_image.txt']

Checking for corrupted files...
Corrupted files found: ['invalid_image.txt', 'corrupted_image.jpg']

Validating image dimensions...
Images with incorrect dimensions found: [('large_image.png', (1000, 1000)), ('wrong_dimension_image.png', (100, 100))]

Validating image color mode...
Images with incorrect color mode found: [('grayscale_image.png', 'L')]

Detecting outliers by file size...
Outliers found based on size: [('large_image.png', 5213)]




---



Testing an audio dataset for AI applications is a crucial step to ensure that the data used for model training and evaluation is of high quality, valid, and representative of the real-world scenarios the AI model will encounter. Proper validation of audio data involves multiple aspects, such as verifying the file types, checking for corruption, validating the duration of the audio files, and ensuring the data adheres to the expected format.

**File Type and Corruption Checks:** The first step in audio dataset validation is to ensure that all files are in the correct format, such as `.wav`, `.mp3`, or another supported audio type. Corrupted files, which might contain incomplete or erroneous data, must also be identified and removed to prevent the model from learning incorrect patterns or failing during training.

**Duration and Quality Validation:** Audio files should also be checked for duration, ensuring they fall within acceptable length ranges that are suitable for the AI model’s requirements. Extremely short or long files may not be useful or could introduce biases into the model. Quality measures, such as checking for noise or signal distortions, can also be essential depending on the use case.

**Feature Integrity Checks:** In more advanced validation, you can inspect specific features extracted from audio, such as frequency, pitch, or spectrograms, to ensure the dataset covers a wide range of real-world scenarios. Tools like `librosa` or `pyDub` can help with these checks by loading and analyzing the audio files for features like amplitude, energy, and silence.

Testing an audio dataset helps ensure that the data is not only valid and complete but also diverse and robust enough to allow the AI model to generalize well to new, unseen data. This validation minimizes the risk of model failure in production environments and ensures better performance.

In [None]:
pip install librosa soundfile




In [None]:
import os
import librosa
import soundfile as sf
import numpy as np

# Create the 'small_audio_dataset' directory if it doesn't exist
audio_dir = 'small_audio_dataset'
if not os.path.exists(audio_dir):
    os.makedirs(audio_dir)

# Simulate a small audio dataset with one negative case (corrupted file)
def simulate_small_audio_tests(audio_dir, seed=42):
    """
    Create a two-file audio fixture: one valid clip and one corrupted file.

    * ``valid_audio.wav``     - 2 seconds of uniform noise in [-1, 1) at 44.1 kHz,
                                written with ``soundfile``.
    * ``corrupted_audio.wav`` - plain text behind a .wav extension.

    The noise comes from a seeded generator, so re-running the notebook
    produces the same clip. The target directory is created when missing.

    :param audio_dir: Directory that receives the two files.
    :param seed: Seed for the noise generator; pass ``None`` for fresh
                 randomness on every call.
    :return: None.
    """
    os.makedirs(audio_dir, exist_ok=True)

    # 1. Create a valid short audio file
    sample_rate = 44100
    duration = 2  # 2 seconds
    rng = np.random.default_rng(seed)
    audio_data = rng.uniform(-1, 1, size=sample_rate * duration)
    sf.write(os.path.join(audio_dir, 'valid_audio.wav'), audio_data, sample_rate)  # Use soundfile to write the file

    # 2. Create a corrupted audio file
    corrupted_audio_path = os.path.join(audio_dir, 'corrupted_audio.wav')
    with open(corrupted_audio_path, 'w') as f:
        f.write("This is not a valid audio file!")

# Create a small dataset with one valid file and one corrupted file
simulate_small_audio_tests(audio_dir)

# Validation Functions
def validate_audio_file_types(audio_dir, valid_extensions=['.wav', '.mp3']):
    """
    Report directory entries whose extension is not an accepted audio extension.

    The extension of each entry is lower-cased before it is compared against
    ``valid_extensions``.

    :param audio_dir: Directory whose entries are inspected.
    :param valid_extensions: Accepted extensions, including the leading dot.
    :return: None. Prints the offending names or a success message.
    """
    unexpected = []
    for entry in os.listdir(audio_dir):
        suffix = os.path.splitext(entry)[1].lower()
        if suffix in valid_extensions:
            continue
        unexpected.append(entry)

    if unexpected:
        message = f"Invalid file types found: {unexpected}"
    else:
        message = "All files have valid audio extensions."
    print(message)

def check_corrupted_audio_files(audio_dir):
    """
    Report directory entries that ``librosa.load`` cannot read.

    Any exception raised while loading marks the entry as corrupted.

    :param audio_dir: Directory whose entries are inspected.
    :return: None. Prints the corrupted names or a success message.
    """
    def loads_cleanly(path):
        try:
            librosa.load(path, sr=None)  # sr=None keeps the file's native sample rate
        except Exception:
            return False
        return True

    unreadable = [
        name
        for name in os.listdir(audio_dir)
        if not loads_cleanly(os.path.join(audio_dir, name))
    ]
    if not unreadable:
        print("No corrupted audio files found.")
    else:
        print(f"Corrupted audio files found: {unreadable}")

def validate_audio_duration(audio_dir, min_duration=1.0, max_duration=10.0):
    """
    Report audio files whose duration falls outside ``[min_duration, max_duration]``.

    Each entry is loaded with librosa at its native sample rate and measured
    with ``librosa.get_duration``. Entries that raise during loading or
    measuring are skipped silently.

    :param audio_dir: Directory whose entries are inspected.
    :param min_duration: Shortest accepted duration in seconds.
    :param max_duration: Longest accepted duration in seconds.
    :return: None. Prints ``(filename, duration)`` pairs for violations or a
             success message.
    """
    out_of_range = []
    for name in os.listdir(audio_dir):
        try:
            samples, rate = librosa.load(os.path.join(audio_dir, name), sr=None)
            seconds = librosa.get_duration(y=samples, sr=rate)
            too_short = seconds < min_duration
            too_long = seconds > max_duration
            if too_short or too_long:
                out_of_range.append((name, seconds))
        except Exception:
            # Unreadable entry: leave it to the corruption check.
            continue

    if not out_of_range:
        print(f"All audio files are within the duration limits of {min_duration}s to {max_duration}s.")
    else:
        print(f"Audio files with incorrect duration: {out_of_range}")

# Main function to perform validation on the small dataset
def validate_audio_dataset(audio_dir):
    """
    Run every audio validation step on ``audio_dir`` and print a banner per step.

    Steps, in order: file types, corrupted files, duration (1.0s to 10.0s).

    :param audio_dir: Directory containing the audio dataset.
    :return: None.
    """
    # (banner printed before the step, check to run, extra keyword arguments)
    pipeline = (
        ("Validating file types...", validate_audio_file_types, {}),
        ("\nChecking for corrupted audio files...", check_corrupted_audio_files, {}),
        ("\nValidating audio duration...", validate_audio_duration,
         {"min_duration": 1.0, "max_duration": 10.0}),
    )

    print("Starting small audio dataset validation...\n")
    for banner, check, options in pipeline:
        print(banner)
        check(audio_dir, **options)

# Run the validation on the small dataset
validate_audio_dataset(audio_dir)


Starting small audio dataset validation...

Validating file types...
All files have valid audio extensions.

Checking for corrupted audio files...


  librosa.load(file_path, sr=None)  # Try loading the file with librosa
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Corrupted audio files found: ['corrupted_audio.wav']

Validating audio duration...
All audio files are within the duration limits of 1.0s to 10.0s.


  audio_data, sample_rate = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Testing a video dataset is an essential step in ensuring the quality and reliability of data for AI applications. This validation includes checking file integrity, consistency in frame rate and resolution, verifying metadata, and ensuring the accuracy of annotations and labels. Properly validating a video dataset can prevent issues during model training, reduce noise, and ensure that the AI model performs effectively in real-world scenarios.

In [None]:
import cv2
def check_video_file(file_path):
    """
    Print whether OpenCV can open the video at ``file_path``.

    The capture handle is always released, whether or not the file opened.

    :param file_path: Path of the video file to probe.
    :return: None.
    """
    video = cv2.VideoCapture(file_path)
    try:
        if not video.isOpened():
            print(f"File {file_path} is corrupted or unsupported.")
        else:
            print(f"File {file_path} is valid.")
    finally:
        # Free the decoder/file handle held by the capture object.
        video.release()
check_video_file('/content/Wildlife Windows 7 Sample Video.mp4')

File /content/Wildlife Windows 7 Sample Video.mp4 is valid.


In [None]:
def check_frame_rate(file_path):
    """
    Return the frame rate OpenCV reports for the video at ``file_path``.

    The capture handle is released before returning.

    :param file_path: Path of the video file.
    :return: The value of ``cv2.CAP_PROP_FPS`` for the capture.
    """
    video = cv2.VideoCapture(file_path)
    try:
        return video.get(cv2.CAP_PROP_FPS)
    finally:
        video.release()
# Report the frame rate of the sample video.
print(f"Frame rate is {check_frame_rate('/content/Wildlife Windows 7 Sample Video.mp4')}")

Framme rate is 29.97002997002997


In [None]:
def check_resolution(file_path):
    """
    Return the frame size OpenCV reports for the video at ``file_path``.

    The capture handle is released before returning.

    :param file_path: Path of the video file.
    :return: ``(width, height)`` as reported by ``cv2.CAP_PROP_FRAME_WIDTH``
             and ``cv2.CAP_PROP_FRAME_HEIGHT``.
    """
    video = cv2.VideoCapture(file_path)
    try:
        width = video.get(cv2.CAP_PROP_FRAME_WIDTH)
        height = video.get(cv2.CAP_PROP_FRAME_HEIGHT)
        return width, height
    finally:
        video.release()
print( f"resolution is {check_resolution('/content/Wildlife Windows 7 Sample Video.mp4')}")

resolution is (640.0, 360.0)


In [None]:
import cv2
import os

def validate_video_dataset(video_dir, min_fps=24, min_resolution=(640, 480),
                           min_duration=1, valid_extensions=('.mp4', '.avi', '.mov')):
    """
    Validate every entry of ``video_dir`` and print one line per problem found.

    Checks per entry, in order: file extension (case-insensitive), whether
    OpenCV can open it, frame rate, resolution and duration. An entry with an
    unaccepted extension, or one that cannot be opened, is reported and skipped.

    Duration is derived as frame count / frame rate. When the reported frame
    rate is not positive the duration cannot be computed; this is reported
    instead of raising ``ZeroDivisionError``. Every opened capture is released,
    including on the "cannot open" path.

    :param video_dir: Directory whose entries are validated.
    :param min_fps: Minimum acceptable frame rate (default 24 FPS).
    :param min_resolution: Minimum acceptable ``(width, height)``.
    :param min_duration: Minimum acceptable duration in seconds.
    :param valid_extensions: Accepted lower-case extensions, including the dot.
    :return: None.
    """
    min_width, min_height = min_resolution
    accepted_suffixes = tuple(valid_extensions)

    for video_file in os.listdir(video_dir):
        file_path = os.path.join(video_dir, video_file)

        # Check file format (lower-cased so e.g. ".MP4" is accepted)
        if not file_path.lower().endswith(accepted_suffixes):
            print(f"Invalid format for {file_path}")
            continue

        video = cv2.VideoCapture(file_path)
        try:
            # Check if video is corrupted
            if not video.isOpened():
                print(f"Cannot open video file {file_path}")
                continue

            # Check frame rate
            frame_rate = video.get(cv2.CAP_PROP_FPS)
            if frame_rate < min_fps:
                print(f"Low frame rate: {frame_rate} for {file_path}")

            # Check resolution
            width = video.get(cv2.CAP_PROP_FRAME_WIDTH)
            height = video.get(cv2.CAP_PROP_FRAME_HEIGHT)
            if width < min_width or height < min_height:
                print(f"Low resolution: {width}x{height} for {file_path}")

            # Check duration
            frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
            if frame_rate > 0:
                duration = frame_count / frame_rate
                if duration < min_duration:  # Check for very short videos
                    print(f"Short video duration: {duration} seconds for {file_path}")
            else:
                # A frame rate of 0 would otherwise divide by zero.
                print(f"Cannot determine duration for {file_path}: frame rate is {frame_rate}")
        finally:
            # Runs on every exit from the try block, including `continue`.
            video.release()

# Run the validation on your video dataset
validate_video_dataset('/content')


Invalid format for /content/.config
Low resolution: 640.0x360.0 for /content/Wildlife Windows 7 Sample Video.mp4
Invalid format for /content/sample_data
