# Toy Dataset Generator

This notebook contains functions to generate toy datasets with numerical and categorical values, along with functionality to introduce null values.

In [8]:
import numpy as np
import pandas as pd
import random
import csv

In [9]:
def generate_numerical_data(n_samples, continuous=True, mean=0, std=1, low=0, high=100):
    """
    Draw a 1-D array of random numbers, either real-valued or integer-valued.

    Parameters:
    - n_samples: how many values to draw
    - continuous: truthy -> sample from a normal distribution;
      falsy -> sample uniformly distributed integers
    - mean: centre of the normal distribution (continuous mode only)
    - std: standard deviation of the normal distribution (continuous mode only)
    - low: smallest integer that can be drawn (discrete mode only, inclusive)
    - high: end of the integer range (discrete mode only, exclusive, so the
      largest value that can be drawn is high - 1)

    Returns:
    - numpy array of length n_samples
    """
    # Discrete branch first; np.random.randint samples from [low, high).
    if not continuous:
        return np.random.randint(low, high, size=n_samples)

    return np.random.normal(loc=mean, scale=std, size=n_samples)

In [10]:
def generate_categorical_data(n_samples, categories=None, probabilities=None):
    """
    Sample category labels at random, with replacement.

    Parameters:
    - n_samples: how many labels to draw
    - categories: candidate labels; falls back to ['A', 'B', 'C'] when None
    - probabilities: one weight per category, summing to 1; when None every
      category is given the same weight

    Returns:
    - numpy array of n_samples labels
    """
    labels = ['A', 'B', 'C'] if categories is None else categories

    if probabilities is None:
        # Equal weight for every label.
        n_labels = len(labels)
        weights = [1 / n_labels] * n_labels
    else:
        weights = probabilities

    return np.random.choice(labels, size=n_samples, p=weights)

In [11]:
def introduce_nulls(data, null_probability=0.1):
    """
    Return a copy of the data in which randomly chosen entries are replaced by NaN.

    Each element is nulled independently with probability `null_probability`.
    The input is never modified.

    NaN can only be stored in float (or object) arrays, so the copy is
    promoted when needed:
    - integer input is converted to float (otherwise assigning NaN raises
      ValueError)
    - boolean, string and bytes input is converted to object dtype (otherwise
      NaN would be silently coerced to True or to the text 'nan', which
      null checks such as isnull() do not detect)
    - other dtypes, including float and object, are left as they are

    Parameters:
    - data: array-like data to introduce nulls into
    - null_probability: probability of converting a value to null

    Returns:
    - numpy array with the same shape as the input, containing np.nan at the
      selected positions
    """
    # np.array() copies its input, so the caller's data stays untouched.
    values = np.array(data)

    kind = values.dtype.kind
    if kind in 'iu':
        # Signed/unsigned integers cannot hold NaN.
        values = values.astype(float)
    elif kind in 'bSU':
        # Keep the original Python values and make room for a real NaN.
        values = values.astype(object)

    mask = np.random.random(values.shape) < null_probability
    values[mask] = np.nan
    return values

In [14]:
def create_toy_dataset(n_samples=1000, include_nulls=True):
    """
    Assemble a toy DataFrame with four columns of mixed types.

    Columns:
    - Age: random integers drawn with low=18, high=90
    - Income: normally distributed values (mean 50000, std 20000)
    - Occupation: one of 'Student', 'Employed', 'Self-Employed', 'Retired'
    - Satisfaction: random integers drawn with low=1, high=6

    Parameters:
    - n_samples: number of rows to generate
    - include_nulls: when truthy, null values are injected into the
      'Income' column only

    Returns:
    - pandas DataFrame with n_samples rows
    """
    # Dict entries are evaluated top to bottom, so the random draws happen
    # in the order Age, Income, Occupation, Satisfaction.
    columns = {
        'Age': generate_numerical_data(n_samples, continuous=False, low=18, high=90),
        'Income': generate_numerical_data(n_samples, continuous=True, mean=50000, std=20000),
        'Occupation': generate_categorical_data(
            n_samples,
            categories=['Student', 'Employed', 'Self-Employed', 'Retired'],
        ),
        'Satisfaction': generate_numerical_data(n_samples, continuous=False, low=1, high=6),
    }
    frame = pd.DataFrame(columns)

    if not include_nulls:
        return frame

    # Only the income column receives missing values.
    frame['Income'] = introduce_nulls(frame['Income'])
    return frame

In [15]:
# Build a 1000-row toy dataset with null injection enabled
toy_df = create_toy_dataset(n_samples=1000, include_nulls=True)

# Overall dimensions, then a preview of the leading rows
print("Dataset shape:", toy_df.shape)
print("\nFirst few rows:")
display(toy_df.head())

# Missing-value count for every column
print("\nNull values per column:")
display(toy_df.isna().sum())

# The same counts expressed as a percentage of the row count
print("\nPercentage of null values per column:")
display(toy_df.isna().sum().div(toy_df.shape[0]).mul(100).round(2))

Dataset shape: (1000, 4)

First few rows:


Unnamed: 0,Age,Income,Occupation,Satisfaction
0,75,46924.37814,Employed,4
1,55,61472.191362,Self-Employed,2
2,44,58182.603064,Student,5
3,82,35002.200578,Employed,2
4,52,51165.329345,Self-Employed,2



Null values per column:


Age              0
Income          94
Occupation       0
Satisfaction     0
dtype: int64


Percentage of null values per column:


Age             0.0
Income          9.4
Occupation      0.0
Satisfaction    0.0
dtype: float64

In [17]:
# Save the generated dataset as toy.csv (relative path, so it is written to
# the current working directory); index=False leaves the row index out of the file.
toy_df.to_csv('toy.csv', index=False)