# Statistical Validation of Research Datasets

This notebook analyzes and validates the datasets generated by the research validation suites in:
- Distributed Systems Consistency
- DevOps Practices
- Security Multi-Cloud
- Telemetry and Observability

We'll perform statistical analysis on the data to validate its completeness and consistency, and to identify any anomalies.

In [None]:
# Import necessary libraries
import os
import sys
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import glob
import re

# Set better default plot styling.
# NOTE(review): the seaborn calls run after the matplotlib style is applied and
# may override parts of 'ggplot' — confirm the final look is the intended one.
plt.style.use('ggplot')
sns.set(font_scale=1.2)  # enlarge fonts by a factor of 1.2
sns.set_style("whitegrid")

# Configure pandas display options so wide frames are shown without truncation
pd.set_option('display.max_columns', None)  # no limit on displayed columns
pd.set_option('display.max_rows', 50)  # cap displayed rows at 50
pd.set_option('display.width', 1000)  # display width in characters

In [None]:
# Define the research areas based on the repository structure.
# Each name is used in two ways in this notebook: get_test_modules() treats it
# as a directory path to scan for test_*.py files, and scan_data_files() uses
# it as a substring of a data file's path to assign the file to an area.
RESEARCH_AREAS = [
    'distributed_systems_consistency',
    'devops_practices',
    'security_multi_cloud',
    'telemetry_observability'
]

# Function to get all test modules in a research area
def get_test_modules(area):
    """Return the stems of the ``test_*.py`` files located directly in *area*.

    *area* is interpreted as a directory path. When that path does not exist
    an empty list is returned. Sub-directories are not searched.
    """
    root = Path(area)
    if root.exists():
        return [module_file.stem for module_file in root.glob('test_*.py')]
    return []

# Map every research area to the test modules discovered in its directory
test_modules_by_area = {area: get_test_modules(area) for area in RESEARCH_AREAS}

print("Test modules found in research areas:")
for area in test_modules_by_area:
    area_title = area.replace('_', ' ').title()
    print(f"\n{area_title}:")
    for module in test_modules_by_area[area]:
        module_title = module.replace('test_', '').replace('_', ' ').title()
        print(f"  - {module_title}")

## Data Discovery and Loading

First, let's explore what data files are available in the research/data directory and load them for analysis.

In [None]:
# Define the data directory path.
# NOTE(review): the path is relative to the current working directory. The
# notebook text refers to "research/data", so this presumably assumes the
# kernel is started from the research folder — confirm.
DATA_DIR = Path('data')

# Function to scan and categorize data files
def scan_data_files():
    """Index every regular file found under DATA_DIR (searched recursively).

    Returns:
        An empty dict, after printing a warning, when DATA_DIR does not
        exist. Otherwise a dict with three entries:
          'all_files'    - list of all file paths found
          'by_extension' - lower-cased file suffix -> list of paths
          'by_area'      - research area name (or 'unknown') -> list of paths
    """
    if not DATA_DIR.exists():
        print(f"Warning: Data directory {DATA_DIR} does not exist!")
        return {}

    # Keep regular files only; directories are discarded
    found = [entry for entry in DATA_DIR.glob('**/*') if entry.is_file()]

    grouped_by_suffix = {}
    for entry in found:
        grouped_by_suffix.setdefault(entry.suffix.lower(), []).append(entry)

    grouped_by_area = {area: [] for area in RESEARCH_AREAS}
    grouped_by_area['unknown'] = []
    for entry in found:
        entry_text = str(entry)
        # The first area whose name occurs anywhere in the path claims the file
        owner = next((area for area in RESEARCH_AREAS if area in entry_text), 'unknown')
        grouped_by_area[owner].append(entry)

    return {
        'all_files': found,
        'by_extension': grouped_by_suffix,
        'by_area': grouped_by_area
    }

data_files_info = scan_data_files()

# Display summary of data files (an empty result means the directory was missing)
if not data_files_info:
    print("No data files found to analyze.")
else:
    file_count = len(data_files_info['all_files'])
    print(f"Found {file_count} data files in {DATA_DIR}\n")

    print("Files by extension:")
    for ext, files in data_files_info['by_extension'].items():
        print(f"  {ext}: {len(files)} files")

    print("\nFiles by research area:")
    for area, files in data_files_info['by_area'].items():
        if not files:
            continue  # Only show areas with files
        area_title = area.replace('_', ' ').title()
        print(f"  {area_title}: {len(files)} files")

## Data Loading Functions

Let's create functions to load different types of data files.

In [None]:
# Helper functions to load different data formats
def load_data_file(file_path):
    """Load a data file based on its extension.

    Args:
        file_path: Path of the file to read, as a ``pathlib.Path`` or a string.

    Returns:
        - ``.csv``, ``.jsonl``, ``.xlsx``/``.xls``, ``.parquet``: a pandas DataFrame
        - ``.json``: the parsed JSON object
        - ``.npy``: a numpy array
        - ``.txt``: a list of lines (line endings kept)
        - ``None`` for an unsupported extension or when loading fails; a
          message is printed in both cases instead of raising.
    """
    # Accept plain strings too; previously a str argument failed on `.suffix`
    file_path = Path(file_path)
    ext = file_path.suffix.lower()

    try:
        if ext == '.csv':
            return pd.read_csv(file_path)
        elif ext == '.json':
            # Read as UTF-8 explicitly rather than with the platform default encoding
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        elif ext == '.jsonl':
            return pd.read_json(file_path, lines=True)
        elif ext in ['.xlsx', '.xls']:
            return pd.read_excel(file_path)
        elif ext == '.parquet':
            return pd.read_parquet(file_path)
        elif ext == '.npy':
            return np.load(file_path)
        elif ext == '.txt':
            # Undecodable bytes are replaced so one stray character does not
            # make the whole text file unreadable
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                return f.readlines()
        else:
            print(f"Warning: Unsupported file format {ext} for {file_path}")
            return None
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not stop the scan
        print(f"Error loading {file_path}: {str(e)}")
        return None

# Load data files by research area
def load_area_data(area):
    """Load all data files for a specific research area.

    Args:
        area: Research area name, used as a key into data_files_info['by_area'].

    Returns:
        A dict mapping a dataset key to ``{'path': Path, 'data': loaded object}``.
        The key is the file stem. If that stem is already taken (same name with
        a different extension, or in another sub-directory) the full file name
        is used, and failing that the full path, so no dataset is silently
        overwritten. Files that fail to load are skipped. An empty dict is
        returned when no file scan information is available.
    """
    if not data_files_info or 'by_area' not in data_files_info:
        return {}

    area_data = {}
    for file_path in data_files_info['by_area'].get(area, []):
        data = load_data_file(file_path)
        if data is None:
            continue

        # Try progressively more specific keys until one is free
        key = file_path.stem
        if key in area_data:
            key = file_path.name
        if key in area_data:
            key = str(file_path)

        area_data[key] = {
            'path': file_path,
            'data': data
        }

    return area_data

In [None]:
# Load the datasets of every research area, reporting the count as we go
research_data = {}
for area in RESEARCH_AREAS:
    loaded = load_area_data(area)
    research_data[area] = loaded
    area_title = area.replace('_', ' ').title()
    print(f"Loaded {len(loaded)} datasets for {area_title}")

## Data Completeness Analysis

Let's check whether we have data for every test module.

In [None]:
def check_data_completeness():
    """Check if we have data for all test modules.

    A dataset belongs to a test module when the module name (without its
    leading ``test_``) occurs, case-insensitively, in the dataset key. A
    dataset that belongs to no module is reported as orphaned, so a dataset is
    never both matched and orphaned.

    Returns:
        Dict mapping each research area to:
          'module_coverage'     - module -> {'has_data': bool, 'data_files': [keys]}
          'orphaned_data'       - dataset keys matched by no module
          'coverage_percentage' - share of modules with data (0 when the area
                                  has no modules)
          'total_modules'       - number of test modules in the area
          'modules_with_data'   - number of modules with at least one dataset
    """
    prefix = 'test_'
    completeness_results = {}

    for area in RESEARCH_AREAS:
        test_modules = test_modules_by_area.get(area, [])
        area_data_keys = list(research_data.get(area, {}).keys())

        # Check for each test module if we have corresponding data
        module_coverage = {}
        matched_keys = set()
        for module in test_modules:
            # Strip only the leading prefix; str.replace would also remove a
            # 'test_' that appears in the middle of the module name
            module_name = module[len(prefix):] if module.startswith(prefix) else module
            needle = module_name.lower()
            # An empty name would match every dataset, so it matches nothing
            matching_keys = [k for k in area_data_keys if needle and needle in k.lower()]
            matched_keys.update(matching_keys)

            module_coverage[module] = {
                'has_data': len(matching_keys) > 0,
                'data_files': matching_keys
            }

        # Data files that were not matched by any test module
        orphaned_data = [k for k in area_data_keys if k not in matched_keys]

        modules_with_data = sum(1 for m in module_coverage.values() if m['has_data'])
        completeness_results[area] = {
            'module_coverage': module_coverage,
            'orphaned_data': orphaned_data,
            # max(1, ...) avoids dividing by zero for an area without modules
            'coverage_percentage': modules_with_data / max(1, len(module_coverage)) * 100,
            'total_modules': len(module_coverage),
            'modules_with_data': modules_with_data
        }

    return completeness_results

completeness_analysis = check_data_completeness()

# Print the per-area coverage figures, orphaned datasets and per-module status
print("Data Completeness Analysis:\n")

for area, results in completeness_analysis.items():
    covered = results['modules_with_data']
    total = results['total_modules']
    print(f"\n{area.replace('_', ' ').title()}:")
    print(f"  Module coverage: {results['coverage_percentage']:.1f}% ({covered}/{total} modules have data)")

    orphans = results['orphaned_data']
    if orphans:
        print(f"  Orphaned data files (no matching test module): {len(orphans)}")
        for orphan in orphans:
            print(f"    - {orphan}")

    print("  Module details:")
    for module, coverage in results['module_coverage'].items():
        module_title = module.replace('test_', '').replace('_', ' ').title()
        if coverage['has_data']:
            print(f"    - {module_title}: ✅ Has data")
            for data_file in coverage['data_files']:
                print(f"      └─ {data_file}")
        else:
            print(f"    - {module_title}: ❌ Missing data")

## Data Quality Analysis

Let's analyze the quality of the data by research area.

In [None]:
def analyze_dataset_quality(data, name):
    """Summarize basic quality metrics of one loaded dataset.

    The result always holds 'name' and 'type' (the Python type name). The
    remaining entries depend on the kind of object:
      DataFrame - rows, columns, null_percentage, column_types, memory_usage (MB)
      dict      - keys, nested_keys, plus total_entries when every value is a list
      list      - entries, entry_type ('unknown' for an empty list)
      ndarray   - shape, dtype, memory_usage (MB)
    Any other type yields only 'name' and 'type'.
    """
    bytes_per_mb = 1024 * 1024
    summary = {'name': name, 'type': type(data).__name__}

    if isinstance(data, pd.DataFrame):
        summary['rows'] = len(data)
        summary['columns'] = len(data.columns)
        # Mean over columns of the per-column fraction of nulls
        summary['null_percentage'] = data.isnull().mean().mean() * 100
        summary['column_types'] = dict(data.dtypes.astype(str))
        summary['memory_usage'] = data.memory_usage(deep=True).sum() / bytes_per_mb
        return summary

    if isinstance(data, dict):
        values = list(data.values())
        summary['keys'] = len(data)
        summary['nested_keys'] = sum(1 for value in values if isinstance(value, (dict, list)))
        if all(isinstance(value, list) for value in values):
            summary['total_entries'] = sum(len(value) for value in values)
        return summary

    if isinstance(data, list):
        summary['entries'] = len(data)
        summary['entry_type'] = type(data[0]).__name__ if data else 'unknown'
        return summary

    if isinstance(data, np.ndarray):
        summary['shape'] = data.shape
        summary['dtype'] = str(data.dtype)
        summary['memory_usage'] = data.nbytes / bytes_per_mb
        return summary

    return summary

# Compute the quality summary of every dataset, grouped by research area
quality_analysis = {
    area: [
        analyze_dataset_quality(dataset_info['data'], dataset_name)
        for dataset_name, dataset_info in research_data.get(area, {}).items()
    ]
    for area in RESEARCH_AREAS
}

# Print the summaries; areas without datasets are skipped
for area, quality_results in quality_analysis.items():
    if not quality_results:
        continue

    print(f"\n{area.replace('_', ' ').title()} Data Quality Analysis:")
    for result in quality_results:
        report_lines = [
            f"\n  Dataset: {result['name']}",
            f"  Type: {result['type']}",
        ]

        # The metrics present in the result identify the kind of dataset
        if 'rows' in result:
            report_lines += [
                f"  Rows: {result['rows']:,}",
                f"  Columns: {result['columns']}",
                f"  Null percentage: {result['null_percentage']:.2f}%",
                f"  Memory usage: {result['memory_usage']:.2f} MB",
            ]
        elif 'keys' in result:
            report_lines += [
                f"  Keys: {result['keys']}",
                f"  Nested keys: {result['nested_keys']}",
            ]
            if 'total_entries' in result:
                report_lines.append(f"  Total entries: {result['total_entries']:,}")
        elif 'entries' in result:
            report_lines += [
                f"  Entries: {result['entries']:,}",
                f"  Entry type: {result['entry_type']}",
            ]
        elif 'shape' in result:
            report_lines += [
                f"  Shape: {result['shape']}",
                f"  Data type: {result['dtype']}",
                f"  Memory usage: {result['memory_usage']:.2f} MB",
            ]

        for line in report_lines:
            print(line)

## In-Depth Analysis of Selected Datasets

Let's perform more detailed analysis of selected datasets from each research area.

In [None]:
def visualize_dataset(data, name):
    """Create visualizations for a dataset.

    Dispatches on the type of *data*:
      - DataFrame: sample rows, summary statistics, a missing-value bar chart
        (only when something is missing), a correlation heatmap (two or more
        numeric columns) and histograms of up to five numeric columns.
      - dict / list: converted to a DataFrame when the shape allows it
        (dict of equal-length lists, list of dicts); otherwise a short
        textual sample is printed.
      - numpy array: 1-D arrays and 2-D arrays with fewer than 20 columns are
        converted to a DataFrame and visualized as such.
    Other types only get the header line.

    Args:
        data: The loaded dataset.
        name: Label used in headers and plot titles.

    Returns:
        None. All output is printed or plotted.
    """
    print(f"\nVisualization for: {name}\n{'-'*80}")

    if isinstance(data, pd.DataFrame):
        _visualize_dataframe(data, name)
    elif isinstance(data, (dict, list)):
        _visualize_container(data, name)
    elif isinstance(data, np.ndarray):
        _visualize_array(data, name)


def _rich_display(obj):
    """Render *obj* with IPython's display when available, otherwise print it."""
    try:
        from IPython.display import display as ipython_display
    except ImportError:
        print(obj)
    else:
        ipython_display(obj)


def _visualize_dataframe(data, name):
    """Show a preview, summary statistics and plots for a DataFrame."""
    print("Sample data:")
    _rich_display(data.head())

    print("\nSummary statistics:")
    _rich_display(data.describe(include='all').T)

    # Missing values visualization. The figure is created only when there is
    # something to plot, so no empty figure is left open.
    missing = data.isnull().mean().sort_values(ascending=False)
    if (missing > 0).any():
        plt.figure(figsize=(12, 6))
        sns.barplot(x=missing.index, y=missing.values)
        plt.title(f'Missing Values in {name}')
        plt.xticks(rotation=90)
        plt.ylabel('Fraction missing')
        plt.tight_layout()
        plt.show()

    numeric_data = data.select_dtypes(include=[np.number])
    numeric_count = numeric_data.shape[1]

    # A correlation matrix needs at least two numeric columns
    if numeric_count >= 2:
        plt.figure(figsize=(10, 8))
        corr = numeric_data.corr()
        # Hide the upper triangle: it mirrors the lower one
        mask = np.triu(np.ones_like(corr, dtype=bool))
        sns.heatmap(corr, mask=mask, annot=True, fmt='.2f', cmap='coolwarm', square=True)
        plt.title(f'Correlation Matrix for {name}')
        plt.tight_layout()
        plt.show()

    # Distributions are meaningful for a single numeric column too (e.g. a
    # DataFrame built from a 1-D array), so they are not tied to the
    # correlation guard above.
    if numeric_count >= 1:
        max_cols = min(5, numeric_count)
        selected_cols = numeric_data.columns[:max_cols]
        plt.figure(figsize=(15, 3*max_cols))
        for i, col in enumerate(selected_cols):
            plt.subplot(max_cols, 1, i+1)
            sns.histplot(numeric_data[col].dropna(), kde=True)
            plt.title(f'Distribution of {col}')
        plt.tight_layout()
        plt.show()


def _visualize_container(data, name):
    """Visualize a dict or list, converting it to a DataFrame when possible."""
    if not data:
        # An empty container cannot be tabulated; say so instead of reporting
        # a conversion error or "uneven lengths"
        print(f"{type(data).__name__.capitalize()} is empty; nothing to visualize.")
        return

    try:
        if isinstance(data, dict):
            if all(isinstance(v, list) for v in data.values()):
                # Equal-length lists can be treated as columns
                lengths = {len(v) for v in data.values()}
                if len(lengths) == 1:
                    df = pd.DataFrame(data)
                    print("Converted dictionary to DataFrame:")
                    visualize_dataset(df, name)
                    return
                print("Dictionary has lists of uneven lengths. Showing sample:")
                for k, v in list(data.items())[:5]:
                    print(f"{k}: {v[:5]} (length: {len(v)})")
            else:
                # For nested dictionaries, show structure
                print("Dictionary structure:")
                for k, v in list(data.items())[:5]:
                    print(f"{k}: {type(v).__name__} {'of length '+str(len(v)) if hasattr(v, '__len__') else ''}")
        elif all(isinstance(item, dict) for item in data[:10]):
            # Only the first ten items are inspected to decide on the conversion
            df = pd.DataFrame(data)
            print("Converted list of dictionaries to DataFrame:")
            visualize_dataset(df, name)
            return
        else:
            print(f"List of {len(data)} items. Sample:")
            for item in data[:5]:
                print(f"  {type(item).__name__}: {item}")
    except Exception as e:
        # Best-effort: fall back to a raw sample rather than abort the notebook
        print(f"Could not convert to DataFrame: {str(e)}")
        print("Sample data:")
        if isinstance(data, dict):
            for k, v in list(data.items())[:5]:
                print(f"{k}: {v}")
        else:  # list
            for item in data[:5]:
                print(f"  {item}")


def _visualize_array(data, name):
    """Visualize a numpy array through a DataFrame when it is small enough."""
    print(f"Numpy array shape: {data.shape}, dtype: {data.dtype}")
    if data.ndim == 1 or (data.ndim == 2 and data.shape[1] < 20):
        # For 1D arrays or 2D arrays with reasonable column count
        try:
            df = pd.DataFrame(data)
            visualize_dataset(df, name)
        except Exception as e:
            # A bare `except:` here would also swallow KeyboardInterrupt
            print(f"Could not convert numpy array to DataFrame: {e}")
            print(f"Sample:\n{data[:5]}")

In [None]:
# Visualize one representative dataset per research area
for area in RESEARCH_AREAS:
    area_title = area.replace('_', ' ').title()
    area_data = research_data.get(area, {})
    if not area_data:
        print(f"\nNo datasets available for {area_title}")
        continue

    # Prefer the DataFrame with the most rows; on a tie the earliest one wins,
    # and DataFrames with zero rows are never picked here
    selected_name, selected_dataset, max_rows = None, None, 0
    for name, info in area_data.items():
        candidate = info['data']
        if not isinstance(candidate, pd.DataFrame):
            continue
        if len(candidate) <= max_rows:
            continue
        selected_name, selected_dataset, max_rows = name, candidate, len(candidate)

    # Without a usable DataFrame, fall back to the first dataset of the area
    if selected_dataset is None:
        selected_name = next(iter(area_data))
        selected_dataset = area_data[selected_name]['data']

    if selected_dataset is not None:
        separator = '=' * 80
        print(f"\n{separator}")
        print(f"Detailed Analysis for {area_title}: {selected_name}")
        print(f"{separator}")
        visualize_dataset(selected_dataset, selected_name)

## Data Consistency Analysis

This section checks for consistency between related datasets and analyzes time series if available.

In [None]:
# Check for time series data across datasets
def analyze_time_series():
    """Find the loaded DataFrames that appear to contain time information.

    A column is a time candidate when its label contains 'time' or 'date'
    (case-insensitive; this also covers 'timestamp' and 'datetime'), or when
    its dtype is a datetime type, timezone-aware or not. Matching on the label
    is a heuristic and can pick up unrelated columns such as 'runtime'.

    Returns:
        A list with one dict per matching dataset, holding 'area', 'name',
        'data' (the DataFrame itself, not a copy) and 'date_columns'
        (label-matched columns first, then dtype-matched ones).
    """
    name_hints = ('time', 'date')
    time_series_data = []

    for area, area_data in research_data.items():
        for name, info in area_data.items():
            data = info['data']
            if not isinstance(data, pd.DataFrame):
                continue

            # Column labels are not always strings (e.g. the integer labels of
            # a frame built from an array), so convert before lower-casing
            date_cols = [
                col for col in data.columns
                if any(hint in str(col).lower() for hint in name_hints)
            ]

            # 'datetimetz' adds timezone-aware columns, which 'datetime' alone misses
            typed_cols = data.select_dtypes(include=['datetime', 'datetimetz']).columns.tolist()
            date_cols.extend([col for col in typed_cols if col not in date_cols])

            if date_cols:
                time_series_data.append({
                    'area': area,
                    'name': name,
                    'data': data,
                    'date_columns': date_cols
                })

    return time_series_data

time_series_datasets = analyze_time_series()


def _summarize_time_series(ts_data):
    """Plot and resample one time-series dataset without modifying the loaded data.

    Uses the first entry of ts_data['date_columns'] as the time axis and up to
    three numeric columns as the plotted series.
    """
    time_col = ts_data['date_columns'][0]
    df = ts_data['data']

    # Ensure the column is datetime type. The conversion is done on a copy:
    # writing it back into the shared DataFrame would change the dataset seen
    # by later cells (a numeric column would stop being numeric).
    if not pd.api.types.is_datetime64_any_dtype(df[time_col]):
        df = df.copy()
        df[time_col] = pd.to_datetime(df[time_col], errors='coerce')

    # Rows whose timestamp could not be parsed cannot be placed on a time axis
    df_sorted = df.dropna(subset=[time_col]).sort_values(time_col)
    if df_sorted.empty:
        print(f"No parseable timestamps in '{time_col}'; skipping time series analysis.")
        return

    numeric_cols = df_sorted.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        return

    # Select up to 3 numeric columns to visualize
    plot_cols = numeric_cols[:3]
    plt.figure(figsize=(14, 7))
    for col in plot_cols:
        plt.plot(df_sorted[time_col], df_sorted[col], label=col)

    plt.title(f'Time Series Analysis for {ts_data["name"]}')
    plt.xlabel(time_col)
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Show basic statistics by time periods
    print("\nTime series statistics:")
    # Resample by day, week, or month depending on the covered time span
    date_range = (df_sorted[time_col].max() - df_sorted[time_col].min()).days

    if date_range > 90:  # More than 90 days
        # NOTE(review): recent pandas releases prefer 'ME' for month-end and
        # may warn on 'M'; kept for older versions — confirm the target version
        freq = 'M'
        freq_name = 'month'
    elif date_range > 14:  # More than 2 weeks
        freq = 'W'
        freq_name = 'week'
    else:
        freq = 'D'
        freq_name = 'day'

    try:
        # Create time series index
        df_ts = df_sorted.set_index(time_col)
        # Resample and show count, mean, std
        resampled = df_ts[plot_cols].resample(freq).agg(['count', 'mean', 'std'])
        print(f"Statistics by {freq_name}:")
        display(resampled)
    except Exception as e:
        print(f"Could not resample time series: {str(e)}")


# Visualize time series data if available
if time_series_datasets:
    print(f"\nFound {len(time_series_datasets)} datasets with time series data:\n")

    for ts_data in time_series_datasets:
        print(f"Dataset: {ts_data['name']} from {ts_data['area'].replace('_', ' ').title()}")
        # Column labels may be non-strings, so convert them before joining
        print(f"Time columns: {', '.join(map(str, ts_data['date_columns']))}")

        try:
            _summarize_time_series(ts_data)
        except Exception as e:
            # Best-effort: one problematic dataset must not stop the others
            print(f"Error analyzing time series: {str(e)}")
        print("\n" + "-"*80)
else:
    print("No time series datasets identified.")

## Cross-Dataset Validation

Check for relationships and consistency across datasets from the same research area.

In [None]:
def find_related_datasets(area):
    """Pair up the non-empty DataFrames of *area* that share column names.

    Every unordered pair of datasets is considered once. A pair is reported
    when it shares at least one column; its similarity is the number of
    shared columns divided by the column count of the narrower dataset.

    Returns:
        A list of dicts with 'dataset1', 'dataset2', 'common_columns' (a set)
        and 'similarity', ordered from most to least similar.
    """
    frames = [
        (name, set(info['data'].columns))
        for name, info in research_data.get(area, {}).items()
        if isinstance(info['data'], pd.DataFrame) and not info['data'].empty
    ]

    related_pairs = []
    for position, (first_name, first_cols) in enumerate(frames):
        # Pair each dataset only with the ones after it, so no pair repeats
        for second_name, second_cols in frames[position + 1:]:
            shared = first_cols & second_cols
            if not shared:
                continue
            narrower = min(len(first_cols), len(second_cols))
            related_pairs.append({
                'dataset1': first_name,
                'dataset2': second_name,
                'common_columns': shared,
                'similarity': len(shared) / narrower
            })

    return sorted(related_pairs, key=lambda pair: pair['similarity'], reverse=True)

# Analyze relationships between datasets in each area
for area in RESEARCH_AREAS:
    related_pairs = find_related_datasets(area)
    if not related_pairs:
        continue

    print(f"\n{area.replace('_', ' ').title()} - Related Dataset Pairs:")
    for pair in related_pairs:
        # Sort by the text form so mixed-type column labels are still ordered,
        # and so the join column chosen below is the same on every run
        common_columns = sorted(pair['common_columns'], key=str)

        print(f"  {pair['dataset1']} ↔ {pair['dataset2']}")
        print(f"    Similarity: {pair['similarity']*100:.1f}%")
        print(f"    Common columns: {', '.join(map(str, common_columns))}")

        # Check if datasets can be joined
        if common_columns:
            data1 = research_data[area][pair['dataset1']]['data']
            data2 = research_data[area][pair['dataset2']]['data']

            # Pick the first common column (in sorted order) for the join example
            join_col = common_columns[0]

            # Compare values as strings. Missing values are dropped first:
            # otherwise NaN on both sides would count as a shared value 'nan'
            # and the overlap could exceed 100%.
            values1 = set(data1[join_col].dropna().astype(str))
            values2 = set(data2[join_col].dropna().astype(str))
            common_values = values1 & values2
            overlap = len(common_values) / max(1, min(len(values1), len(values2)))

            print(f"    Join analysis on '{join_col}':")
            print(f"      Dataset 1 unique values: {len(values1):,}")
            print(f"      Dataset 2 unique values: {len(values2):,}")
            print(f"      Common values: {len(common_values):,} (Overlap: {overlap*100:.1f}%)")
        print()

## Statistical Tests and Validation

Perform statistical tests to validate data correctness and detect anomalies.

In [None]:
from scipy import stats

def statistical_validation(area, *, min_samples=8, shapiro_max_samples=5000,
                           alpha=0.05, outlier_threshold_pct=5,
                           autocorr_min_samples=30, autocorr_threshold=0.3,
                           random_state=0):
    """Perform statistical validations on datasets in the area.

    For every numeric column with at least *min_samples* non-null values of
    every non-empty DataFrame in *area*, three checks are run:
      1. normality (Shapiro-Wilk on at most *shapiro_max_samples* values)
      2. outliers (values beyond 1.5 * IQR from the quartiles)
      3. lag-1 autocorrelation (only with more than *autocorr_min_samples* values)

    Args:
        area: Research area name, used as a key into research_data.
        min_samples: Minimum number of non-null values needed to test a column.
        shapiro_max_samples: Upper bound on the sample passed to Shapiro-Wilk.
        alpha: Significance level; p-values above it count as normal.
        outlier_threshold_pct: Outlier percentage above which a column is flagged.
        autocorr_min_samples: Columns need more values than this for check 3.
        autocorr_threshold: Absolute lag-1 autocorrelation above which a column
            is flagged.
        random_state: Seed for the Shapiro-Wilk sub-sample, so repeated runs
            report the same p-values.

    Returns:
        A list of ``{'name': dataset, 'tests': [per-column results]}``. Each
        per-column result has 'column' plus 'normality', 'outliers' and,
        when computed, 'autocorrelation'. A check that raised is recorded as
        ``{'error': message}``.
    """
    area_data = research_data.get(area, {})
    results = []

    for name, info in area_data.items():
        data = info['data']
        if not isinstance(data, pd.DataFrame) or data.empty:
            continue

        # Select numeric columns for statistical tests
        numeric_cols = data.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) == 0:
            continue

        dataset_results = {'name': name, 'tests': []}

        for col in numeric_cols:
            col_data = data[col].dropna()
            if len(col_data) < min_samples:  # Need minimum sample size for tests
                continue

            col_results = {'column': col}

            # 1. Check for normality (Shapiro-Wilk test)
            try:
                # Sub-sample large columns; a fixed seed keeps the result reproducible
                sample_size = min(shapiro_max_samples, len(col_data))
                sample = col_data.sample(sample_size, random_state=random_state).values
                shapiro_stat, shapiro_p = stats.shapiro(sample)
                col_results['normality'] = {
                    'test': 'Shapiro-Wilk',
                    'statistic': float(shapiro_stat),
                    'p_value': float(shapiro_p),
                    'is_normal': bool(shapiro_p > alpha)
                }
            except Exception as e:
                col_results['normality'] = {'error': str(e)}

            # 2. Check for outliers using IQR method
            try:
                q1 = col_data.quantile(0.25)
                q3 = col_data.quantile(0.75)
                iqr = q3 - q1
                lower_bound = q1 - 1.5 * iqr
                upper_bound = q3 + 1.5 * iqr
                outliers = col_data[(col_data < lower_bound) | (col_data > upper_bound)]
                outlier_percent = len(outliers) / len(col_data) * 100

                col_results['outliers'] = {
                    'count': len(outliers),
                    'percentage': outlier_percent,
                    'has_many_outliers': outlier_percent > outlier_threshold_pct
                }
            except Exception as e:
                col_results['outliers'] = {'error': str(e)}

            # 3. Autocorrelation for potential time series
            if len(col_data) > autocorr_min_samples:  # Need sufficient data points
                try:
                    # Lag-1 autocorrelation in stored row order
                    autocorr = float(col_data.autocorr(lag=1))
                    col_results['autocorrelation'] = {
                        'lag_1': autocorr,
                        # A NaN result (e.g. constant column) compares False here
                        'has_autocorrelation': abs(autocorr) > autocorr_threshold
                    }
                except Exception as e:
                    col_results['autocorrelation'] = {'error': str(e)}

            dataset_results['tests'].append(col_results)

        results.append(dataset_results)

    return results

# Run the statistical validation once per research area
stats_results = {area: statistical_validation(area) for area in RESEARCH_AREAS}

# Print the outcome of each check; checks that errored are left out
for area, results in stats_results.items():
    if not results:
        continue

    print(f"\n{area.replace('_', ' ').title()} - Statistical Validation Results:")

    for dataset in results:
        print(f"\n  Dataset: {dataset['name']}")

        for test in dataset['tests']:
            print(f"    Column: {test['column']}")

            norm = test.get('normality')
            if norm is not None and 'error' not in norm:
                verdict = 'Normal' if norm['is_normal'] else 'Non-normal'
                print(f"      Normality: {verdict} (p={norm['p_value']:.4f})")

            out = test.get('outliers')
            if out is not None and 'error' not in out:
                flag = " - MANY OUTLIERS" if out.get('has_many_outliers', False) else ""
                print(f"      Outliers: {out['count']:,} ({out['percentage']:.2f}%){flag}")

            auto = test.get('autocorrelation')
            if auto is not None and 'error' not in auto:
                flag = " - SIGNIFICANT" if auto.get('has_autocorrelation', False) else ""
                print(f"      Autocorrelation: {auto['lag_1']:.2f}{flag}")

## Summary and Recommendations

This section summarizes our findings about the datasets.

In [None]:
def generate_summary_report():
    """Generate a summary report of the data validation.

    Prints, from the results computed earlier in the notebook (research_data,
    completeness_analysis, stats_results, time_series_datasets):
      - the number of datasets per research area,
      - test-module coverage per area,
      - up to five data quality issues per area (many outliers, non-normal
        distributions),
      - a list of recommendations, numbered consecutively; a recommendation
        is listed only when it has at least one concrete item.

    Returns:
        None. The report is printed.
    """
    def pretty(area_name):
        # 'security_multi_cloud' -> 'Security Multi Cloud'
        return area_name.replace('_', ' ').title()

    def module_label(module):
        # Drop only the leading 'test_'; str.replace would also remove inner ones
        return module[len('test_'):] if module.startswith('test_') else module

    print("\n" + "="*80)
    print("DATA VALIDATION SUMMARY REPORT")
    print("="*80)

    # Count datasets by area
    datasets_by_area = {area: len(data) for area, data in research_data.items()}
    total_datasets = sum(datasets_by_area.values())

    print(f"\nTotal datasets analyzed: {total_datasets}")
    for area, count in datasets_by_area.items():
        if count > 0:
            print(f"  {pretty(area)}: {count} datasets")

    # Data completeness summary
    print("\nData Completeness:")
    for area, results in completeness_analysis.items():
        if results['total_modules'] > 0:
            print(f"  {pretty(area)}: {results['coverage_percentage']:.1f}% coverage "
                  f"({results['modules_with_data']}/{results['total_modules']} test modules have data)")

    # Data quality issues
    print("\nData Quality Issues:")
    for area, results in stats_results.items():
        if not results:
            # Nothing was validated, so "no issues" would be falsely reassuring
            print(f"  {pretty(area)}: No numeric datasets were validated")
            continue

        issues = []
        for dataset in results:
            for test in dataset['tests']:
                outliers = test.get('outliers', {})
                if 'error' not in outliers and outliers.get('has_many_outliers', False):
                    issues.append(f"High outliers in {dataset['name']} - {test['column']} ({outliers['percentage']:.2f}%)")
                normality = test.get('normality', {})
                if 'error' not in normality and 'is_normal' in normality and not normality['is_normal']:
                    issues.append(f"Non-normal distribution in {dataset['name']} - {test['column']}")
        if issues:
            print(f"\n  {pretty(area)}:")
            for issue in issues[:5]:  # Show top 5 issues
                print(f"    - {issue}")
            if len(issues) > 5:
                print(f"    - ... and {len(issues) - 5} more issues")
        else:
            print(f"  {pretty(area)}: No major issues detected")

    # Recommendations are collected first and numbered afterwards, so the
    # numbering has no gaps and no heading is printed without items.
    print("\nRecommendations:")
    recommendations = []

    # Modules without data. Areas with no test modules add nothing here even
    # though their coverage percentage is 0.
    missing_lines = []
    for area, results in completeness_analysis.items():
        missing_modules = [module for module, coverage in results['module_coverage'].items()
                           if not coverage['has_data']]
        if missing_modules:
            missing_lines.append(f"{pretty(area)}: {', '.join(module_label(m) for m in missing_modules)}")
    if missing_lines:
        recommendations.append(("Generate missing test data for modules:", missing_lines))

    # Time series datasets
    if time_series_datasets:
        ts_lines = [f"{ts['name']} in {pretty(ts['area'])}" for ts in time_series_datasets[:3]]  # Top 3
        recommendations.append(("Consider time-based analysis for the following datasets:", ts_lines))

    # Potential joins
    join_lines = []
    for area in RESEARCH_AREAS:
        related_pairs = find_related_datasets(area)
        if related_pairs:
            top_pair = related_pairs[0]
            # key=str keeps sorting and joining safe for non-string column labels
            join_cols = ', '.join(map(str, sorted(top_pair['common_columns'], key=str)[:2]))
            join_lines.append(f"{pretty(area)}: Join {top_pair['dataset1']} with {top_pair['dataset2']} on '{join_cols}'")
    if not join_lines:
        join_lines.append("No closely related datasets identified.")
    recommendations.append(("Explore relationships between datasets:", join_lines))

    for number, (heading, lines) in enumerate(recommendations, start=1):
        print(f"  {number}. {heading}")
        for line in lines:
            print(f"     - {line}")

generate_summary_report()

## Next Steps

Based on the analysis, consider the following next steps:

1. Address any data completeness issues by generating test data for modules without data
2. Investigate statistical anomalies identified in the analysis
3. Consider deeper analysis of time series data if relevant to your research
4. Explore potential relationships between datasets through joins where appropriate
5. Review test modules with low data coverage to ensure they are functioning correctly