In [2]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

In [None]:
# Method 1: Loop through all parquet files and inspect them.
#
# The original author's local checkout is tried first; when that folder is
# absent (i.e. on any other machine) the cell falls back to the relative
# '../data' folder instead of stopping at "Directory not found".
local_data_dir = r'C:\Users\YTUN\source\New folder\pv-forecast-mlops\data'
data_dir = local_data_dir if os.path.exists(local_data_dir) else '../data'

print("🔍 EXPLORING ALL PARQUET FILES")
print("="*80)

if os.path.exists(data_dir):
    # Sorted so the "FILE i" numbering is stable between runs
    # (os.listdir returns entries in arbitrary order).
    parquet_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.parquet'))

    if parquet_files:
        print(f"Found {len(parquet_files)} parquet files in the directory\n")

        for i, file in enumerate(parquet_files, 1):
            file_path = os.path.join(data_dir, file)

            try:
                # NOTE: this loads the whole file into memory just to preview it.
                df = pd.read_parquet(file_path)

                print(f"📄 FILE {i}: {file}")
                print("-" * 50)
                print(f"📊 Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
                print(f"📋 Columns: {list(df.columns)}")
                print(f"🔢 Data Types:\n{df.dtypes.to_string()}")
                print("\n👀 FIRST 10 ROWS:")
                print(df.head(10).to_string())
                print("\n" + "="*80 + "\n")

            except Exception as e:
                # Best effort: report the unreadable file and move on to the next one.
                print(f"❌ Error reading {file}: {e}")
                print("="*80 + "\n")
    else:
        # Nothing matched the .parquet suffix; show what is there to help debugging.
        print(f"No parquet files found in {data_dir}")
        print("Available files:", os.listdir(data_dir))
else:
    print(f"❌ Directory not found: {data_dir}")
    print("Please check if the path exists.")

Dataset Shape: (42616800, 3)

Column Names:
['unique_id', 'timestamp', 'y']

Data Types:
unique_id          category
timestamp    datetime64[ns]
y                   float64
dtype: object

First 5 rows:
          unique_id           timestamp    y
0  6f8872976c8fe471 2014-01-01 01:00:00  0.0
1  6f8872976c8fe471 2014-01-01 01:15:00  0.0
2  6f8872976c8fe471 2014-01-01 01:30:00  0.0
3  6f8872976c8fe471 2014-01-01 01:45:00  0.0
4  6f8872976c8fe471 2014-01-01 02:00:00  0.0

Basic Statistics:
                           timestamp             y
count                       42616800  3.980491e+07
mean   2014-05-25 00:50:55.879370240  7.886642e-02
min              2010-06-30 23:30:00 -6.125488e-01
25%              2012-07-09 20:22:30  0.000000e+00
50%              2014-07-12 16:15:00  0.000000e+00
75%              2016-04-06 20:30:00  6.089245e-02
max              2018-01-01 00:45:00  1.057635e+02
std                              NaN  3.146534e-01
                           timestamp             y

In [None]:
# Method 3: Check parquet file without loading entire dataset (for large files)
# This is useful for very large parquet files
#
# `parquet_file` is expected to be set by an earlier cell. It is looked up
# defensively so that a fresh kernel prints the hint in the else-branch
# instead of dying with a NameError.
sample_path = globals().get('parquet_file')

if sample_path and os.path.exists(sample_path):
    # pd.read_parquet has no `nrows` option, so stream just the first record
    # batch (at most 5 rows) through pyarrow instead of reading the whole file.
    sample_pf = pq.ParquetFile(sample_path)
    first_batch = next(sample_pf.iter_batches(batch_size=5), None)
    if first_batch is not None:
        df_sample = first_batch.to_pandas()
    else:
        # File has a schema but no rows: keep the columns, show an empty frame.
        df_sample = sample_pf.schema_arrow.empty_table().to_pandas()
    print("Sample data (first 5 rows):")
    print(df_sample)

    # Read specific columns only
    # columns_to_read = ['column1', 'column2']  # Specify columns you want
    # df_subset = pd.read_parquet(parquet_file, columns=columns_to_read)

    # Check memory usage (DataFrame.info prints its report and returns None,
    # so there is nothing worth assigning).
    df_sample.info(memory_usage='deep')
else:
    print("Update the parquet_file path in the first cell")

In [None]:
# Method 4: Check multiple parquet files in a directory.
# Reports size on disk, row count and column count for each file using only
# the parquet footer metadata, so no row data is loaded.
data_dir = '../data'

if os.path.exists(data_dir):
    # List all parquet files in the data directory; sorted because os.listdir
    # returns entries in arbitrary order.
    parquet_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.parquet'))

    print(f"Found {len(parquet_files)} parquet files in {data_dir}:")

    for file in parquet_files:
        file_path = os.path.join(data_dir, file)

        try:
            # Inside the try so one unreadable entry does not abort the listing.
            file_size = os.path.getsize(file_path) / (1024*1024)  # Size in MB

            # read_metadata returns only the footer metadata, so no ParquetFile
            # reader object is kept around per file.
            metadata = pq.read_metadata(file_path)
            num_rows = metadata.num_rows
            num_cols = metadata.num_columns

            print(f"\n📄 {file}:")
            print(f"   Size: {file_size:.2f} MB")
            print(f"   Rows: {num_rows:,}")
            print(f"   Columns: {num_cols}")

        except Exception as e:
            # Best effort: report the problem file and continue with the rest.
            print(f"\n❌ Error reading {file}: {e}")
else:
    print(f"Directory not found: {data_dir}")
    # Show sibling directories of the working directory to help fix the path.
    print("Available directories:", [d for d in os.listdir('..') if os.path.isdir(os.path.join('..', d))])