# In [None]:
import gc
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import pandas as pd
import seaborn as sns
from scipy import stats
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Lift the pandas display limits (a value of None removes the cap) so printed
# frames show every column at full width.
for display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Plot defaults: stock matplotlib style with the seaborn "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

print("=== EMISSIONS 2003 DATASET - EXPLORATORY DATA ANALYSIS ===\n")

# Try to read the whole workbook into memory. `full_dataset_loaded` records
# whether that worked; it becomes True only after the load and its short
# report have both completed, and it selects the analysis branch below.
full_dataset_loaded = False
try:
    df = pd.read_excel('emissions_2003.xlsx')

    print("Successfully loaded full dataset into memory!")
    print(f"Dataset shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / (1024 * 1024):.2f} MB")

    full_dataset_loaded = True
except Exception as e:
    # Any failure falls back to chunk-based processing; running out of
    # memory gets its own, shorter message.
    if isinstance(e, MemoryError):
        print("Memory error - switching to chunk-based processing")
    else:
        print(f"Error loading full dataset: {e}")
        print("Switching to chunk-based processing")

if full_dataset_loaded:
    # In-memory path: the whole workbook is available as `df`, so every
    # summary in this branch is computed directly on the full DataFrame.

    print("First 5 rows:")
    print(df.head())
    print()

    print("Data types and non-null counts:")
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() appended a stray "None" line to the output.
    df.info()
    print()

    # Per-column missing-value counts and their share of all rows. The counts
    # are computed once and reused for the percentage column.
    null_counts = df.isnull().sum()
    missing_data = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing Percentage': (null_counts / len(df)) * 100
    })
    # Report only the columns that actually have gaps, worst first.
    missing_data = missing_data[missing_data['Missing Count'] > 0].sort_values('Missing Count', ascending=False)

    if not missing_data.empty:
        print("Missing values by column:")
        print(missing_data)
    else:
        print("No missing values found!")
    print()

    # Number of rows that are exact repeats of an earlier row.
    duplicates = df.duplicated().sum()
    print(f"Duplicate rows: {duplicates}")
    print()

    # Split the columns by dtype; both lists are reused by later sections.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

    print(f"Numeric columns ({len(numeric_cols)}): {numeric_cols}")
    print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")
    print()

    if numeric_cols:
        print("Statistical summary for numeric columns:")
        print(df[numeric_cols].describe())
        print()
    
    # ---- Emissions -------------------------------------------------------
    print("EMISSIONS ANALYSIS", "=" * 50, sep="\n")

    emission_cols = ['ECO2', 'ECO', 'ECH4', 'EPM2.5']
    # Keep only the emission columns that exist in this workbook.
    available_emission_cols = [name for name in emission_cols if name in df.columns]

    if available_emission_cols:
        print("Emission variables summary:")
        print(df[available_emission_cols].describe(), end="\n\n")

        # Zero and negative readings are counted separately per column.
        for col in available_emission_cols:
            zero_count = df[col].eq(0).sum()
            negative_count = df[col].lt(0).sum()
            print(f"{col}: {zero_count} zero values, {negative_count} negative values")
        print()

    # ---- Geography -------------------------------------------------------
    print("GEOGRAPHICAL DISTRIBUTION", "=" * 50, sep="\n")

    # The bounding box is reported only when both coordinate columns exist.
    if all(axis in df.columns for axis in ('longitude', 'latitude')):
        print("Geographical bounds:")
        for label, axis in (("Longitude", 'longitude'), ("Latitude", 'latitude')):
            print(f"{label} range: {df[axis].min():.4f} to {df[axis].max():.4f}")
        print()

    # ---- Time ------------------------------------------------------------
    print("TEMPORAL ANALYSIS", "=" * 50, sep="\n")

    if 'year' in df.columns:
        print("Year distribution:")
        year_counts = df['year'].value_counts()
        print(year_counts.sort_index(), end="\n\n")

    if 'doy' in df.columns:
        doy_values = df['doy']
        print("Day of year statistics:")
        print(f"DOY range: {doy_values.min()} to {doy_values.max()}")
        print(f"Mean DOY: {doy_values.mean():.1f}")
        print()

    # ---- Categorical variables -------------------------------------------
    print("CATEGORICAL VARIABLES ANALYSIS", "=" * 50, sep="\n")

    categorical_vars = ['covertype', 'fuelcode', 'fuel_moisture_class', 'burn_source', 'burnday_source']
    available_categorical = [name for name in categorical_vars if name in df.columns]

    # Print the ten most frequent categories per column, plus a count of the
    # categories left out.
    for col in available_categorical:
        print(f"\n{col.upper()} distribution:")
        value_counts = df[col].value_counts()
        print(value_counts.head(10))  # Show top 10 categories
        hidden_categories = len(value_counts) - 10
        if hidden_categories > 0:
            print(f"... and {hidden_categories} more categories")
    print()

    # ---- Fire characteristics --------------------------------------------
    print("FIRE CHARACTERISTICS", "=" * 50, sep="\n")

    fire_vars = ['area_burned', 'prefire_fuel', 'consumed_fuel', 'cwd_frac', 'duff_frac']
    available_fire_vars = [name for name in fire_vars if name in df.columns]

    if available_fire_vars:
        print("Fire characteristics summary:")
        print(df[available_fire_vars].describe(), end="\n\n")

    # ---- Outliers (header only; the counts are printed further down) -----
    print("OUTLIER DETECTION", "=" * 50, sep="\n")
    
    def detect_outliers_iqr(data, column, multiplier=1.5):
        """Count the values of ``data[column]`` that fall outside the IQR fences.

        A value is an outlier when it lies below ``Q1 - multiplier * IQR`` or
        above ``Q3 + multiplier * IQR``, where Q1 and Q3 are the 25th and 75th
        percentiles of the column and ``IQR = Q3 - Q1``.

        Args:
            data: DataFrame holding the column.
            column: Label of the column to inspect.
            multiplier: Fence width in units of IQR; the default 1.5 keeps the
                behaviour of the original two-argument call.

        Returns:
            The number of outlying values as a plain ``int``.
        """
        values = data[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        fence = multiplier * (q3 - q1)
        # Summing the boolean mask gives the count without building a
        # filtered copy of the whole frame just to take its length.
        outside = (values < q1 - fence) | (values > q3 + fence)
        return int(outside.sum())
    
    # Columns checked for IQR outliers, limited to those present in the data.
    key_vars = ['area_burned', 'ECO2', 'ECO', 'ECH4', 'EPM2.5']
    available_key_vars = [name for name in key_vars if name in df.columns]

    print("Outlier count (using IQR method):")
    for col in available_key_vars:
        outlier_count = detect_outliers_iqr(df, col)
        # Share of all rows, expressed as a percentage.
        outlier_pct = (outlier_count / len(df)) * 100
        print(f"{col}: {outlier_count} outliers ({outlier_pct:.2f}%)")
    print()

    print("CORRELATION ANALYSIS", "=" * 50, sep="\n")

    if len(numeric_cols) > 1:
        corr_matrix = df[numeric_cols].corr()
        labels = corr_matrix.columns

        # Walk the strict upper triangle so each unordered pair is visited
        # once, keeping the pairs whose absolute correlation exceeds 0.7.
        high_corr_pairs = [
            (labels[i], labels[j], corr_matrix.iloc[i, j])
            for i in range(len(labels))
            for j in range(i + 1, len(labels))
            if abs(corr_matrix.iloc[i, j]) > 0.7
        ]

        if not high_corr_pairs:
            print("No highly correlated variable pairs found (|r| > 0.7)")
        else:
            print("Highly correlated variable pairs (|r| > 0.7):")
            for var1, var2, corr in high_corr_pairs:
                print(f"{var1} - {var2}: {corr:.3f}")
        print()

else:
    
    file_size = os.path.getsize('emissions_2003.xlsx') / (1024**2)
    print(f"File size: {file_size:.2f} MB")
    
    header_df = pd.read_excel('emissions_2003.xlsx', nrows=0, engine='openpyxl')
    sample_df = pd.read_excel('emissions_2003.xlsx', nrows=1000, engine='openpyxl')
    
    print(f"Number of columns: {len(header_df.columns)}")
    print(f"Column names: {list(header_df.columns)}")
    print(f"\nSample data types:")
    print(sample_df.dtypes)
    print(f"\nFirst 3 rows:")
    print(sample_df.head(3))
    
    dtype_dict = {}
    float32_cols = ['longitude', 'latitude', 'area_burned', 'prefire_fuel', 'consumed_fuel', 
                    'ECO2', 'ECO', 'ECH4', 'EPM2.5', 'cwd_frac', 'duff_frac']
    int16_cols = ['year', 'doy', 'grid10k', 'covertype', 'fuelcode', 'fuel_moisture_class']
    int8_cols = ['burn_source', 'burnday_source', 'BSEV_flag']
    
    for col in header_df.columns:
        if col in float32_cols:
            dtype_dict[col] = 'float32'
        elif col in int16_cols:
            dtype_dict[col] = 'int16'
        elif col in int8_cols:
            dtype_dict[col] = 'int8'
        else:
            dtype_dict[col] = 'object'
    
    try:
        wb = openpyxl.load_workbook('emissions_2003.xlsx', read_only=True)
        sheet = wb.active
        total_rows = sheet.max_row - 1 if sheet.max_row is not None else float('inf')
        wb.close()
    except Exception as e:
        print(f"Could not determine row count, will process until end: {e}")
        total_rows = float('inf')
    
    chunk_size = 25000
    
    chunk_stats = []
    missing_stats = {}
    categorical_stats = {}
    numeric_stats = {}
    
    for col in header_df.columns:
        missing_stats[col] = 0
        if dtype_dict.get(col) == 'object':
            categorical_stats[col] = {}
        elif dtype_dict.get(col) in ['float32', 'int16', 'int8']:
            numeric_stats[col] = {
                'count': 0, 'sum': 0, 'sum_sq': 0, 'min': float('inf'), 'max': float('-inf'),
                'zero_count': 0, 'negative_count': 0
            }
    
    print(f"\nProcessing Excel data in chunks of {chunk_size:,} rows...")

    # Integer-coded columns that the categorical report in this branch looks
    # up by name in categorical_stats. They are read with integer dtypes, so
    # the dtype == 'object' test alone never tallied them and that report
    # stayed empty; they are counted explicitly here.
    coded_categorical_cols = ('covertype', 'fuelcode', 'fuel_moisture_class',
                              'burn_source', 'burnday_source')

    chunk_count = 0
    total_processed = 0
    # 1-based index of the next unread data row (sheet row 0 is the header).
    start_row = 1

    # NOTE(review): every pd.read_excel call re-opens the workbook and has to
    # step over the skipped rows again, so the cost per chunk presumably grows
    # with the offset -- consider streaming via openpyxl for very large files.
    while start_row <= total_rows:
        try:
            # Skip the data rows already consumed (sheet rows 1..start_row-1)
            # and keep row 0 as the header. The previous upper bound of
            # start_row + 1 also skipped the very first data row.
            chunk = pd.read_excel('emissions_2003.xlsx',
                                  skiprows=range(1, start_row),
                                  nrows=chunk_size,
                                  engine='openpyxl',
                                  dtype=dtype_dict)

            rows_in_chunk = len(chunk)
            if rows_in_chunk == 0:
                break

            chunk_count += 1
            total_processed += rows_in_chunk

            chunk_stats.append({
                'chunk': chunk_count,
                'rows': rows_in_chunk,
                'memory_mb': chunk.memory_usage(deep=True).sum() / (1024**2)
            })

            # Missing cells per column.
            for col in chunk.columns:
                missing_stats[col] += int(chunk[col].isnull().sum())

            # Category frequencies; value_counts() already drops missing values.
            for col in chunk.columns:
                if dtype_dict.get(col) == 'object' or col in coded_categorical_cols:
                    tally = categorical_stats.setdefault(col, {})
                    for value, count in chunk[col].value_counts().items():
                        tally[value] = tally.get(value, 0) + int(count)

            # Running numeric totals.
            for col in chunk.columns:
                if col in numeric_stats:
                    valid_data = chunk[col].dropna()
                    if len(valid_data) > 0:
                        acc = numeric_stats[col]
                        # Sum in float64: squaring int16/int8 values wraps
                        # around in their own dtype, and float32 sums lose
                        # precision over many rows.
                        as_float = valid_data.astype('float64')
                        acc['count'] += len(valid_data)
                        acc['sum'] += float(as_float.sum())
                        acc['sum_sq'] += float((as_float ** 2).sum())
                        # Extremes keep the column's own dtype so they print
                        # the same way as before.
                        acc['min'] = min(acc['min'], valid_data.min())
                        acc['max'] = max(acc['max'], valid_data.max())
                        acc['zero_count'] += int((valid_data == 0).sum())
                        acc['negative_count'] += int((valid_data < 0).sum())

            print(f"Processed chunk {chunk_count}: rows {start_row:,}-{start_row + rows_in_chunk - 1:,} ({rows_in_chunk:,} rows)")

            start_row += chunk_size

            # With an unknown row count, a short chunk marks the end of the data.
            if total_rows == float('inf') and rows_in_chunk < chunk_size:
                break

            # Drop the chunk before reading the next one to keep peak memory low.
            del chunk
            gc.collect()

        except Exception as e:
            # Best effort: report the failure and keep what was aggregated so far.
            print(f"Error processing chunk starting at row {start_row}: {e}")
            break
    
    # Report built purely from the accumulators filled by the chunk loop.
    print(f"\nTotal rows processed: {total_processed:,}")

    print("\nDATA QUALITY ASSESSMENT")
    print("="*50)

    # total_processed is still 0 when the first chunk read fails; percentages
    # are then reported as 0 instead of raising ZeroDivisionError.
    missing_df = pd.DataFrame({
        'Column': list(missing_stats.keys()),
        'Missing_Count': list(missing_stats.values()),
        'Missing_Percentage': [
            (count / total_processed) * 100 if total_processed else 0.0
            for count in missing_stats.values()
        ]
    })

    # Only columns with gaps are listed, worst first.
    missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

    if not missing_df.empty:
        print("Missing values by column:")
        print(missing_df.to_string(index=False))
    else:
        print("No missing values found!")
    print()

    print("STATISTICAL SUMMARY")
    print("="*50)

    numeric_cols = [col for col, dtype in dtype_dict.items() if dtype in ['float32', 'int16', 'int8']]
    categorical_cols = [col for col, dtype in dtype_dict.items() if dtype == 'object']

    print(f"Numeric columns ({len(numeric_cols)}): {numeric_cols}")
    print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")
    print()

    print("Statistical summary for numeric columns:")
    print("-" * 90)
    print(f"{'Column':<15} {'Count':<10} {'Mean':<12} {'Std':<12} {'Min':<12} {'Max':<12} {'Zeros':<8} {'Negative':<8}")
    print("-" * 90)

    # The loop variable is named col_stats so that the module-level `stats`
    # (the scipy.stats import) is not rebound to a dict.
    for col, col_stats in numeric_stats.items():
        if col_stats['count'] > 0:
            mean = col_stats['sum'] / col_stats['count']
            # Population variance from the running sums, E[x^2] - mean^2,
            # clamped at zero because rounding can push it slightly negative.
            variance = (col_stats['sum_sq'] / col_stats['count']) - (mean ** 2)
            std = np.sqrt(max(0, variance))

            print(f"{col:<15} {col_stats['count']:<10,} {mean:<12.2e} {std:<12.2e} {col_stats['min']:<12.2e} {col_stats['max']:<12.2e} {col_stats['zero_count']:<8,} {col_stats['negative_count']:<8,}")

    print("\nEMISSIONS ANALYSIS")
    print("="*50)

    emission_cols = ['ECO2', 'ECO', 'ECH4', 'EPM2.5']
    # Only emission columns that received at least one non-missing value.
    available_emissions = [col for col in emission_cols if col in numeric_stats and numeric_stats[col]['count'] > 0]

    if available_emissions:
        print("Emission Variables Analysis:")
        for col in available_emissions:
            col_stats = numeric_stats[col]
            mean = col_stats['sum'] / col_stats['count']
            print(f"\n{col}:")
            print(f"  Total emissions: {col_stats['sum']:.2e}")
            print(f"  Mean emissions: {mean:.2e}")
            print(f"  Zero emissions: {col_stats['zero_count']:,} ({(col_stats['zero_count']/col_stats['count'])*100:.1f}%)")
            if col_stats['negative_count'] > 0:
                print(f"  Negative emissions: {col_stats['negative_count']:,} (CHECK DATA QUALITY)")

    print("\nGEOGRAPHICAL DISTRIBUTION")
    print("="*50)

    # The bounding box needs both coordinate columns with data in them.
    if 'longitude' in numeric_stats and 'latitude' in numeric_stats:
        lon_stats = numeric_stats['longitude']
        lat_stats = numeric_stats['latitude']
        if lon_stats['count'] > 0 and lat_stats['count'] > 0:
            print("Geographical bounds:")
            print(f"Longitude range: {lon_stats['min']:.4f} to {lon_stats['max']:.4f}")
            print(f"Latitude range: {lat_stats['min']:.4f} to {lat_stats['max']:.4f}")

    print("\nTEMPORAL ANALYSIS")
    print("="*50)

    if 'doy' in numeric_stats:
        doy_stats = numeric_stats['doy']
        if doy_stats['count'] > 0:
            mean_doy = doy_stats['sum'] / doy_stats['count']
            print("Day of year statistics:")
            print(f"DOY range: {doy_stats['min']} to {doy_stats['max']}")
            print(f"Mean DOY: {mean_doy:.1f}")

    print("\nCATEGORICAL VARIABLES ANALYSIS")
    print("="*50)

    categorical_vars = ['covertype', 'fuelcode', 'fuel_moisture_class', 'burn_source', 'burnday_source']
    available_categorical = [col for col in categorical_vars if col in categorical_stats]

    for col in available_categorical:
        if categorical_stats[col]:
            # Most frequent categories first.
            sorted_counts = sorted(categorical_stats[col].items(), key=lambda x: x[1], reverse=True)
            total_unique = len(sorted_counts)
            print(f"\n{col.upper()} distribution:")
            print(f"  Unique values: {total_unique}")
            print("  Top 10 categories:")
            for value, count in sorted_counts[:10]:
                # Share of all processed rows, guarded against an empty run.
                percentage = (count / total_processed) * 100 if total_processed else 0.0
                print(f"    {value}: {count:,} ({percentage:.2f}%)")
            if total_unique > 10:
                print(f"    ... and {total_unique-10} more categories")

