In [2]:
import os
import pandas as pd

# Consolidate every per-event HumAID TSV file into a single TSV, tagging each
# row with the event folder it came from and the dataset split of its file.

# ✅ Anchor all paths on the current working directory. A notebook has no
# __file__, so this is the kernel's working directory, not a script location.
script_dir = os.getcwd()
print("📂 Current Working Directory:", script_dir)

# ✅ Absolute path to data folder (based on your description, adjust if needed)
data_dir = os.path.abspath(os.path.join(script_dir, '..', '..', 'data', 'raw_data', 'CRISIS_NLP', 'HumAID_data_events_set1_47K', 'events_set1'))

print("📁 Data folder (resolved absolute path):", data_dir)


def infer_split(file_name):
    """Return the dataset split encoded in a TSV file name.

    The last underscore-separated token of the name (extension removed,
    case-insensitive) is checked first, so an event name that happens to
    contain 'train', 'dev', 'val' or 'test' cannot override the real suffix.
    If that token is not a known split marker, the whole name is searched
    for those substrings instead.

    Args:
        file_name: Base name of the file, e.g. 'cyclone_idai_2019_dev.tsv'.

    Returns:
        One of 'train', 'dev', 'test' or 'unknown'.
    """
    suffix_to_split = {'train': 'train', 'dev': 'dev', 'val': 'dev', 'test': 'test'}

    stem = os.path.splitext(file_name)[0].lower()
    last_token = stem.rsplit('_', 1)[-1]
    if last_token in suffix_to_split:
        return suffix_to_split[last_token]

    # Fallback for names that do not end in a split marker.
    file_lower = file_name.lower()
    if 'train' in file_lower:
        return 'train'
    if 'dev' in file_lower or 'val' in file_lower:
        return 'dev'
    if 'test' in file_lower:
        return 'test'
    return 'unknown'


# Collect all DataFrames here
all_dfs = []

# A wrong path is the most likely failure, so report it and fall through to
# the "nothing loaded" message below instead of dying inside os.listdir.
if os.path.isdir(data_dir):
    # os.listdir returns entries in arbitrary order; sort so the combined
    # file has the same row order on every run and every platform.
    event_folders = sorted(os.listdir(data_dir))
else:
    print(f"❌ Data folder not found: {data_dir}")
    event_folders = []

# Traverse folders inside the dataset
for event_folder in event_folders:
    event_path = os.path.join(data_dir, event_folder)

    if not os.path.isdir(event_path):
        continue

    print(f"\n📁 Checking folder: {event_folder}")
    files_in_folder = sorted(os.listdir(event_path))
    print("📄 Files found:", files_in_folder)

    for file in files_in_folder:
        # Case-insensitive so '.TSV' files are not silently skipped.
        if not file.lower().endswith('.tsv'):
            continue

        file_path = os.path.join(event_path, file)

        # Best effort: one unreadable file must not abort the whole run, so
        # report it and move on. Only the read itself is guarded.
        try:
            df = pd.read_csv(file_path, sep='\t')
        except Exception as e:
            print(f"❌ Failed to read: {file_path}")
            print(f"   Error: {e}")
            continue

        df['event'] = event_folder
        df['split'] = infer_split(file)

        all_dfs.append(df)
        print(f"✅ Loaded: {file}")

# Combine and save
if all_dfs:
    combined_df = pd.concat(all_dfs, ignore_index=True, sort=False)

    # ✅ Save under <four levels above data_dir>/interim_data. The path is
    # normalized so the '..' segments do not appear in it, and the folder is
    # created if missing so to_csv does not fail on a fresh checkout.
    output_dir = os.path.abspath(os.path.join(data_dir, '..', '..', '..', '..', 'interim_data'))
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'crisis_consolidated_humanAID.tsv')
    combined_df.to_csv(output_path, sep='\t', index=False)

    print(f"\n✅ Combined TSV saved to: {output_path}")
else:
    print("\n⚠️ No TSV files were loaded. Please check the paths and try again.")


📂 Current Working Directory: c:\Users\MUHAMMAD ZAIN\Desktop\DisasterInsight_AI\DisasterInsight_AI_Global_Real-Time_Disaster_Analytics_Platform\notebooks\Consolidate_notebooks
📁 Data folder (resolved absolute path): c:\Users\MUHAMMAD ZAIN\Desktop\DisasterInsight_AI\DisasterInsight_AI_Global_Real-Time_Disaster_Analytics_Platform\data\raw_data\CRISIS_NLP\HumAID_data_events_set1_47K\events_set1

📁 Checking folder: canada_wildfires_2016
📄 Files found: ['canada_wildfires_2016_dev.tsv', 'canada_wildfires_2016_test.tsv', 'canada_wildfires_2016_train.tsv']
✅ Loaded: canada_wildfires_2016_dev.tsv
✅ Loaded: canada_wildfires_2016_test.tsv
✅ Loaded: canada_wildfires_2016_train.tsv

📁 Checking folder: cyclone_idai_2019
📄 Files found: ['cyclone_idai_2019_dev.tsv', 'cyclone_idai_2019_test.tsv', 'cyclone_idai_2019_train.tsv']
✅ Loaded: cyclone_idai_2019_dev.tsv
✅ Loaded: cyclone_idai_2019_test.tsv
✅ Loaded: cyclone_idai_2019_train.tsv

📁 Checking folder: ecuador_earthquake_2016
📄 Files found: ['ecuador