# Import
Import the tab-delimited text files contained in each .zip archive and write one merged .parquet file per archive.

In [None]:
import pandas as pd
import zipfile
import glob
import os
import re



def make_unique_key(candidate, is_taken):
    """Return *candidate*, or the first free ``candidate_N`` (N = 1, 2, ...).

    Args:
        candidate: Preferred dictionary key.
        is_taken: Callable taking a key and returning True if it is in use.

    Returns:
        A key for which ``is_taken`` returns False.
    """
    key = candidate
    counter = 1
    while is_taken(key):
        key = f"{candidate}_{counter}"
        counter += 1
    return key


def clean_idrssd_frame(df, base_filename):
    """Validate a freshly loaded text file and index it by 'IDRSSD'.

    Column labels are stripped of surrounding whitespace and double quotes
    (this relabels the passed frame in place). Rows are dropped when their
    IDRSSD is missing or blank, or when every other column is empty.
    Duplicate IDRSSD values keep their first occurrence.

    Args:
        df: DataFrame as returned by ``pd.read_csv`` for one text file.
        base_filename: File name, used only in the printed warnings.

    Returns:
        The cleaned DataFrame indexed by 'IDRSSD' (as strings), or None if
        the file has to be skipped (a warning explaining why is printed).
    """
    if df.empty:
        print(f"    - Warning: File {base_filename} loaded as empty DataFrame. Skipping.")
        return None

    df.columns = [str(col).strip().strip('"') for col in df.columns]
    if 'IDRSSD' not in df.columns:
        print(f"    - Warning: 'IDRSSD' column not found in {base_filename}. Skipping this file.")
        return None

    # Record missing identifiers BEFORE casting to str: the cast turns
    # NaN/None into the literal strings 'nan'/'None', which a later dropna()
    # could no longer detect.
    missing_id = df['IDRSSD'].isna()
    df['IDRSSD'] = df['IDRSSD'].astype(str)
    blank_id = df['IDRSSD'].str.strip() == ''
    df = df[~missing_id & ~blank_id]
    if df.empty:
        print(f"    - Warning: File {base_filename} has no valid IDRSSD values. Skipping.")
        return None

    # Remove rows that carry an identifier but no data at all.
    non_idrssd_cols = df.columns.difference(['IDRSSD'])
    df = df.dropna(subset=non_idrssd_cols, how='all')
    if df.empty:
        print(f"    - Warning: File {base_filename} has no valid data after removing empty rows. Skipping.")
        return None

    if df['IDRSSD'].duplicated().any():
        print(f"      - Warning: Duplicate 'IDRSSD' values found in {base_filename}. Keeping first occurrence.")
        df = df.drop_duplicates(subset='IDRSSD', keep='first')

    return df.set_index('IDRSSD')


# --- Regex for Split File Detection ---
# Matches patterns like "...(1 of 3).txt", "... ( 2 of 10 ).txt", etc.
# (optional whitespace is allowed before and just inside the parentheses).
# Groups: 1=Base Name, 2=Part Number, 3=Total Parts
split_file_pattern = re.compile(
    r"^(.*?)\s*\(\s*(\d+)\s+of\s+(\d+)\s*\)\.txt$", re.IGNORECASE
)

# --- Find Zip Files ---
print("Searching for zip files...")
# glob returns paths in arbitrary order; sort so that processing order (and
# therefore which description wins in code_metadata_map) is reproducible.
zip_file_paths = sorted(glob.glob('zip/*.zip'))

# Dictionary to store code -> metadata mapping across all files.
# The first description seen for a code is kept.
code_metadata_map = {}

if not zip_file_paths:
    print("Error: No .zip files found in the 'zip/' directory.")
else:
    print(f"Found zip files: {zip_file_paths}")

    # --- Loop through Each Zip File Independently ---
    for zip_file_path in zip_file_paths:
        print(f"\n--- Processing Zip File: {zip_file_path} ---")

        # DataFrames ready to be merged for THIS zip, keyed by a unique name.
        dfs_in_zip = {}
        # Parts of split files awaiting combination.
        # Key: (base_name, total_parts), Value: {part_num: DataFrame}
        split_file_groups = {}
        loaded_files_count = 0

        try:
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                zip_namelist = zip_ref.namelist()  # Full paths within the zip
                print(f"  Files found inside zip: {[os.path.basename(f) for f in zip_namelist]}")

                # --- Load All TXT Files from the Current Zip ---
                for internal_path in zip_namelist:
                    base_filename = os.path.basename(internal_path)

                    if not base_filename.lower().endswith('.txt'):
                        continue

                    print(f"  - Attempting to load: '{base_filename}' (assuming header=0, skiprows=[1])")
                    try:
                        with zip_ref.open(internal_path) as file_in_zip:

                            # 1. Read only the first two rows (codes, then
                            #    descriptions) for the code/metadata mapping.
                            #    Failure here is non-fatal for the data load.
                            try:
                                header_meta_df = pd.read_csv(file_in_zip, sep='\t', header=None, nrows=2, encoding='utf-8', low_memory=False, dtype=str)
                                if header_meta_df.shape[0] == 2:
                                    codes = header_meta_df.iloc[0].values
                                    descriptions = header_meta_df.iloc[1].values
                                    for code, desc in zip(codes, descriptions):
                                        code_str = str(code).strip().strip('"')
                                        desc_str = str(desc).strip().strip('"')
                                        if code_str and code_str != 'IDRSSD' and code_str not in code_metadata_map and pd.notna(desc) and not desc_str.isdigit():
                                            code_metadata_map[code_str] = desc_str
                            except Exception as e:
                                print(f"    - Warning: Could not read header/metadata rows from {base_filename}: {e}")

                            # 2. Rewind file stream
                            file_in_zip.seek(0)

                            # 3. Read the actual data: row 0 is the header,
                            #    row 1 (the descriptions) is skipped.
                            read_params = {
                                'sep': '\t', 'low_memory': False, 'dtype': {'IDRSSD': str},
                                'encoding': 'utf-8', 'header': 0, 'skiprows': [1]
                            }
                            try:
                                df = pd.read_csv(file_in_zip, **read_params)
                            except UnicodeDecodeError:
                                # Must precede ValueError: UnicodeDecodeError is a subclass of it.
                                print(f"    - Warning: UTF-8 decoding failed for {base_filename}. Trying 'latin1'.")
                                read_params['encoding'] = 'latin1'
                                file_in_zip.seek(0)
                                df = pd.read_csv(file_in_zip, **read_params)
                            except ValueError as ve:
                                if 'Integer column has NA values' in str(ve) or 'cannot safely convert' in str(ve):
                                    print(f"    - Warning: Potential mixed types or NAs in IDRSSD for {base_filename}. Retrying with object dtype.")
                                    read_params.pop('dtype', None)
                                    file_in_zip.seek(0)
                                    df = pd.read_csv(file_in_zip, **read_params, dtype=object)
                                else:
                                    raise

                        # Basic validation & cleaning; None means "skip file".
                        df = clean_idrssd_frame(df, base_filename)
                        if df is None:
                            continue

                        # --- Check if file is part of a split ---
                        match = split_file_pattern.match(base_filename)
                        if match:
                            base_name_part = match.group(1).strip()
                            part_num = int(match.group(2))
                            total_parts = int(match.group(3))
                            group_key = (base_name_part, total_parts)

                            print(f"      Detected split file: Base='{base_name_part}', Part={part_num}, Total={total_parts}. Shape: {df.shape}")

                            group_parts = split_file_groups.setdefault(group_key, {})

                            # A repeated part number replaces the earlier one.
                            if part_num in group_parts:
                                print(f"      - Warning: Duplicate part number {part_num} found for group {group_key}. Overwriting with data from {base_filename}.")

                            group_parts[part_num] = df
                            loaded_files_count += 1  # Count loaded parts

                        # --- Handle non-split files ---
                        else:
                            # Check if any columns remain besides the index
                            if df.shape[1] == 0:
                                print(f"    - Skipping non-split file {base_filename} as it contains no data columns after processing.")
                                continue

                            loaded_files_count += 1
                            # Use filename without extension as key; keep it
                            # unique against stored frames and split groups.
                            original_key = os.path.splitext(base_filename)[0]
                            file_key = make_unique_key(
                                original_key,
                                lambda k: k in dfs_in_zip or any(g[0] == k for g in split_file_groups),
                            )
                            if file_key != original_key:
                                print(f"    - Warning: Potential key conflict for '{original_key}'. Using '{file_key}' instead.")

                            dfs_in_zip[file_key] = df
                            print(f"      Stored non-split file as '{file_key}' (in zip). Shape: {df.shape}")

                    except pd.errors.EmptyDataError:
                        print(f"  - Warning: File is empty inside zip: {base_filename}")
                    except Exception as e:
                        print(f"  - Error loading or processing {base_filename} from {zip_file_path}: {e}")
                # --- End Loop through files in zip ---

            # --- Combine Split File Groups found in THIS Zip ---
            print("\n  - Checking for complete split file groups...")

            # Iterate over a copy of the keys: groups are deleted as we go.
            for group_key in list(split_file_groups):
                base_name_part, total_parts = group_key
                parts_dict = split_file_groups[group_key]

                # Complete means exactly parts 1..total_parts are present.
                if len(parts_dict) == total_parts and all(i in parts_dict for i in range(1, total_parts + 1)):
                    print(f"    - Found complete group: '{base_name_part}' ({total_parts} parts). Concatenating horizontally...")

                    try:
                        # Parts in ascending part-number order
                        sorted_parts = [parts_dict[i] for i in range(1, total_parts + 1)]

                        combined_df = sorted_parts[0]
                        for i in range(1, total_parts):
                            next_part_df = sorted_parts[i]
                            # Columns already present win; drop repeats from the later part.
                            overlapping_cols = combined_df.columns.intersection(next_part_df.columns)
                            if len(overlapping_cols) > 0:
                                print(f"      - Warning: Overlapping columns found in part {i+1}: {list(overlapping_cols)}. Dropping from part {i+1}.")
                                next_part_df = next_part_df.drop(columns=overlapping_cols)
                            # Column-wise concat aligned on the IDRSSD index
                            combined_df = pd.concat([combined_df, next_part_df], axis=1)

                        original_key = f"{base_name_part}_Combined_{total_parts}parts"
                        combined_key = make_unique_key(original_key, lambda k: k in dfs_in_zip)
                        if combined_key != original_key:
                            print(f"      - Warning: Potential key conflict for combined group '{original_key}'. Using '{combined_key}'.")

                        dfs_in_zip[combined_key] = combined_df
                        print(f"      - Combined group shape: {combined_df.shape}. Stored as '{combined_key}'.")
                        # Remove the group from split_file_groups as it's processed
                        del split_file_groups[group_key]
                    except Exception as e:
                        # The parts of a group that fails to combine are left
                        # out of the merge for this zip.
                        print(f"      - Error during concatenation for group {group_key}: {e}. Skipping combination for this group.")

                else:
                    print(f"    - Incomplete group found: '{base_name_part}'. Expected {total_parts} parts, found {len(parts_dict)}: {list(parts_dict.keys())}. Parts will not be combined.")
                    # Keep the orphan parts as individual frames with descriptive names.
                    for part_num, df_part in parts_dict.items():
                        original_key = f"{base_name_part}_Part_{part_num}_of_{total_parts}"
                        part_key = make_unique_key(original_key, lambda k: k in dfs_in_zip)
                        if part_key != original_key:
                            print(f"      - Warning: Potential key conflict for orphan part '{original_key}'. Using '{part_key}'.")
                        dfs_in_zip[part_key] = df_part
                        print(f"      - Storing incomplete part {part_num} as '{part_key}'. Shape: {df_part.shape}")
                    del split_file_groups[group_key]  # Remove processed incomplete group

            # --- Merge DataFrames Loaded from THIS Zip (including combined splits) ---
            if not dfs_in_zip:
                print("  - No mergeable dataframes were loaded or derived from this zip file.")
                merged_df_for_zip = pd.DataFrame()
            else:
                print(f"\n  - Merging {len(dfs_in_zip)} DataFrames (incl. combined/orphan splits) loaded from {zip_file_path}...")

                # Base frame for the merge: first combined split group if any, ...
                base_key = next((k for k in dfs_in_zip if "_Combined_" in k), None)

                # ... else the first key matching a preferred name part
                # (at the start of the key or preceded by a space), ...
                if base_key is None:
                    preferred_key_parts = ['ENT', 'POR', 'RC', 'RCA']
                    for p_key_part in preferred_key_parts:
                        base_key = next(
                            (k for k in dfs_in_zip
                             if k.startswith(p_key_part) or f" {p_key_part}" in k),
                            None,
                        )
                        if base_key is not None:
                            break

                # ... else the frame with the most cells (rows * columns).
                if base_key is None:
                    base_key = max(dfs_in_zip, key=lambda k: dfs_in_zip[k].shape[0] * dfs_in_zip[k].shape[1])

                try:
                    merged_df_for_zip = dfs_in_zip.pop(base_key)
                    print(f"    - Starting merge with '{base_key}' (Shape: {merged_df_for_zip.shape})")
                    for key in list(dfs_in_zip):
                        df_to_merge = dfs_in_zip.pop(key)
                        print(f"    - Merging '{key}' (Shape: {df_to_merge.shape})...")
                        # Ensure the dataframe to merge actually has columns before merging
                        if df_to_merge.shape[1] > 0:
                            merged_df_for_zip = pd.merge(
                                merged_df_for_zip, df_to_merge, left_index=True,
                                right_index=True, how='outer',
                                suffixes=('', f'_{key}')  # Suffix identifies origin if columns clash
                            )
                            print(f"      - Merged shape: {merged_df_for_zip.shape}")
                        else:
                            print(f"      - Skipping merge for '{key}' as it has no columns besides index.")
                    merged_df_for_zip = merged_df_for_zip.reset_index()
                    print(f"  - Merge complete for this zip. Final shape: {merged_df_for_zip.shape}")
                except Exception as e:
                    print(f"    - Error during merge process: {e}")
                    merged_df_for_zip = pd.DataFrame()  # Reset on error

            # --- Save the Merged DataFrame for THIS Zip as Parquet ---
            if not merged_df_for_zip.empty:
                output_filename_base, _ = os.path.splitext(os.path.basename(zip_file_path))
                output_filename = f"parquet/{output_filename_base}.parquet"
                print(f"\n  - Saving merged data for {zip_file_path} to: {output_filename}")
                try:
                    # The output folder may not exist yet on a first run.
                    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
                    # Ensure final IDRSSD is string before saving
                    if 'IDRSSD' in merged_df_for_zip.columns:
                        merged_df_for_zip['IDRSSD'] = merged_df_for_zip['IDRSSD'].astype(str)
                    merged_df_for_zip.to_parquet(output_filename, index=False, engine='pyarrow')
                    print(f"  - Successfully saved {output_filename}")
                except ImportError:
                    print("    - Error: 'pyarrow' not found. Trying 'fastparquet'.")
                    try:
                        if 'IDRSSD' in merged_df_for_zip.columns:
                            merged_df_for_zip['IDRSSD'] = merged_df_for_zip['IDRSSD'].astype(str)
                        merged_df_for_zip.to_parquet(output_filename, index=False, engine='fastparquet')
                        print(f"  - Successfully saved {output_filename} using fastparquet")
                    except ImportError:
                        print("    - Error: Neither 'pyarrow' nor 'fastparquet' found. Cannot save Parquet file.")
                    except Exception as e:
                        print(f"    - Error saving Parquet file {output_filename} with fastparquet: {e}")
                except Exception as e:
                    print(f"  - Error saving Parquet file {output_filename}: {e}")
            elif loaded_files_count > 0:
                print("\n  - No data to save for this zip (merge resulted in empty DataFrame or only invalid files loaded).")
            else:
                print("\n  - No valid FFIEC text files found or loaded in this zip file.")

        except zipfile.BadZipFile:
            print(f"Error: Failed to open {zip_file_path}. It might be corrupted or not a valid zip file.")
        except Exception as e:
            print(f"An unexpected error occurred while processing {zip_file_path}: {e}")
    # --- End loop through zip files ---

    # --- Create and save the Code <-> Metadata Mapping DataFrame ---
    if code_metadata_map:
        print("\n\n--- Code to Metadata Mapping ---")
        metadata_df = pd.DataFrame(list(code_metadata_map.items()), columns=['Code', 'Metadata'])

        try:
            mapping_filename = "code_metadata_mapping.csv"
            metadata_df.to_csv(mapping_filename, index=False)
            print(f"\nMapping saved to {mapping_filename}")
        except Exception as e:
            print(f"\nError saving mapping file: {e}")
    else:
        print("\n\n--- No code-metadata mapping generated (no valid metadata rows found or files processed) ---")

    print("\n--- Finished processing all zip files ---")

Searching for zip files...
Found zip files: ['FFIEC CDR Call Bulk All Schedules 03312001.zip', 'FFIEC CDR Call Bulk All Schedules 03312002.zip', 'FFIEC CDR Call Bulk All Schedules 03312003.zip', 'FFIEC CDR Call Bulk All Schedules 03312004.zip', 'FFIEC CDR Call Bulk All Schedules 03312005.zip', 'FFIEC CDR Call Bulk All Schedules 03312006.zip', 'FFIEC CDR Call Bulk All Schedules 03312007.zip', 'FFIEC CDR Call Bulk All Schedules 03312008.zip', 'FFIEC CDR Call Bulk All Schedules 03312009.zip', 'FFIEC CDR Call Bulk All Schedules 03312010.zip', 'FFIEC CDR Call Bulk All Schedules 03312011.zip', 'FFIEC CDR Call Bulk All Schedules 03312012.zip', 'FFIEC CDR Call Bulk All Schedules 03312013.zip', 'FFIEC CDR Call Bulk All Schedules 03312014.zip', 'FFIEC CDR Call Bulk All Schedules 03312015.zip', 'FFIEC CDR Call Bulk All Schedules 03312016.zip', 'FFIEC CDR Call Bulk All Schedules 03312017.zip', 'FFIEC CDR Call Bulk All Schedules 03312018.zip', 'FFIEC CDR Call Bulk All Schedules 03312019.zip', 'FFIE