<a href="https://colab.research.google.com/github/xmendevs/Bank-Data-Websraping/blob/main/Bank_Data_Webscraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import pandas as pd
import os

# --- Configuration ---
# The five uploaded exports to stack: the original download plus the
# browser-numbered copies "(1)" through "(4)" of the same file name.
FILE_NAMES = ['nigerian_banks_seo_dataset.csv'] + [
    f'nigerian_banks_seo_dataset ({copy_no}).csv' for copy_no in range(1, 5)
]
# Destination of the combined (stacked) dataset.
OUTPUT_CONCAT_FILE = 'master_nigerian_banks_seo_dataset.csv'
# Removed dummy file creation since we are using your uploaded files.

# --- 1. Concatenation (Joining Datasets Vertically) ---

def concatenate_datasets(file_list, output_concat_file=None):
    """
    Reads multiple CSV files and joins them vertically (stacking rows) using pd.concat().

    Files that do not exist or cannot be read are reported on stdout and
    skipped, so one bad input does not abort the whole run.

    Args:
        file_list: Iterable of paths to the CSV files to stack, in order.
        output_concat_file: Path the combined CSV is written to. When None
            (the default) the module-level OUTPUT_CONCAT_FILE is used.

    Returns:
        The combined DataFrame with a fresh 0..n-1 index, or an empty
        DataFrame when no file could be loaded (no CSV is written then).
    """
    frames = []

    # Load every readable file into its own DataFrame.
    for file in file_list:
        if not os.path.exists(file):
            print(f"Warning: File not found: {file}. Skipping.")
            continue
        try:
            # Assuming the files are standard comma-separated, as typical for SEO data.
            # If they are tab-separated, change to sep='\t'
            df = pd.read_csv(file)
        except Exception as e:
            # Deliberate best-effort: report the problem and keep going.
            print(f"Error reading {file}: {e}. Skipping this file.")
            continue
        frames.append(df)
        print(f"Loaded {file} ({len(df)} rows)")

    if not frames:
        print("\n--- CONCATENATION FAILED ---")
        print("No dataframes were loaded successfully.")
        return pd.DataFrame()

    # Resolve the default lazily so the failure path above never needs it.
    if output_concat_file is None:
        output_concat_file = OUTPUT_CONCAT_FILE

    # ignore_index=True resets the index of the new combined DataFrame.
    # NOTE: pd.concat defaults to an outer join on columns, so files whose
    # headers differ yield NaN-filled cells rather than an error.
    master_df = pd.concat(frames, ignore_index=True)
    # Save the final combined DataFrame to a new CSV file (without the index).
    master_df.to_csv(output_concat_file, index=False)
    print("\n--- CONCATENATION SUCCESS ---")
    print(f"Total rows in master dataset: {len(master_df)}. Saved to {output_concat_file}")
    print("\nMaster Concatenated Data Preview (First 10 rows):")
    print(master_df.head(10))
    return master_df


# ----------------------------------------------------------------------
# --- Main Execution ---
# ----------------------------------------------------------------------

if __name__ == '__main__':
    # We are skipping create_dummy_data() as we use your real uploaded files

    # Run concatenation over every configured file. The count in the banner is
    # derived from FILE_NAMES so it cannot drift when the list is edited.
    print(f"\n--- Starting Vertical Concatenation of all {len(FILE_NAMES)} Datasets ---")
    concatenate_datasets(FILE_NAMES)



--- Starting Vertical Concatenation of all 5 Datasets ---
Loaded nigerian_banks_seo_dataset.csv (266 rows)
Loaded nigerian_banks_seo_dataset (1).csv (86 rows)
Loaded nigerian_banks_seo_dataset (2).csv (113 rows)
Loaded nigerian_banks_seo_dataset (3).csv (192 rows)
Loaded nigerian_banks_seo_dataset (4).csv (108 rows)

--- CONCATENATION SUCCESS ---
Total rows in master dataset: 765. Saved to master_nigerian_banks_seo_dataset.csv

Master Concatenated Data Preview (First 10 rows):
                            keyword  rank_position  \
0              best bank in nigeria              2   
1              best bank in nigeria              8   
2  best bank for savings in nigeria              1   
3  best bank for savings in nigeria              5   
4  best bank for savings in nigeria              7   
5  best bank for savings in nigeria              8   
6  open bank account online nigeria              3   
7  open bank account online nigeria              5   
8  open bank account online nig

In [20]:
import pandas as pd
import os

# --- Configuration ---
# The five uploaded exports to stack: the original download plus the
# browser-numbered copies "(1)" through "(4)" of the same file name.
FILE_NAMES = ['nigerian_banks_seo_dataset.csv'] + [
    f'nigerian_banks_seo_dataset ({copy_no}).csv' for copy_no in range(1, 5)
]
# Raw stacked output, written before any duplicate removal.
OUTPUT_CONCAT_FILE = 'master_nigerian_banks_seo_dataset.csv'
# Final output, written after duplicate rows have been dropped.
OUTPUT_CLEAN_FILE = 'clean_master_nigerian_banks_seo_dataset.csv'

# --- 1. Concatenation (Joining Datasets Vertically) ---

def concatenate_datasets(file_list, output_concat_file=None, output_clean_file=None):
    """
    Reads multiple CSV files, joins them vertically, removes duplicates, and saves the clean result.

    Files that do not exist or cannot be read are reported on stdout and
    skipped, so one bad input does not abort the whole run.

    Args:
        file_list: Iterable of paths to the CSV files to stack, in order.
        output_concat_file: Path for the raw stacked CSV (duplicates
            included). When None, the module-level OUTPUT_CONCAT_FILE is used.
        output_clean_file: Path for the de-duplicated CSV. When None, the
            module-level OUTPUT_CLEAN_FILE is used.

    Returns:
        The de-duplicated DataFrame with a contiguous 0..n-1 index, or an
        empty DataFrame when no file could be loaded (no CSV is written then).
    """
    frames = []

    # Load every readable file into its own DataFrame.
    for file in file_list:
        if not os.path.exists(file):
            print(f"Warning: File not found: {file}. Skipping.")
            continue
        try:
            # Assuming the files are standard comma-separated, as typical for SEO data.
            df = pd.read_csv(file)
        except Exception as e:
            # Deliberate best-effort: report the problem and keep going.
            print(f"Error reading {file}: {e}. Skipping this file.")
            continue
        frames.append(df)
        print(f"Loaded {file} ({len(df)} rows)")

    if not frames:
        print("\n--- CONCATENATION FAILED ---")
        print("No dataframes were loaded successfully.")
        return pd.DataFrame()

    # Resolve the defaults lazily so the failure path above never needs them.
    if output_concat_file is None:
        output_concat_file = OUTPUT_CONCAT_FILE
    if output_clean_file is None:
        output_clean_file = OUTPUT_CLEAN_FILE

    # Step 1: Concatenate all files (ignore_index=True gives a fresh index).
    master_df = pd.concat(frames, ignore_index=True)
    initial_rows = len(master_df)

    # Optional: Save the initial concatenated file (including duplicates)
    master_df.to_csv(output_concat_file, index=False)
    print("\n--- CONCATENATION SUCCESS ---")
    print(f"Initial total rows (including potential duplicates): {initial_rows}. Saved to {output_concat_file}")

    # Step 2: Remove duplicates.
    # keep='first' keeps the first occurrence and drops later rows that are
    # identical across every column. The index is reset afterwards so the
    # returned frame has no gaps where duplicates were removed.
    clean_df = master_df.drop_duplicates(keep='first').reset_index(drop=True)
    dropped_rows = initial_rows - len(clean_df)

    # Step 3: Save the clean master dataset
    clean_df.to_csv(output_clean_file, index=False)

    print("\n--- CLEANING SUCCESS ---")
    print(f"Total duplicate rows dropped: {dropped_rows}")
    print(f"Final clean rows in master dataset: {len(clean_df)}. Saved to {output_clean_file}")
    print("\nClean Master Data Preview (First 10 rows):")
    print(clean_df.head(10))

    return clean_df


# ----------------------------------------------------------------------
# --- Main Execution ---
# ----------------------------------------------------------------------

if __name__ == '__main__':

    # Run concatenation and cleaning over every configured file. The count in
    # the banner is derived from FILE_NAMES so it cannot drift when the list
    # is edited.
    print(f"\n--- Starting Vertical Concatenation and Cleaning of all {len(FILE_NAMES)} Datasets ---")
    concatenate_datasets(FILE_NAMES)



--- Starting Vertical Concatenation and Cleaning of all 5 Datasets ---
Loaded nigerian_banks_seo_dataset.csv (266 rows)
Loaded nigerian_banks_seo_dataset (1).csv (86 rows)
Loaded nigerian_banks_seo_dataset (2).csv (113 rows)
Loaded nigerian_banks_seo_dataset (3).csv (192 rows)
Loaded nigerian_banks_seo_dataset (4).csv (108 rows)

--- CONCATENATION SUCCESS ---
Initial total rows (including potential duplicates): 765. Saved to master_nigerian_banks_seo_dataset.csv

--- CLEANING SUCCESS ---
Total duplicate rows dropped: 0
Final clean rows in master dataset: 765. Saved to clean_master_nigerian_banks_seo_dataset.csv

Clean Master Data Preview (First 10 rows):
                            keyword  rank_position  \
0              best bank in nigeria              2   
1              best bank in nigeria              8   
2  best bank for savings in nigeria              1   
3  best bank for savings in nigeria              5   
4  best bank for savings in nigeria              7   
5  best bank