# Concatenate DataFrames Vertically and Horizontally
## 1. Basic Concatenation Operations

In [1]:
import pandas as pd
import numpy as np

# Build the three sample tables that later cells concatenate.
print("CREATING SAMPLE DATAFRAMES")
print("=" * 80)

# Q1 sales: one row per employee with first-quarter sales and target.
q1_sales = pd.DataFrame(
    [
        (101, 'Alice', 'IT', 15000, 14000),
        (102, 'Bob', 'HR', 12000, 13000),
        (103, 'Charlie', 'Finance', 18000, 17000),
    ],
    columns=['employee_id', 'name', 'department', 'q1_sales', 'q1_target'],
)

# Q2 sales: same layout with q2_* columns; employee 104 appears only here.
q2_sales = pd.DataFrame(
    [
        (101, 'Alice', 'IT', 16000, 15000),
        (102, 'Bob', 'HR', 13000, 14000),
        (103, 'Charlie', 'Finance', 19000, 18000),
        (104, 'David', 'IT', 22000, 20000),
    ],
    columns=['employee_id', 'name', 'department', 'q2_sales', 'q2_target'],
)

# Employee master data: hire date (kept as plain strings), salary, location.
employee_info = pd.DataFrame(
    [
        (101, '2020-01-15', 75000, 'NYC'),
        (102, '2019-03-20', 65000, 'LA'),
        (103, '2021-07-10', 80000, 'Chicago'),
        (104, '2022-02-01', 72000, 'NYC'),
        (105, '2023-05-15', 68000, 'LA'),
    ],
    columns=['employee_id', 'hire_date', 'salary', 'location'],
)

# Show each table under its heading; headings after the first start with a
# blank line so the tables are visually separated.
sample_frames = [
    ("Q1 Sales DataFrame:", q1_sales),
    ("\nQ2 Sales DataFrame:", q2_sales),
    ("\nEmployee Info DataFrame:", employee_info),
]
for heading, frame in sample_frames:
    print(heading)
    print(frame)

CREATING SAMPLE DATAFRAMES
Q1 Sales DataFrame:
   employee_id     name department  q1_sales  q1_target
0          101    Alice         IT     15000      14000
1          102      Bob         HR     12000      13000
2          103  Charlie    Finance     18000      17000

Q2 Sales DataFrame:
   employee_id     name department  q2_sales  q2_target
0          101    Alice         IT     16000      15000
1          102      Bob         HR     13000      14000
2          103  Charlie    Finance     19000      18000
3          104    David         IT     22000      20000

Employee Info DataFrame:
   employee_id   hire_date  salary location
0          101  2020-01-15   75000      NYC
1          102  2019-03-20   65000       LA
2          103  2021-07-10   80000  Chicago
3          104  2022-02-01   72000      NYC
4          105  2023-05-15   68000       LA


# 2. Vertical Concatenation (Row-wise)
## Example 2.1: Basic Vertical Concatenation

In [2]:
print("\n" + "="*80)
print("VERTICAL CONCATENATION (ROW-WISE)")
print("="*80)

# Stack the Q1 and Q2 tables on top of each other (axis=0 is the default).
# The q1_* and q2_* columns exist in only one input each, so pandas takes the
# union of the columns and fills the gaps with NaN; each input also keeps its
# own row labels, so 0-2 appear twice in the result.
combined_sales = pd.concat([q1_sales, q2_sales], axis=0)
print("1. Basic vertical concatenation (all rows from both DataFrames):")
print(combined_sales)
print(f"Shape: {combined_sales.shape} (rows, columns)")
# The original note claimed "duplicate rows", which the result does not contain;
# describe what actually happens instead.
print("Note: Index labels repeat (0-2 appear twice) and columns unique to one quarter are filled with NaN")

# ignore_index=True discards the input labels and numbers the rows 0..n-1.
combined_sales_reset = pd.concat([q1_sales, q2_sales], ignore_index=True)
print("\n2. With ignore_index=True (new sequential index):")
print(combined_sales_reset)


VERTICAL CONCATENATION (ROW-WISE)
1. Basic vertical concatenation (all rows from both DataFrames):
   employee_id     name department  q1_sales  q1_target  q2_sales  q2_target
0          101    Alice         IT   15000.0    14000.0       NaN        NaN
1          102      Bob         HR   12000.0    13000.0       NaN        NaN
2          103  Charlie    Finance   18000.0    17000.0       NaN        NaN
0          101    Alice         IT       NaN        NaN   16000.0    15000.0
1          102      Bob         HR       NaN        NaN   13000.0    14000.0
2          103  Charlie    Finance       NaN        NaN   19000.0    18000.0
3          104    David         IT       NaN        NaN   22000.0    20000.0
Shape: (7, 7) (rows, columns)
Note: Duplicate rows appear because we concatenated similar data

2. With ignore_index=True (new sequential index):
   employee_id     name department  q1_sales  q1_target  q2_sales  q2_target
0          101    Alice         IT   15000.0    14000.0      

## Example 2.2: Vertical Concatenation with Different Columns

In [3]:
# Two per-quarter "extras" tables: they share employee_id and name, but every
# other column exists in only one of them.
q1_extra = pd.DataFrame(
    [
        (101, 'Alice', 1500, 750),
        (102, 'Bob', 1200, 600),
        (103, 'Charlie', 1800, 900),
    ],
    columns=['employee_id', 'name', 'q1_bonus', 'q1_commissions'],
)

q2_extra = pd.DataFrame(
    [
        (101, 'Alice', 1600, 5),
        (102, 'Bob', 1300, 3),
        (103, 'Charlie', 1900, 7),
    ],
    columns=['employee_id', 'name', 'q2_bonus', 'q2_vacation_days'],
)

print("\n3. DataFrames with different columns:")
print("Q1 Extra:", q1_extra, sep="\n")
print("\nQ2 Extra:", q2_extra, sep="\n")

# Row-wise concat takes the union of the columns; cells for a column that an
# input does not have come out as NaN.
combined_different = pd.concat((q1_extra, q2_extra), ignore_index=True)
print("\nConcatenated (different columns filled with NaN):", combined_different, sep="\n")


3. DataFrames with different columns:
Q1 Extra:
   employee_id     name  q1_bonus  q1_commissions
0          101    Alice      1500             750
1          102      Bob      1200             600
2          103  Charlie      1800             900

Q2 Extra:
   employee_id     name  q2_bonus  q2_vacation_days
0          101    Alice      1600                 5
1          102      Bob      1300                 3
2          103  Charlie      1900                 7

Concatenated (different columns filled with NaN):
   employee_id     name  q1_bonus  q1_commissions  q2_bonus  q2_vacation_days
0          101    Alice    1500.0           750.0       NaN               NaN
1          102      Bob    1200.0           600.0       NaN               NaN
2          103  Charlie    1800.0           900.0       NaN               NaN
3          101    Alice       NaN             NaN    1600.0               5.0
4          102      Bob       NaN             NaN    1300.0               3.0
5          10

## Example 2.3: Vertical Concatenation with keys

In [4]:
# Tag each input with a quarter label: `keys` becomes the outer level of a
# two-level row index and `names` labels both levels.
quarter_labels = ['Q1', 'Q2']
combined_with_keys = pd.concat(
    [q1_sales, q2_sales],
    keys=quarter_labels,
    names=['Quarter', 'Row'],
)
print("\n4. Vertical concatenation with keys (multi-level index):")
print(combined_with_keys)
print("\nIndex levels: {}".format(combined_with_keys.index.names))

# Selecting on the outer level returns the rows that came from that input.
for quarter in quarter_labels:
    print("\nAccessing " + quarter + " data:")
    print(combined_with_keys.loc[quarter])


4. Vertical concatenation with keys (multi-level index):
             employee_id     name department  q1_sales  q1_target  q2_sales  \
Quarter Row                                                                   
Q1      0            101    Alice         IT   15000.0    14000.0       NaN   
        1            102      Bob         HR   12000.0    13000.0       NaN   
        2            103  Charlie    Finance   18000.0    17000.0       NaN   
Q2      0            101    Alice         IT       NaN        NaN   16000.0   
        1            102      Bob         HR       NaN        NaN   13000.0   
        2            103  Charlie    Finance       NaN        NaN   19000.0   
        3            104    David         IT       NaN        NaN   22000.0   

             q2_target  
Quarter Row             
Q1      0          NaN  
        1          NaN  
        2          NaN  
Q2      0      15000.0  
        1      14000.0  
        2      18000.0  
        3      20000.0  

Inde

# 3. Horizontal Concatenation (Column-wise)
## Example 3.1: Basic Horizontal Concatenation

In [5]:
banner = "=" * 80
print("\n" + banner)
print("HORIZONTAL CONCATENATION (COLUMN-WISE)")
print(banner)

# Two tables describing the same four employees, in the same row order and
# with the same default 0..3 index, but carrying different attributes.
employee_personal = pd.DataFrame(
    [
        (101, 'Alice', 28, 'Masters'),
        (102, 'Bob', 35, 'Bachelors'),
        (103, 'Charlie', 42, 'PhD'),
        (104, 'David', 31, 'Masters'),
    ],
    columns=['employee_id', 'name', 'age', 'education'],
)

employee_professional = pd.DataFrame(
    [
        (101, 'Developer', 5, 'AWS,Python'),
        (102, 'Manager', 8, 'PMP,Scrum'),
        (103, 'Analyst', 12, 'CFA,CPA'),
        (104, 'Developer', 4, 'Azure,Java'),
    ],
    columns=['employee_id', 'position', 'years_exp', 'certifications'],
)

print("Employee Personal Info:", employee_personal, sep="\n")
print("\nEmployee Professional Info:", employee_professional, sep="\n")

# axis=1 places the frames side by side, aligning rows on the index. Columns
# are not de-duplicated, so employee_id shows up once per input.
employee_complete = pd.concat((employee_personal, employee_professional), axis=1)
print("\n1. Basic horizontal concatenation (axis=1):", employee_complete, sep="\n")
print("Note: Duplicate employee_id column")


HORIZONTAL CONCATENATION (COLUMN-WISE)
Employee Personal Info:
   employee_id     name  age  education
0          101    Alice   28    Masters
1          102      Bob   35  Bachelors
2          103  Charlie   42        PhD
3          104    David   31    Masters

Employee Professional Info:
   employee_id   position  years_exp certifications
0          101  Developer          5     AWS,Python
1          102    Manager          8      PMP,Scrum
2          103    Analyst         12        CFA,CPA
3          104  Developer          4     Azure,Java

1. Basic horizontal concatenation (axis=1):
   employee_id     name  age  education  employee_id   position  years_exp  \
0          101    Alice   28    Masters          101  Developer          5   
1          102      Bob   35  Bachelors          102    Manager          8   
2          103  Charlie   42        PhD          103    Analyst         12   
3          104    David   31    Masters          104  Developer          4   

  certifica

## Example 3.2: Horizontal Concatenation with Different Index

In [6]:
# Quarterly sales keyed by employee_id (used as the row index). Q2 contains
# one extra employee (104) that Q1 does not have.
q1_employee_ids = [101, 102, 103]
q2_employee_ids = [101, 102, 103, 104]

sales_q1 = pd.DataFrame(
    {
        'q1_sales': [15000, 12000, 18000],
        'q1_target': [14000, 13000, 17000],
    },
    index=q1_employee_ids,
)

sales_q2 = pd.DataFrame(
    {
        'q2_sales': [16000, 13000, 19000, 22000],
        'q2_target': [15000, 14000, 18000, 20000],
    },
    index=q2_employee_ids,
)

print("\n2. DataFrames with different indices:")
print("Sales Q1 (index: 101, 102, 103):", sales_q1, sep="\n")
print("\nSales Q2 (index: 101, 102, 103, 104):", sales_q2, sep="\n")

# Column-wise concat aligns on the index; with the default outer join the row
# for 104 is kept and its Q1 cells are NaN.
sales_combined = pd.concat((sales_q1, sales_q2), axis=1)
print("\nHorizontal concatenation (different indices - NaN for missing):", sales_combined, sep="\n")


2. DataFrames with different indices:
Sales Q1 (index: 101, 102, 103):
     q1_sales  q1_target
101     15000      14000
102     12000      13000
103     18000      17000

Sales Q2 (index: 101, 102, 103, 104):
     q2_sales  q2_target
101     16000      15000
102     13000      14000
103     19000      18000
104     22000      20000

Horizontal concatenation (different indices - NaN for missing):
     q1_sales  q1_target  q2_sales  q2_target
101   15000.0    14000.0     16000      15000
102   12000.0    13000.0     13000      14000
103   18000.0    17000.0     19000      18000
104       NaN        NaN     22000      20000


## Example 3.3: Horizontal Concatenation with join parameter

In [7]:
# `join` decides which row labels survive a column-wise concat.
print("\n3. Horizontal concatenation with join parameter:")

quarter_frames = [sales_q1, sales_q2]

# join='inner' keeps only the labels present in every input.
inner_join = pd.concat(quarter_frames, join='inner', axis=1)
print("Inner join (only common indices 101, 102, 103):", inner_join, sep="\n")

# join='outer' (the default) keeps the union of labels and pads with NaN.
outer_join = pd.concat(quarter_frames, join='outer', axis=1)
print("\nOuter join (all indices 101, 102, 103, 104):", outer_join, sep="\n")


3. Horizontal concatenation with join parameter:
Inner join (only common indices 101, 102, 103):
     q1_sales  q1_target  q2_sales  q2_target
101     15000      14000     16000      15000
102     12000      13000     13000      14000
103     18000      17000     19000      18000

Outer join (all indices 101, 102, 103, 104):
     q1_sales  q1_target  q2_sales  q2_target
101   15000.0    14000.0     16000      15000
102   12000.0    13000.0     13000      14000
103   18000.0    17000.0     19000      18000
104       NaN        NaN     22000      20000



# 4. Real-World Business Examples
## Example 4.1: Monthly Sales Reports

In [8]:
print("\n" + "="*80)
print("REAL-WORLD EXAMPLE: MONTHLY SALES REPORTS")
print("="*80)

# One table per month. The product line-up changes from month to month, so
# the three tables do not share the same set of product_id values.
jan_sales = pd.DataFrame({
    'product_id': ['P001', 'P002', 'P003'],
    'product_name': ['Laptop', 'Mouse', 'Keyboard'],
    'jan_units': [150, 300, 200],
    'jan_revenue': [150000, 9000, 16000]
})

feb_sales = pd.DataFrame({
    'product_id': ['P001', 'P002', 'P004'],  # P003 missing, P004 new
    'product_name': ['Laptop', 'Mouse', 'Monitor'],
    'feb_units': [180, 320, 90],
    'feb_revenue': [180000, 9600, 27000]
})

mar_sales = pd.DataFrame({
    'product_id': ['P001', 'P003', 'P004', 'P005'],  # P002 missing, P005 new
    'product_name': ['Laptop', 'Keyboard', 'Monitor', 'Headphones'],
    'mar_units': [200, 220, 100, 150],
    'mar_revenue': [200000, 17600, 30000, 22500]
})

print("January Sales:")
print(jan_sales)
print("\nFebruary Sales:")
print(feb_sales)
print("\nMarch Sales:")
print(mar_sales)

# Method 1: Horizontal concatenation (side-by-side months).
# Indexing every table by product_id lets concat align rows per product; the
# outer join keeps products that are missing from some months (NaN there).
monthly_comparison = pd.concat(
    [month_frame.set_index('product_id')
     for month_frame in (jan_sales, feb_sales, mar_sales)],
    axis=1, join='outer')
print("\n1. Monthly Comparison (horizontal concatenation):")
print(monthly_comparison)

# Method 2: Vertical concatenation (stacked data).
# Each table is tagged with its month before stacking so rows stay traceable.
MONTH_ORDER = ['January', 'February', 'March']
all_months = pd.concat([jan_sales.assign(month='January'),
                        feb_sales.assign(month='February'),
                        mar_sales.assign(month='March')],
                       ignore_index=True)
# Plain strings sort alphabetically (February < January < March). An ordered
# categorical makes sort_values follow calendar order instead.
all_months['month'] = pd.Categorical(all_months['month'],
                                     categories=MONTH_ORDER, ordered=True)
print("\n2. All Months Data (vertical concatenation):")
print(all_months.sort_values(['product_id', 'month']))


REAL-WORLD EXAMPLE: MONTHLY SALES REPORTS
January Sales:
  product_id product_name  jan_units  jan_revenue
0       P001       Laptop        150       150000
1       P002        Mouse        300         9000
2       P003     Keyboard        200        16000

February Sales:
  product_id product_name  feb_units  feb_revenue
0       P001       Laptop        180       180000
1       P002        Mouse        320         9600
2       P004      Monitor         90        27000

March Sales:
  product_id product_name  mar_units  mar_revenue
0       P001       Laptop        200       200000
1       P003     Keyboard        220        17600
2       P004      Monitor        100        30000
3       P005   Headphones        150        22500

1. Monthly Comparison (horizontal concatenation):
           product_name  jan_units  jan_revenue product_name  feb_units  \
product_id                                                                
P001             Laptop      150.0     150000.0       Laptop

## Example 4.2: Customer Data from Multiple Sources

In [9]:
print("\n" + "="*80)
print("REAL-WORLD EXAMPLE: CUSTOMER DATA INTEGRATION")
print("="*80)

# Data from different systems. CRM has one row per customer; orders and
# support tickets can have several rows per customer.
crm_data = pd.DataFrame({
    'customer_id': ['C001', 'C002', 'C003', 'C004'],
    'name': ['Alice Smith', 'Bob Johnson', 'Charlie Brown', 'Diana Prince'],
    'email': ['alice@email.com', 'bob@email.com', 'charlie@email.com', 'diana@email.com'],
    'phone': ['555-0101', '555-0102', '555-0103', '555-0104'],
    'signup_date': ['2023-01-15', '2023-02-20', '2023-03-10', '2023-04-05']
})

orders_data = pd.DataFrame({
    'customer_id': ['C001', 'C001', 'C002', 'C003', 'C004', 'C005'],  # C005 not in CRM
    'order_id': ['O1001', 'O1002', 'O1003', 'O1004', 'O1005', 'O1006'],
    'order_date': ['2023-05-10', '2023-06-15', '2023-05-22', '2023-06-01', '2023-06-20', '2023-06-25'],
    'order_amount': [150.50, 89.99, 299.99, 45.50, 599.99, 129.99],
    'product_category': ['Electronics', 'Books', 'Clothing', 'Home', 'Electronics', 'Sports']
})

support_data = pd.DataFrame({
    'customer_id': ['C001', 'C002', 'C002', 'C004'],
    'ticket_id': ['T001', 'T002', 'T003', 'T004'],
    'issue_date': ['2023-04-01', '2023-05-15', '2023-06-10', '2023-06-18'],
    'issue_type': ['Billing', 'Technical', 'Returns', 'Technical'],
    'status': ['Resolved', 'Open', 'Resolved', 'Open']
})

print("CRM Data (Customer Info):")
print(crm_data)
print("\nOrders Data:")
print(orders_data)
print("\nSupport Tickets Data:")
print(support_data)

# Strategy 1: Combine all data horizontally.
# Orders and tickets are first collapsed to one row per customer so that all
# three tables can be aligned on a unique customer_id index.
orders_summary = orders_data.groupby('customer_id').agg({
    'order_id': 'count',
    'order_amount': 'sum'
}).rename(columns={'order_id': 'total_orders', 'order_amount': 'total_spent'})

# sorted(...) makes the joined text deterministic: iterating a bare set of
# strings has hash-dependent order, which can change from run to run.
support_summary = support_data.groupby('customer_id').agg({
    'ticket_id': 'count',
    'issue_type': lambda x: ', '.join(sorted(set(x)))
}).rename(columns={'ticket_id': 'ticket_count', 'issue_type': 'issue_types'})

# Outer join keeps every customer seen in any source (e.g. C005 has orders
# but no CRM record), leaving NaN where a source has nothing for them.
customer_360 = pd.concat([crm_data.set_index('customer_id'),
                          orders_summary,
                          support_summary],
                         axis=1, join='outer')

print("\nCustomer 360 View (horizontal concatenation):")
print(customer_360)

# Strategy 2: Combine all data vertically (for data warehouse).
# Stack the customer_id column from each source, then keep the first
# occurrence of each id and renumber the rows.
all_customer_ids = pd.concat([
    crm_data[['customer_id']],
    orders_data[['customer_id']],
    support_data[['customer_id']]
]).drop_duplicates().reset_index(drop=True)

print("\nAll Unique Customer IDs (vertical concatenation):")
print(all_customer_ids)


REAL-WORLD EXAMPLE: CUSTOMER DATA INTEGRATION
CRM Data (Customer Info):
  customer_id           name              email     phone signup_date
0        C001    Alice Smith    alice@email.com  555-0101  2023-01-15
1        C002    Bob Johnson      bob@email.com  555-0102  2023-02-20
2        C003  Charlie Brown  charlie@email.com  555-0103  2023-03-10
3        C004   Diana Prince    diana@email.com  555-0104  2023-04-05

Orders Data:
  customer_id order_id  order_date  order_amount product_category
0        C001    O1001  2023-05-10        150.50      Electronics
1        C001    O1002  2023-06-15         89.99            Books
2        C002    O1003  2023-05-22        299.99         Clothing
3        C003    O1004  2023-06-01         45.50             Home
4        C004    O1005  2023-06-20        599.99      Electronics
5        C005    O1006  2023-06-25        129.99           Sports

Support Tickets Data:
  customer_id ticket_id  issue_date issue_type    status
0        C001      T0

## Example 4.3: Financial Data Consolidation

In [10]:
print("\n" + "="*80)
print("REAL-WORLD EXAMPLE: FINANCIAL REPORT CONSOLIDATION")
print("="*80)


def _melt_region(financials):
    """Reshape one regional table from wide to long format.

    The value columns are expected to be named ``<quarter>_<region>`` (for
    example ``q1_us``). Each becomes its own set of rows with the number in
    ``amount`` and the column name split into ``quarter`` and ``region``.

    Parameters
    ----------
    financials : pd.DataFrame
        Table with ``account_code``, ``account_name`` and value columns.

    Returns
    -------
    pd.DataFrame
        Columns: account_code, account_name, period_region, amount,
        quarter, region.
    """
    melted = financials.melt(id_vars=['account_code', 'account_name'],
                             var_name='period_region',
                             value_name='amount')
    # 'q1_us' -> ('q1', 'us'); expand=True returns one column per piece.
    melted[['quarter', 'region']] = melted['period_region'].str.split('_', expand=True)
    return melted


# Financial data from different regions
us_financials = pd.DataFrame({
    'account_code': ['REV-001', 'REV-002', 'COGS-001', 'COGS-002'],
    'account_name': ['Product Sales', 'Service Revenue', 'Materials', 'Labor'],
    'q1_us': [1000000, 250000, 400000, 300000],
    'q2_us': [1100000, 275000, 420000, 310000]
})

eu_financials = pd.DataFrame({
    'account_code': ['REV-001', 'REV-002', 'COGS-001', 'COGS-002'],
    'account_name': ['Product Sales', 'Service Revenue', 'Materials', 'Labor'],
    'q1_eu': [800000, 150000, 300000, 200000],
    'q2_eu': [850000, 160000, 310000, 210000]
})

asia_financials = pd.DataFrame({
    'account_code': ['REV-001', 'REV-002', 'COGS-001', 'COGS-003'],  # Note: COGS-003 different
    'account_name': ['Product Sales', 'Service Revenue', 'Materials', 'Shipping'],
    'q1_asia': [600000, 100000, 250000, 50000],
    'q2_asia': [650000, 110000, 260000, 55000]
})

print("US Financials:")
print(us_financials)
print("\nEU Financials:")
print(eu_financials)
print("\nAsia Financials:")
print(asia_financials)

# Method 1: Horizontal concatenation by region.
# Rows are aligned on account_code; accounts that a region does not report
# (COGS-002 for Asia, COGS-003 for US/EU) come out as NaN for that region.
regional_comparison = pd.concat([
    us_financials.set_index('account_code'),
    eu_financials.set_index('account_code'),
    asia_financials.set_index('account_code')
], axis=1)

print("\n1. Regional Financial Comparison (horizontal):")
print(regional_comparison)

# Method 2: Vertical concatenation for consolidated P&L.
# Every regional table goes through the same wide-to-long reshape, so the
# results share one schema and can simply be stacked.
us_melted = _melt_region(us_financials)
eu_melted = _melt_region(eu_financials)
asia_melted = _melt_region(asia_financials)

# Now concatenate vertically
consolidated_financials = pd.concat([us_melted, eu_melted, asia_melted], ignore_index=True)

print("\n2. Consolidated Financials (vertical - melted format):")
print(consolidated_financials.head(10))

# Pivot back to one row per account with (quarter, region) columns;
# combinations with no data are shown as 0 via fill_value.
pivot_table = consolidated_financials.pivot_table(
    index=['account_code', 'account_name'],
    columns=['quarter', 'region'],
    values='amount',
    aggfunc='sum',
    fill_value=0
)

print("\n3. Pivot Table from Consolidated Data:")
print(pivot_table)


REAL-WORLD EXAMPLE: FINANCIAL REPORT CONSOLIDATION
US Financials:
  account_code     account_name    q1_us    q2_us
0      REV-001    Product Sales  1000000  1100000
1      REV-002  Service Revenue   250000   275000
2     COGS-001        Materials   400000   420000
3     COGS-002            Labor   300000   310000

EU Financials:
  account_code     account_name   q1_eu   q2_eu
0      REV-001    Product Sales  800000  850000
1      REV-002  Service Revenue  150000  160000
2     COGS-001        Materials  300000  310000
3     COGS-002            Labor  200000  210000

Asia Financials:
  account_code     account_name  q1_asia  q2_asia
0      REV-001    Product Sales   600000   650000
1      REV-002  Service Revenue   100000   110000
2     COGS-001        Materials   250000   260000
3     COGS-003         Shipping    50000    55000

1. Regional Financial Comparison (horizontal):
                 account_name      q1_us      q2_us     account_name  \
account_code                           

# 5. Advanced Concatenation Techniques
## Example 5.1: Concatenation with Sorting

In [11]:
rule = "=" * 80
print("\n" + rule)
print("ADVANCED: CONCATENATION WITH SORTING")
print(rule)

# Two small frames whose row labels are deliberately out of order.
df1 = pd.DataFrame(
    [[3, 6], [1, 4], [2, 5]],
    index=['c', 'a', 'b'],
    columns=['A', 'B'],
)

df2 = pd.DataFrame(
    [[9, 12], [7, 10], [8, 11]],
    index=['f', 'd', 'e'],
    columns=['A', 'B'],
)

print("DataFrame 1 (unsorted):", df1, sep="\n")
print("\nDataFrame 2 (unsorted):", df2, sep="\n")

# concat keeps the rows in input order; sort the labels afterwards.
stacked = pd.concat([df1, df2])
concatenated_sorted = stacked.sort_index()
print("\nConcatenated and sorted by index:", concatenated_sorted, sep="\n")

# Renumber the rows on concat, then order by the values in column A. The
# printed index shows where each row sat before sorting.
renumbered = pd.concat([df1, df2], ignore_index=True)
concatenated_sorted_col = renumbered.sort_values(by='A')
print("\nConcatenated and sorted by column A:", concatenated_sorted_col, sep="\n")


ADVANCED: CONCATENATION WITH SORTING
DataFrame 1 (unsorted):
   A  B
c  3  6
a  1  4
b  2  5

DataFrame 2 (unsorted):
   A   B
f  9  12
d  7  10
e  8  11

Concatenated and sorted by index:
   A   B
a  1   4
b  2   5
c  3   6
d  7  10
e  8  11
f  9  12

Concatenated and sorted by column A:
   A   B
1  1   4
2  2   5
0  3   6
4  7  10
5  8  11
3  9  12


## Example 5.2: Concatenation with Custom Index

In [12]:
divider = "=" * 80
print("\n" + divider)
print("ADVANCED: CONCATENATION WITH CUSTOM INDEX HANDLING")
print(divider)

# Yearly sales indexed by product; the two years cover different products.
products_2023 = pd.DataFrame(
    {'sales_2023': [100, 200, 150]},
    index=pd.Index(['A', 'B', 'C'], name='product'),
)

products_2024 = pd.DataFrame(
    {'sales_2024': [220, 170, 90]},
    index=pd.Index(['B', 'C', 'D'], name='product'),
)

print("Products 2023:", products_2023, sep="\n")
print("\nProducts 2024:", products_2024, sep="\n")

# Method 1: Reindex before concatenation.
# Give both frames the same full set of product labels first; products a
# year does not have become NaN rows, and the concat then lines up exactly.
all_products = products_2023.index.union(products_2024.index)

products_2023_reindexed, products_2024_reindexed = (
    yearly.reindex(all_products) for yearly in (products_2023, products_2024)
)

combined_reindexed = pd.concat(
    [products_2023_reindexed, products_2024_reindexed],
    axis=1,
)
print("\n1. Combined after reindexing:", combined_reindexed, sep="\n")


ADVANCED: CONCATENATION WITH CUSTOM INDEX HANDLING
Products 2023:
         sales_2023
product            
A               100
B               200
C               150

Products 2024:
         sales_2024
product            
B               220
C               170
D                90

1. Combined after reindexing:
         sales_2023  sales_2024
product                        
A             100.0         NaN
B             200.0       220.0
C             150.0       170.0
D               NaN        90.0


## Example 5.3: Concatenation with verify_integrity

In [13]:
separator = "=" * 80
print("\n" + separator)
print("ADVANCED: CONCATENATION WITH INTEGRITY CHECKING")
print(separator)

# Two frames whose row labels collide: 'c' is in the first frame and appears
# twice in the second.
df_duplicate1 = pd.DataFrame([1, 2, 3], index=['a', 'b', 'c'], columns=['value'])

df_duplicate2 = pd.DataFrame(
    [4, 5, 6],
    index=['c', 'd', 'c'],  # Note: duplicate 'c' index
    columns=['value'],
)

print("DataFrame 1:", df_duplicate1, sep="\n")
print("\nDataFrame 2 (with duplicate index 'c'):", df_duplicate2, sep="\n")

# Default behaviour: repeated labels are accepted without complaint.
combined_default = pd.concat([df_duplicate1, df_duplicate2])
print("\n1. Concatenated without verify_integrity (allows duplicates):", combined_default, sep="\n")

# verify_integrity=True makes concat raise ValueError when the resulting
# index would contain repeated labels; the message is printed as the demo.
try:
    combined_integrity = pd.concat(
        [df_duplicate1, df_duplicate2],
        verify_integrity=True,
    )
    print(combined_integrity)
except ValueError as overlap_error:
    print("\n2. With verify_integrity=True: " + str(overlap_error))


ADVANCED: CONCATENATION WITH INTEGRITY CHECKING
DataFrame 1:
   value
a      1
b      2
c      3

DataFrame 2 (with duplicate index 'c'):
   value
c      4
d      5
c      6

1. Concatenated without verify_integrity (allows duplicates):
   value
a      1
b      2
c      3
c      4
d      5
c      6

2. With verify_integrity=True: Indexes have overlapping values: Index(['c'], dtype='object')


# 6. Performance Considerations

In [14]:
print("\n" + "="*80)
print("PERFORMANCE CONSIDERATIONS")
print("="*80)

import time


def _timed(operation):
    """Run ``operation()`` once and return ``(result, elapsed_seconds)``.

    Uses ``time.perf_counter()``, a monotonic high-resolution clock intended
    for measuring short durations. ``time.time()`` reports wall-clock time,
    which can be coarse and can jump if the system clock is adjusted.
    """
    started = time.perf_counter()
    result = operation()
    return result, time.perf_counter() - started


# Create large DataFrames for performance testing.
# The seed is fixed so the generated data is the same on every run.
np.random.seed(42)
n_rows = 100000

large_df1 = pd.DataFrame({
    'id': range(n_rows),
    'value1': np.random.randn(n_rows),
    'category1': np.random.choice(['A', 'B', 'C'], n_rows)
})

# ids continue where large_df1 stops, so the two frames stack without overlap.
large_df2 = pd.DataFrame({
    'id': range(n_rows, n_rows * 2),
    'value2': np.random.randn(n_rows),
    'category2': np.random.choice(['X', 'Y', 'Z'], n_rows)
})

# Same ids as large_df1, so the two frames can be combined side by side.
large_df3 = pd.DataFrame({
    'id': range(n_rows),
    'value3': np.random.randn(n_rows),
    'category3': np.random.choice(['M', 'N', 'O'], n_rows)
})

print(f"Large DataFrames created: {n_rows:,} rows each")

# Test vertical concatenation performance
vertical_result, vertical_time = _timed(
    lambda: pd.concat([large_df1, large_df2], ignore_index=True)
)
print(f"\n1. Vertical concatenation time: {vertical_time:.4f} seconds")
print(f"   Result shape: {vertical_result.shape}")

# Test horizontal concatenation performance.
# The set_index calls are part of the measured work, as in the original cell.
horizontal_result, horizontal_time = _timed(
    lambda: pd.concat([large_df1.set_index('id'),
                       large_df3.set_index('id')], axis=1)
)
print(f"\n2. Horizontal concatenation time: {horizontal_time:.4f} seconds")
print(f"   Result shape: {horizontal_result.shape}")

# Alternative: Using merge for horizontal combination
merge_result, merge_time = _timed(
    lambda: pd.merge(large_df1, large_df3, on='id')
)
print(f"\n3. Merge (alternative to horizontal concat): {merge_time:.4f} seconds")
print(f"   Result shape: {merge_result.shape}")


PERFORMANCE CONSIDERATIONS
Large DataFrames created: 100,000 rows each

1. Vertical concatenation time: 0.0284 seconds
   Result shape: (200000, 5)

2. Horizontal concatenation time: 0.0104 seconds
   Result shape: (100000, 4)

3. Merge (alternative to horizontal concat): 0.0103 seconds
   Result shape: (100000, 5)


# 7. Best Practices and Patterns

In [15]:
heavy_rule = "=" * 80
print("\n" + heavy_rule)
print("BEST PRACTICES AND COMMON PATTERNS")
print(heavy_rule)

# Each entry is (title, code snippet). The snippets are display-only text:
# they are printed, never executed, so the names inside them need not exist.
patterns = [
    (
        "Data Collection Pattern",
        "\n".join([
            "# Collect data from multiple sources",
            "daily_data = []",
            "for file in daily_files:",
            "    df = pd.read_csv(file)",
            "    daily_data.append(df)",
            "# Concatenate all at once",
            "monthly_data = pd.concat(daily_data, ignore_index=True)",
        ]),
    ),
    (
        "Multi-Region Reporting Pattern",
        "\n".join([
            "# Combine regional reports",
            "regions = ['us', 'eu', 'asia']",
            "dfs = []",
            "for region in regions:",
            "    df = pd.read_excel(f'{region}_sales.xlsx')",
            "    df['region'] = region  # Add identifier",
            "    dfs.append(df)",
            "global_sales = pd.concat(dfs, ignore_index=True)",
        ]),
    ),
    (
        "Time Series Extension Pattern",
        "\n".join([
            "# Extend time series with new data",
            "historical = pd.read_csv('historical_data.csv')",
            "new_data = pd.read_csv('new_data.csv')",
            "updated_series = pd.concat([historical, new_data])",
            "updated_series = updated_series.sort_values('date')",
        ]),
    ),
    (
        "Feature Engineering Pattern",
        "\n".join([
            "# Combine features from different sources",
            "customer_features = pd.concat([",
            "    demographics.set_index('customer_id'),",
            "    transaction_stats.set_index('customer_id'),",
            "    behavioral_data.set_index('customer_id')",
            "], axis=1, join='inner')  # Only customers with all data",
        ]),
    ),
]

# Print every pattern as: blank line, "Title:", a dashed underline as long as
# the title, then the snippet.
for pattern_title, snippet in patterns:
    print("\n" + pattern_title + ":")
    print(len(pattern_title) * "-")
    print(snippet)


BEST PRACTICES AND COMMON PATTERNS

Data Collection Pattern:
-----------------------
# Collect data from multiple sources
daily_data = []
for file in daily_files:
    df = pd.read_csv(file)
    daily_data.append(df)
# Concatenate all at once
monthly_data = pd.concat(daily_data, ignore_index=True)

Multi-Region Reporting Pattern:
------------------------------
# Combine regional reports
regions = ['us', 'eu', 'asia']
dfs = []
for region in regions:
    df = pd.read_excel(f'{region}_sales.xlsx')
    df['region'] = region  # Add identifier
    dfs.append(df)
global_sales = pd.concat(dfs, ignore_index=True)

Time Series Extension Pattern:
-----------------------------
# Extend time series with new data
historical = pd.read_csv('historical_data.csv')
new_data = pd.read_csv('new_data.csv')
updated_series = pd.concat([historical, new_data])
updated_series = updated_series.sort_values('date')

Feature Engineering Pattern:
---------------------------
# Combine features from different sources
c

# 8. Quick Reference Cheat Sheet

In [16]:
print("\n" + "="*80)
print("CONCATENATION CHEAT SHEET")
print("="*80)

# Plain-text summary printed verbatim. The snippets inside are illustrative
# only and are never executed.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# sheet must not present it as a still-working alternative to pd.concat.
cheat_sheet = """
BASIC SYNTAX:
------------
pd.concat([df1, df2])                    # Vertical (default)
pd.concat([df1, df2], axis=0)            # Vertical (explicit)
pd.concat([df1, df2], axis=1)            # Horizontal

KEY PARAMETERS:
--------------
axis=0/1           # 0: vertical, 1: horizontal
ignore_index=True  # Reset index after concatenation
keys=['a', 'b']    # Create multi-level index
names=['source']   # Name the index levels
join='outer'       # 'outer' (default) or 'inner'
sort=False         # Sort non-concatenation axis

VERTICAL CONCATENATION USE CASES:
--------------------------------
1. Appending new rows to existing data
2. Combining similar datasets from different sources
3. Stacking time periods (months, quarters, years)
4. Aggregating data from multiple files

HORIZONTAL CONCATENATION USE CASES:
----------------------------------
1. Combining different features about same entities
2. Merging data from different systems (CRM + ERP)
3. Adding calculated columns to existing data
4. Creating wide-format data for analysis

ALTERNATIVES TO CONCAT:
---------------------
# For vertical concatenation:
df1.append(df2)           # Removed in pandas 2.0 (deprecated since 1.4)
pd.concat([df1, df2])     # Preferred method

# For horizontal concatenation:
pd.merge(df1, df2, on='key')    # SQL-like join
df1.join(df2, how='inner')      # Join on index
pd.concat([df1, df2], axis=1)   # Simple column binding

PERFORMANCE TIPS:
---------------
1. Use ignore_index=True for vertical concat to avoid index issues
2. Pre-sort data before horizontal concat if joining on index
3. Use merge() instead of concat() for complex horizontal combinations
4. Concatenate in batches for very large datasets
5. Use keys parameter to track data sources

COMMON PITFALLS:
--------------
1. Duplicate indices causing unexpected results
2. Misaligned columns creating NaN values
3. Memory issues with very large concatenations
4. Forgetting to reset index after vertical concatenation
5. Incorrect axis parameter (0 vs 1)
"""

print(cheat_sheet)


CONCATENATION CHEAT SHEET

BASIC SYNTAX:
------------
pd.concat([df1, df2])                    # Vertical (default)
pd.concat([df1, df2], axis=0)            # Vertical (explicit)
pd.concat([df1, df2], axis=1)            # Horizontal

KEY PARAMETERS:
--------------
axis=0/1           # 0: vertical, 1: horizontal
ignore_index=True  # Reset index after concatenation
keys=['a', 'b']    # Create multi-level index
names=['source']   # Name the index levels
join='outer'       # 'outer' (default) or 'inner'
sort=False         # Sort non-concatenation axis

VERTICAL CONCATENATION USE CASES:
--------------------------------
1. Appending new rows to existing data
2. Combining similar datasets from different sources
3. Stacking time periods (months, quarters, years)
4. Aggregating data from multiple files

HORIZONTAL CONCATENATION USE CASES:
----------------------------------
1. Combining different features about same entities
2. Merging data from different systems (CRM + ERP)
3. Adding calculate