### Polars Version

In [17]:
import polars as pl
import hashlib

# Function to convert text to SHA-256 hash
def text_to_sha256(text):
    # Ensure the input is converted to string and encoded to bytes
    return hashlib.sha256(str(text).encode('utf-8')).hexdigest()

# Initialize the DataFrame
df = pl.DataFrame({
    'unique_id': [1, 2, 3],
    'value': ['A', 'B', 'C']
})

# Apply the hashing function to the 'unique_id' column
df_hashed = df.with_columns(
    df['unique_id'].map_elements(text_to_sha256, return_dtype=pl.Utf8).alias('unique_id_hashed')
)

# Print the resulting DataFrame
print("DataFrame with Hashed Unique IDs:")
print(df_hashed)

DataFrame with Hashed Unique IDs:
shape: (3, 3)
┌───────────┬───────┬─────────────────────────────────┐
│ unique_id ┆ value ┆ unique_id_hashed                │
│ ---       ┆ ---   ┆ ---                             │
│ i64       ┆ str   ┆ str                             │
╞═══════════╪═══════╪═════════════════════════════════╡
│ 1         ┆ A     ┆ 6b86b273ff34fce19d6b804eff5a3f… │
│ 2         ┆ B     ┆ d4735e3a265e16eee03f59718b9b5d… │
│ 3         ┆ C     ┆ 4e07408562bedb8b60ce05c1decfe3… │
└───────────┴───────┴─────────────────────────────────┘


In [10]:
import polars as pl

# Initialize sample DataFrames
df = pl.DataFrame({
    'unique_id': [1, 2, 3,5],
    'value': ['A', 'B', 'C', 'E']
})

df_new = pl.DataFrame({
    'unique_id': [1, 2, 3, 4,6],
    'value': ['A','B_updated', 'C_updated', 'D_new', 'F_new']
})

# Use 'full' join to merge DataFrames on 'unique_id'
merged_df = df.join(df_new, on='unique_id', how='full', suffix='_new')

df = merged_df.with_columns(
    pl.when(pl.col("unique_id_new").is_null())
    .then(pl.col("unique_id"))
    .otherwise(pl.col("unique_id_new"))
    .alias('unique_id'),

    pl.when(pl.col("value_new").is_null())
    .then(pl.col("value"))
    .otherwise(pl.col("value_new"))
    .alias('value')
).select(['unique_id', 'value'])


# Print the merged DataFrame
print("Merged DataFrame:")
print(df)

# updated_rows_filter = (merged_df['value'] != merged_df['value_new']) & merged_df['value_new'].is_not_null()
# updated_rows = updated_rows_filter.sum()
# print("Number of updated rows:", updated_rows)
# inserted_rows = df_new.select('unique_id').join(df, on='unique_id', how='anti').shape[0]
# print(f"Number of inserted rows: {inserted_rows}")

Merged DataFrame:
shape: (6, 2)
┌───────────┬───────────┐
│ unique_id ┆ value     │
│ ---       ┆ ---       │
│ i64       ┆ str       │
╞═══════════╪═══════════╡
│ 1         ┆ A         │
│ 2         ┆ B_updated │
│ 3         ┆ C_updated │
│ 4         ┆ D_new     │
│ 6         ┆ F_new     │
│ 5         ┆ E         │
└───────────┴───────────┘


In [11]:
merged_df = df.join(df_new, on='unique_id', how='full', suffix='_new')
print(merged_df)

shape: (6, 4)
┌───────────┬───────────┬───────────────┬───────────┐
│ unique_id ┆ value     ┆ unique_id_new ┆ value_new │
│ ---       ┆ ---       ┆ ---           ┆ ---       │
│ i64       ┆ str       ┆ i64           ┆ str       │
╞═══════════╪═══════════╪═══════════════╪═══════════╡
│ 1         ┆ A         ┆ 1             ┆ A         │
│ 2         ┆ B_updated ┆ 2             ┆ B_updated │
│ 3         ┆ C_updated ┆ 3             ┆ C_updated │
│ 4         ┆ D_new     ┆ 4             ┆ D_new     │
│ 6         ┆ F_new     ┆ 6             ┆ F_new     │
│ 5         ┆ E         ┆ null          ┆ null      │
└───────────┴───────────┴───────────────┴───────────┘


In [43]:
import pandas as pd

# Initialize the DataFrames
df = pd.DataFrame({
    'unique_id': [1, 2, 3],
    'value': ['A', 'B', 'C']
})

df_new = pd.DataFrame({
    'unique_id': [2, 3, 4],
    'value': ['B_updated', 'C_updated', 'D']
})

# Set 'unique_id' as the index for both DataFrames
df.set_index('unique_id', inplace=True)
df_new.set_index('unique_id', inplace=True)

# Copy the original DataFrame to use for the updated row count
original_df = df.copy()

# Update the DataFrame (UPSERT existing rows)
df.update(df_new)

# Identify updated rows by comparing the values
updated_rows = df_new.index.intersection(original_df.index).size

# Combine the DataFrames (this will add new rows)
df_combined = df.combine_first(df_new)

# Count the number of inserted rows by finding new indices
inserted_rows = df_combined.index.difference(original_df.index).size

# Reset index to make 'unique_id' a column again
df_combined.reset_index(inplace=True)

# Print results
# print(f"Updated rows: {updated_rows}")
print(f"{df_new.index.intersection(original_df.index.to_list())=}")
print(f"Inserted rows: {inserted_rows}")
print("Final DataFrame:")
print(df_combined)

df_new.index.intersection(original_df.index.to_list())=Index([2, 3], dtype='int64', name='unique_id')
Inserted rows: 1
Final DataFrame:
   unique_id      value
0          1          A
1          2  B_updated
2          3  C_updated
3          4          D


In [2]:
import pandas as pd

# Initialize the DataFrames
df = pd.DataFrame({
    'unique_id': [1, 2, 3],
    'value': ['A', 'B', 'C']
})

df_new = pd.DataFrame({
    'unique_id': [2, 3, 4],
    'value': ['B_updated', 'C_updated', 'D']
})

# Set 'unique_id' as the index for both DataFrames
df.set_index('unique_id', inplace=True)
df_new.set_index('unique_id', inplace=True)

# Copy the original DataFrame to use for the updated row count
original_df = df.copy()

# Update the DataFrame (UPSERT existing rows)
df.update(df_new)

# Identify updated rows by comparing the values
# updated_rows = (df.loc[df_new.index.intersection(original_df.index)] != original_df).any(axis=1).sum()

# Combine the DataFrames (this will add new rows)
df_combined = df.combine_first(df_new)

# Count the number of inserted rows by finding new indices
inserted_rows = df_combined.index.difference(original_df.index).size

# Reset index to make 'unique_id' a column again
df_combined.reset_index(inplace=True)

# Print results
# print(f"Updated rows: {updated_rows}")
print(f"Inserted rows: {inserted_rows}")
print("Final DataFrame:")
print(df_combined)

Inserted rows: 1
Final DataFrame:
   unique_id      value
0          1          A
1          2  B_updated
2          3  C_updated
3          4          D


In [4]:
import pandas as pd

# Initialize sample DataFrames
df = pd.DataFrame({
    'unique_id': [1, 2, 3],
    'value': ['A', 'B', 'C']
})

df_new = pd.DataFrame({
    'unique_id': [2, 3, 4],
    'value': ['B_updated', 'C_updated', 'D']
})

# Perform UPSERT operation
df.set_index('unique_id', inplace=True)
df_new.set_index('unique_id', inplace=True)

# Keep a copy of the original DataFrame for comparison
original_df = df.copy()

# Update existing rows
df.update(df_new)

# Count the updated rows
updated_rows = (df != original_df).any(axis=1).sum()

# Append new rows
df = df.combine_first(df_new)

# Reset index to ensure unique_id is a column
df.reset_index(inplace=True)

# Print the number of updated rows
print("Number of updated rows:", updated_rows)

# Print the final DataFrame
print("Upserted DataFrame:")
print(df)

Number of updated rows: 2
Upserted DataFrame:
   unique_id      value
0          1          A
1          2  B_updated
2          3  C_updated
3          4          D


In [8]:
import polars as pl

# Initialize sample DataFrames
df = pl.DataFrame({
    'unique_id': [1, 2, 3],
    'value': ['A', 'B', 'C']
})

df_new = pl.DataFrame({
    'unique_id': [2, 3, 4],
    'value': ['B_updated', 'C_updated', 'D']
})

# Perform a full join on 'unique_id' to identify and update existing rows
joined_df = df.join(df_new, on='unique_id', how='full', suffix='_new')

# Determine updated rows where the 'value' in df is different from 'df_new'
updated_rows_filter = (joined_df['value'] != joined_df['value_new']) & joined_df['value_new'].is_not_null()
updated_rows = updated_rows_filter.sum()

# Update values where available
df_updated = joined_df.with_columns([
    pl.when(joined_df['value_new'].is_null())
    .then(joined_df['value'])
    .otherwise(joined_df['value_new'])
    .alias('final_value')
])

# Select columns and resolve renaming
df_upserted = df_updated.select(['unique_id', 'final_value']).rename({'final_value': 'value'})

# Count the number of inserted rows
# Inserted rows are determined by rows in df_new not appearing in the original df
inserted_rows = df_new.select('unique_id').join(df, on='unique_id', how='anti').shape[0]

# Print the counts and the final DataFrame
print(f"Number of updated rows: {updated_rows}")
print(f"Number of inserted rows: {inserted_rows}")
print("Upserted DataFrame:")
print(df_upserted)

Number of updated rows: 2
Number of inserted rows: 1
Upserted DataFrame:
shape: (4, 2)
┌───────────┬───────────┐
│ unique_id ┆ value     │
│ ---       ┆ ---       │
│ i64       ┆ str       │
╞═══════════╪═══════════╡
│ 2         ┆ B_updated │
│ 3         ┆ C_updated │
│ null      ┆ D         │
│ 1         ┆ A         │
└───────────┴───────────┘


In [9]:
import polars as pl

# Initialize sample DataFrames
df = pl.DataFrame({
    'unique_id': [1, 2, 3],
    'value': ['A', 'B', 'C']
})

df_new = pl.DataFrame({
    'unique_id': [2, 3, 4],
    'value': ['B_updated', 'C_updated', 'D']
})

# Perform a full join on 'unique_id' to identify and update existing rows
joined_df = df.join(df_new, on='unique_id', how='full', suffix='_new')

# Update values where available
df_updated = joined_df.with_columns([
    pl.when(joined_df['value_new'].is_null())
    .then(joined_df['value'])
    .otherwise(joined_df['value_new'])
    .alias('final_value')
])

# Select the resultant columns and correct the index
df_upserted = df_updated.select(['unique_id', 'final_value']).rename({'final_value': 'value'})

# Set 'unique_id' as a column instead of allowing it to become an index by avoiding any drop errors
df_upserted = df_upserted.filter(df_upserted['unique_id'].is_not_null())

# Determine updated rows where the 'value' in df is different from 'df_new'
updated_rows_filter = (joined_df['value'] != joined_df['value_new']) & joined_df['value_new'].is_not_null()
updated_rows = updated_rows_filter.sum()

# Count the number of inserted rows
inserted_rows = df_new.select('unique_id').join(df, on='unique_id', how='anti').shape[0]

# Print the counts and the final DataFrame
print(f"Number of updated rows: {updated_rows}")
print(f"Number of inserted rows: {inserted_rows}")
print("Upserted DataFrame:")
print(df_upserted)

Number of updated rows: 2
Number of inserted rows: 1
Upserted DataFrame:
shape: (3, 2)
┌───────────┬───────────┐
│ unique_id ┆ value     │
│ ---       ┆ ---       │
│ i64       ┆ str       │
╞═══════════╪═══════════╡
│ 2         ┆ B_updated │
│ 3         ┆ C_updated │
│ 1         ┆ A         │
└───────────┴───────────┘


In [10]:
import polars as pl

# Initialize sample DataFrames
df = pl.DataFrame({
    'unique_id': [1, 2, 3],
    'value': ['A', 'B', 'C']
})

df_new = pl.DataFrame({
    'unique_id': [2, 3, 4],
    'value': ['B_updated', 'C_updated', 'D']
})

# Perform upsert operation in a more concise way
df_upserted = (
    df.join(
        df_new,
        on='unique_id',
        how='outer'
    ).with_columns(
        pl.coalesce('value_right', 'value').alias('value')
    ).select(['unique_id', 'value'])
)

# Calculate metrics
updated_rows = (
    df.join(df_new, on='unique_id')
    .filter(pl.col('value') != pl.col('value_right'))
    .shape[0]
)

inserted_rows = df_new.join(df, on='unique_id', how='anti').shape[0]

# Print results
print(f"Number of updated rows: {updated_rows}")
print(f"Number of inserted rows: {inserted_rows}")
print("Upserted DataFrame:")
print(df_upserted)

Number of updated rows: 2
Number of inserted rows: 1
Upserted DataFrame:
shape: (4, 2)
┌───────────┬───────────┐
│ unique_id ┆ value     │
│ ---       ┆ ---       │
│ i64       ┆ str       │
╞═══════════╪═══════════╡
│ 2         ┆ B_updated │
│ 3         ┆ C_updated │
│ null      ┆ D         │
│ 1         ┆ A         │
└───────────┴───────────┘


  df.join(


In [11]:
import polars as pl

# Initialize sample DataFrames
df = pl.DataFrame({
    'unique_id': [1, 2, 3],
    'value': ['A', 'B', 'C']
})

df_new = pl.DataFrame({
    'unique_id': [2, 3, 4],
    'value': ['B_updated', 'C_updated', 'D']
})

# Perform upsert operation using how='full'
df_upserted = (
    df.join(
        df_new,
        on='unique_id',
        how='full'
    ).with_columns(
        pl.coalesce('value_right', 'value').alias('value')
    ).select(['unique_id', 'value'])
)

# Calculate metrics
updated_rows = (
    df.join(df_new, on='unique_id')
    .filter(pl.col('value') != pl.col('value_right'))
    .shape[0]
)

inserted_rows = df_new.join(df, on='unique_id', how='anti').shape[0]

# Print results
print(f"Number of updated rows: {updated_rows}")
print(f"Number of inserted rows: {inserted_rows}")
print("Upserted DataFrame:")
print(df_upserted)


Number of updated rows: 2
Number of inserted rows: 1
Upserted DataFrame:
shape: (4, 2)
┌───────────┬───────────┐
│ unique_id ┆ value     │
│ ---       ┆ ---       │
│ i64       ┆ str       │
╞═══════════╪═══════════╡
│ 2         ┆ B_updated │
│ 3         ┆ C_updated │
│ null      ┆ D         │
│ 1         ┆ A         │
└───────────┴───────────┘


In [12]:
import polars as pl

# Initialize sample DataFrames
df = pl.DataFrame({
    'unique_id': [1, 2, 3],
    'value': ['A', 'B', 'C']
})

df_new = pl.DataFrame({
    'unique_id': [2, 3, 4],
    'value': ['B_updated', 'C_updated', 'D']
})

# Perform upsert operation with proper indexing
df_upserted = (
    df.join(
        df_new,
        on='unique_id',
        how='full'
    ).with_columns(
        pl.coalesce('value_right', 'value').alias('value')
    ).select(['unique_id', 'value'])
    .sort('unique_id')  # Add sorting to maintain index order
)

# Calculate metrics
updated_rows = (
    df.join(df_new, on='unique_id')
    .filter(pl.col('value') != pl.col('value_right'))
    .shape[0]
)

inserted_rows = df_new.join(df, on='unique_id', how='anti').shape[0]

# Print results
print(f"Number of updated rows: {updated_rows}")
print(f"Number of inserted rows: {inserted_rows}")
print("Upserted DataFrame:")
print(df_upserted)


Number of updated rows: 2
Number of inserted rows: 1
Upserted DataFrame:
shape: (4, 2)
┌───────────┬───────────┐
│ unique_id ┆ value     │
│ ---       ┆ ---       │
│ i64       ┆ str       │
╞═══════════╪═══════════╡
│ null      ┆ D         │
│ 1         ┆ A         │
│ 2         ┆ B_updated │
│ 3         ┆ C_updated │
└───────────┴───────────┘


### [Upsert and Merge with Delta Lake Tables in Python Polars](https://stuffbyyuki.com/upsert-and-merge-with-delta-lake-tables-in-python-polars/)

In [14]:
import polars as pl

df = pl.DataFrame(
    {
        'key': [1, 2, 3],
        'letter': ['a', 'b', 'c'],
        'value': [100, 200, 300]
    }
)
print(df)

shape: (3, 3)
┌─────┬────────┬───────┐
│ key ┆ letter ┆ value │
│ --- ┆ ---    ┆ ---   │
│ i64 ┆ str    ┆ i64   │
╞═════╪════════╪═══════╡
│ 1   ┆ a      ┆ 100   │
│ 2   ┆ b      ┆ 200   │
│ 3   ┆ c      ┆ 300   │
└─────┴────────┴───────┘


In [15]:
output_table_path = './my_delta_lake_table'

df.write_delta(output_table_path, mode='overwrite')
print('The target Delta Lake table output:\n', pl.read_delta(output_table_path))


ModuleNotFoundError: deltalake is not installed

Please run: pip install deltalake

In [16]:
df_col_updated_and_row_deleted = (
    df
    .with_columns(
        pl.when(pl.col('letter')=='c')  # update a column 
        .then(pl.lit('d'))
        .otherwise(pl.col('letter'))  
        .alias('letter')
    )
    .filter(pl.col('key') != 1)  # delete a row 
)

df_with_changes = (
    pl.concat(
        [
            df_col_updated_and_row_deleted,
            pl.DataFrame({'key': 4, 'letter': 'd', 'value': 400})  # a new row
        ],
        how='vertical'
    )
)
print(df_with_changes)

shape: (3, 3)
┌─────┬────────┬───────┐
│ key ┆ letter ┆ value │
│ --- ┆ ---    ┆ ---   │
│ i64 ┆ str    ┆ i64   │
╞═════╪════════╪═══════╡
│ 2   ┆ b      ┆ 200   │
│ 3   ┆ d      ┆ 300   │
│ 4   ┆ d      ┆ 400   │
└─────┴────────┴───────┘


In [17]:
(
    df_with_changes
    .write_delta(
        output_table_path,
        mode='merge',
        delta_merge_options={
            'predicate': 'source.key = target.key',
            'source_alias': 'source',
            'target_alias': 'target',
        },
    )
    .when_matched_update_all()
    .when_not_matched_insert_all()
    .when_not_matched_by_source_delete()
    .execute()
)  

ModuleNotFoundError: deltalake is not installed

Please run: pip install deltalake