In [None]:
// filepath: /home/utku/Frostbyte/examples/diff_functionality.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Frostbyte Diff Functionality\n",
    "\n",
    "This notebook demonstrates how to use the improved diff functionality in Frostbyte.\n",
    "\n",
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "import os\n",
    "import shutil\n",
    "import tempfile\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Import the direct diff functionality\n",
    "from frostbyte.utils.diff import diff_dataframes\n",
    "\n",
    "# Import the archive-based diff functionality\n",
    "from frostbyte import init, archive, diff, restore"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Direct DataFrame Comparison\n",
    "\n",
    "First, we'll create two DataFrames with various differences and compare them directly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Baseline frame: five rows identified by 'id'\n",
    "df1 = pd.DataFrame(\n",
    "    {\n",
    "        'id': [1, 2, 3, 4, 5],\n",
    "        'name': ['A', 'B', 'C', 'D', 'E'],\n",
    "        'value': [10, 20, 30, 40, 50],\n",
    "        'category': ['X', 'Y', 'Z', 'X', 'Y'],\n",
    "    }\n",
    ")\n",
    "\n",
    "# Changed frame. Relative to df1 it:\n",
    "#   - drops the row with id=1 and appends a row with id=6\n",
    "#   - changes 'value' for id=3 (30 -> 35) and 'name' for id=4 ('D' -> 'X')\n",
    "#   - drops the 'category' column and introduces a 'status' column\n",
    "df2 = pd.DataFrame(\n",
    "    {\n",
    "        'id': [2, 3, 4, 5, 6],\n",
    "        'name': ['B', 'C', 'X', 'E', 'F'],\n",
    "        'value': [20, 35, 40, 50, 60],\n",
    "        'status': ['Active', 'Active', 'Inactive', 'Active', 'Active'],\n",
    "    }\n",
    ")\n",
    "\n",
    "# Show both frames, each under its own heading\n",
    "for heading, frame in ((\"DataFrame 1:\", df1), (\"\\nDataFrame 2:\", df2)):\n",
    "    print(heading)\n",
    "    display(frame)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Basic Comparison using ID as Key\n",
    "\n",
    "We'll compare the dataframes using the 'id' column as the key."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Diff the two frames, using the 'id' column as the key\n",
    "result = diff_dataframes(df1, df2, key_columns=['id'])\n",
    "\n",
    "# Headline counts, printed in a fixed order\n",
    "for label, field in (\n",
    "    ('Rows added', 'rows_added'),\n",
    "    ('Rows removed', 'rows_removed'),\n",
    "    ('Rows modified', 'rows_modified'),\n",
    "    ('Total cells changed', 'total_cells_changed'),\n",
    "):\n",
    "    print(f\"{label}: {result[field]}\")\n",
    "\n",
    "# Schema-level changes, one per line\n",
    "print(\"\\nSchema changes:\")\n",
    "for schema_change in result['schema_changes']:\n",
    "    print(f\"  {schema_change}\")\n",
    "\n",
    "# Changed-cell count reported for each column\n",
    "print(\"\\nColumn changes:\")\n",
    "for column, n_changed in result['column_diff_counts'].items():\n",
    "    print(f\"  {column}: {n_changed} cells changed\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Examining Sample Row Changes\n",
    "\n",
    "Let's look at the sample rows that were added, removed, or modified."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Sampled added rows, then sampled removed rows, each under its own heading\n",
    "for heading, kind in ((\"Added rows:\", 'added'), (\"\\nRemoved rows:\", 'removed')):\n",
    "    print(heading)\n",
    "    for sample_row in result['row_sample'][kind]:\n",
    "        print(f\"  {sample_row}\")\n",
    "\n",
    "# Sampled modified rows: the key first, then every changed column as old → new\n",
    "print(\"\\nModified rows:\")\n",
    "for entry in result['row_sample']['modified']:\n",
    "    print(f\"  Key: {entry['key']}\")\n",
    "    print(\"  Changes:\")\n",
    "    for column, delta in entry['changes'].items():\n",
    "        print(f\"    {column}: {delta['old']} → {delta['new']}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Automatic Key Detection\n",
    "\n",
    "If you don't specify key columns, Frostbyte will attempt to detect suitable keys."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# No key_columns argument here, so key selection is left to diff_dataframes\n",
    "auto_result = diff_dataframes(df1, df2)\n",
    "\n",
    "# Headline counts\n",
    "for label, field in (\n",
    "    ('Rows added', 'rows_added'),\n",
    "    ('Rows removed', 'rows_removed'),\n",
    "    ('Rows modified', 'rows_modified'),\n",
    "):\n",
    "    print(f\"{label}: {auto_result[field]}\")\n",
    "\n",
    "# If no suitable key was found, these counts might differ\n",
    "# from the key-based comparison"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Archive-Based Workflow\n",
    "\n",
    "Now let's see how to use the diff functionality with archived files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Scratch directory for the demo; FROSTBYTE_ROOT is set to it before init() runs\n",
    "temp_dir = tempfile.mkdtemp()\n",
    "os.environ.update({\"FROSTBYTE_ROOT\": temp_dir})\n",
    "\n",
    "# Set up Frostbyte and report what init() returned\n",
    "init_result = init()\n",
    "print(f\"Initialized Frostbyte: {init_result}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# One CSV per frame, both inside the scratch directory\n",
    "file1_path = os.path.join(temp_dir, \"data1.csv\")\n",
    "file2_path = os.path.join(temp_dir, \"data2.csv\")\n",
    "\n",
    "# index=False keeps the DataFrame index out of the written files\n",
    "for frame, csv_path in ((df1, file1_path), (df2, file2_path)):\n",
    "    frame.to_csv(csv_path, index=False)\n",
    "\n",
    "print(f\"Saved dataframes to {file1_path} and {file2_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Archive both CSVs (file1 first) and keep what each call returns\n",
    "archive1_info, archive2_info = archive(file1_path), archive(file2_path)\n",
    "\n",
    "# Report the 'version' entry of each returned mapping\n",
    "for label, info in (('file1', archive1_info), ('file2', archive2_info)):\n",
    "    print(f\"Archived {label} as version {info.get('version')}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Diff the two archived files by path\n",
    "diff_result = diff(file1_path, file2_path)\n",
    "\n",
    "# Headline counts, printed in a fixed order\n",
    "for label, field in (\n",
    "    ('Rows added', 'rows_added'),\n",
    "    ('Rows removed', 'rows_removed'),\n",
    "    ('Rows modified', 'rows_modified'),\n",
    "    ('Total cells changed', 'total_cells_changed'),\n",
    "):\n",
    "    print(f\"{label}: {diff_result[field]}\")\n",
    "\n",
    "# Schema-level changes, one per line\n",
    "print(\"\\nSchema changes:\")\n",
    "for schema_change in diff_result['schema_changes']:\n",
    "    print(f\"  {schema_change}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Comparing Specific Versions\n",
    "\n",
    "You can compare specific versions of archived files by appending `@<version>` to the file path (the cells below use `@1`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Work on a copy so df2 itself is left untouched\n",
    "df3 = df2.copy()\n",
    "is_id_3 = df3['id'] == 3\n",
    "df3.loc[is_id_3, 'value'] = 50  # change the value for id=3 once more\n",
    "\n",
    "# Overwrite the second CSV with the modified frame\n",
    "df3.to_csv(file2_path, index=False)\n",
    "\n",
    "# Archive the overwritten file and report the version it was given\n",
    "archive3_info = archive(file2_path)\n",
    "print(f\"Archived updated file as version {archive3_info.get('version')}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Pin each side of the comparison to a version using the path@version form\n",
    "left_ref = f\"{file1_path}@1\"\n",
    "right_ref = f\"{file2_path}@1\"\n",
    "# NOTE(review): the second CSV was archived again in the previous cell, yet it is\n",
    "# still pinned to @1 here - confirm whether the newer version was meant instead.\n",
    "version_result = diff(left_ref, right_ref)\n",
    "\n",
    "# Headline counts for the pinned comparison\n",
    "print(f\"Comparing {left_ref} with {right_ref}:\")\n",
    "for label, field in (\n",
    "    ('Rows added', 'rows_added'),\n",
    "    ('Rows removed', 'rows_removed'),\n",
    "    ('Rows modified', 'rows_modified'),\n",
    "):\n",
    "    print(f\"{label}: {version_result[field]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cleanup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Remove the scratch directory. The existence check makes this cell safe to\n",
    "# re-run, while real deletion errors (e.g. permissions) still surface.\n",
    "if os.path.isdir(temp_dir):\n",
    "    shutil.rmtree(temp_dir)\n",
    "\n",
    "# FROSTBYTE_ROOT was pointed at the directory removed above; drop it so nothing\n",
    "# run later in this kernel sees a path that no longer exists.\n",
    "os.environ.pop(\"FROSTBYTE_ROOT\", None)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}