# Datatune with Semantic Deduplication

Let's start by installing the dependencies. We will use Dask DataFrames to hold the data and the OpenAI API as the LLM backend.

In [None]:
!pip install datatune

## Import required libraries

In [None]:
import os
from getpass import getpass

import dask.dataframe as dd
import datatune as dt
import seaborn as sns
from datatune.llm.llm import OpenAI

## Initialize your LLM

In [None]:
# Use the key already exported in the environment; prompt only when it is missing.
# This keeps the secret out of the saved notebook and avoids overwriting a real key
# with a placeholder string.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

llm = OpenAI(model="gpt-3.5-turbo", rpm=500, tpm=150000)      # Initialize the LLM with your rate limits

In [None]:
# Load the flights sample dataset as a pandas DataFrame.
flights_pd = sns.load_dataset("flights")

# Wrap it in a Dask DataFrame. npartitions is passed explicitly because older Dask
# releases require exactly one of npartitions/chunksize; a single partition matches
# the default used by newer releases.
df = dd.from_pandas(flights_pd, npartitions=1)

## Get deduplication clusters

In [None]:
# Run datatune's reduce with the "dedup" action over `df` to obtain the clusters
# that the map and filter steps receive later.
clusters = dt.reduce(
    df,
    action="dedup",
    embedding_model="text-embedding-3-small",
    llm=llm,
)

`reduce` returns a deduplication map that can be passed to `map` and `filter`. With it, only the canonical rows are sent to the LLM API for transformation, and their results are propagated to the duplicate rows, which reduces token usage and therefore cost.

In [None]:
# Prompts for the two transformation steps: the mapping prompt asks for a new
# comment column, and the filtering prompt selects rows based on that column.
mapping_prompt = (
    "Add a column passenger_trend_comment that describes the trend in passenger numbers.."
    "make the comment descriptive and varied"
)

filtering_prompt = (
    "Based on the passenger_trend_comment column, filter the dataset to include only "
    "those months where there is a significant change in passenger numbers compared "
    "to the previous month."
)


## Transforming our dataset

In [None]:
# Map step: ask the LLM to add the passenger_trend_comment column.
mapped = dt.map(
    prompt=mapping_prompt,
    input_fields=["passengers", "month", "year"],    # input fields to be used for mapping
    output_fields=["passenger_trend_comment"],       # field produced by the mapping
    clusters=clusters,                               # pass deduplication clusters
)(llm, df)

# Filter step: pass the mapped Dask DataFrame on to filter.
filtered = dt.filter(
    prompt=filtering_prompt,
    input_fields=["passenger_trend_comment"],
    clusters=clusters,                               # pass deduplication clusters
)(llm, mapped)

# `df` was built with dd.from_pandas, so the pipeline is a lazy Dask DataFrame.
# Dask materializes with .compute(); .execute() is the Ibis API and is not
# available on Dask DataFrames.
# NOTE(review): the datatune README applies dt.finalize(...) before computing to
# drop internal metadata columns — confirm whether it is wanted here.
result = filtered.compute()      # Result is a pandas DataFrame

## Convert the transformed dataset into CSV

In [None]:
# Write the transformed rows to disk.
result.to_csv("duckdb_transformed.csv")

# Leave the preview as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text print() output.
result.head()