In [15]:
# Let's start by installing libraries

In [16]:
!pip install datatune



In [17]:
import os

import dask.dataframe as dd
import datatune as dt
import seaborn as sns
from datatune.llm.llm import Azure

In [18]:
# Set your LLM API keys.
#
# Prefer exporting the real values in your shell (or a secrets manager) before
# starting Jupyter so that no secret is ever stored in the notebook itself.
# setdefault() keeps any value that is already present in the environment and
# only falls back to the placeholder below, so real credentials are never
# overwritten by the placeholder text.
os.environ.setdefault("AZURE_API_KEY", "Your Azure API KEY")
os.environ.setdefault("AZURE_API_BASE", "Your Azure API Base")
os.environ.setdefault("AZURE_API_VERSION", "Your API Version")


In [19]:
# Build the Azure-backed LLM client from the credentials held in the
# environment. tpm / rpm are presumably tokens-per-minute and
# requests-per-minute limits -- confirm against the Datatune documentation.
llm = Azure(
    rpm=5000,
    tpm=1_000_000,
    api_version=os.environ.get("AZURE_API_VERSION"),
    api_base=os.environ.get("AZURE_API_BASE"),
    api_key=os.environ.get("AZURE_API_KEY"),
    model_name="gpt-4o-mini",
)

In [20]:
# Create a Datatune Agent around the Azure LLM client configured above.
# The agent is driven later in the notebook with a natural-language prompt
# via agent.do(prompt, df).
agent = dt.Agent(llm)

In [21]:
# Load the famous Titanic dataset.
# It can be downloaded from https://www.kaggle.com/c/titanic/data; here we use
# the copy exposed by seaborn's load_dataset helper (seaborn is imported with
# the other dependencies at the top of the notebook). Note that load_dataset
# fetches the data from seaborn's online repository, so it needs network
# access the first time it runs.
titanic_pandas = sns.load_dataset('titanic')
print("Dataset shape:", titanic_pandas.shape)

# Convert the pandas frame to a Dask DataFrame split into 4 partitions;
# this is the object handed to the Datatune agent later on.
df = dd.from_pandas(titanic_pandas, npartitions=4)

print("\nOriginal Titanic Dataset:")
print(df.head(10))

print("\nDataset Info:")
print(f"Columns: {list(df.columns)}")
# The row count is taken from the in-memory pandas frame, which avoids
# triggering a Dask computation just to count rows.
print(f"Total passengers: {len(titanic_pandas)}")

Dataset shape: (891, 15)

Original Titanic Dataset:
   survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
0         0       3    male  22.0      1      0   7.2500        S   Third   
1         1       1  female  38.0      1      0  71.2833        C   First   
2         1       3  female  26.0      0      0   7.9250        S   Third   
3         1       1  female  35.0      1      0  53.1000        S   First   
4         0       3    male  35.0      0      0   8.0500        S   Third   
5         0       3    male   NaN      0      0   8.4583        Q   Third   
6         0       1    male  54.0      0      0  51.8625        S   First   
7         0       3    male   2.0      3      1  21.0750        S   Third   
8         1       3  female  27.0      0      2  11.1333        S   Third   
9         1       2  female  14.0      1      0  30.0708        C  Second   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  

In [25]:
# Create family-related features using natural language
prompt = """
Create new columns:
1. 'family_size' = sibsp + parch + 1 (including the person)
2. 'family_category' = 'Solo' if alone, 'Small' if family_size 2-4, 'Large' if 5+
3. 'title' = extract title from who column and give labels Mr, Mrs, Miss, Master, etc, based on if they're adult or not
"""

# The agent turns the prompt into a plan of steps (plain Dask operations and
# LLM-backed Map primitives) that is applied to the Dask DataFrame.
df_family = agent.do(prompt, df)

# Materialize the result before announcing success: the returned object still
# has to be computed, so printing the message first would report success even
# if the computation subsequently failed.
result_family = df_family.compute()

print("Family features added:")
# Show a preview so the message above is followed by the data it announces.
# NOTE(review): this preview is taken before dt.finalize(), so it may still
# include Datatune's internal metadata columns.
result_family.head()

📝Plan: [{'type': 'dask', 'operation': 'add_column', 'params': {'new_column': 'family_size', 'expression': "df['sibsp'] + df['parch'] + 1"}}, {'type': 'primitive', 'operation': 'Map', 'params': {'subprompt': "Categorize family size into 'Solo', 'Small', or 'Large'. 'Solo' if alone, 'Small' if family_size is between 2 and 4, and 'Large' if family_size is 5 or more.", 'input_fields': ['alone', 'family_size'], 'output_fields': ['family_category']}}, {'type': 'primitive', 'operation': 'Map', 'params': {'subprompt': "Extract title from the 'who' column and label them as Mr, Mrs, Miss, Master, etc., based on whether they are an adult or not.", 'input_fields': ['who', 'adult_male'], 'output_fields': ['title']}}]
🔍Executing step: {'type': 'dask', 'operation': 'add_column', 'params': {'new_column': 'family_size', 'expression': "df['sibsp'] + df['parch'] + 1"}} 1
✅ Executed step: {'type': 'dask', 'operation': 'add_column', 'params': {'new_column': 'family_size', 'expression': "df['sibsp'] + df['p

In [27]:
# Strip the metadata Datatune created internally, then persist the cleaned
# frame as a CSV file without the index column.
output_path = "titanic_enhanced.csv"
final_result = dt.finalize(result_family)
final_result.to_csv(output_path, index=False)
print("Enhanced Titanic dataset saved to titanic_enhanced.csv")

Enhanced Titanic dataset saved to titanic_enhanced.csv


In [28]:
# Preview the first rows of the finalized dataset, including the newly
# created family_size, family_category and title columns.
final_result.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,family_size,family_category,title
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,2,Small,Mr
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,2,Small,Mrs
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,1,Solo,Mrs
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,2,Small,Mrs
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,1,Solo,Mr
