In [4]:
from datetime import datetime
import pandas as pd
from pydantic import BaseModel, PositiveInt


class User(BaseModel):
    id: int  
    name: str = 'John Doe'  
    signup_ts: datetime | None  
    tastes: dict[str, PositiveInt]  


external_data = {
    'id': 123,
    'signup_ts': '2019-06-01 12:22',  
    'tastes': {
        'wine': 9,
        b'cheese': 7,  
        'cabbage': '1',  
    },
}

user = User(**external_data)  

print(user.id)
print(user.model_dump())  


123
{'id': 123, 'name': 'John Doe', 'signup_ts': datetime.datetime(2019, 6, 1, 12, 22), 'tastes': {'wine': 9, 'cheese': 7, 'cabbage': 1}}


In [5]:
# continuing the above example...

from datetime import datetime
from pydantic import BaseModel, PositiveInt, ValidationError


class User(BaseModel):
    id: int
    name: str | None
    signup_ts: datetime | None
    tastes: dict[str, PositiveInt]

external_data = {
    'id': 123,
    'name' : 'Yu Ye',
    'signup_ts': '2025-06-01',  
    'tastes': {
        'wine': 9,
        'cheese': 7,  
        'cabbage': '1',  
    },
}



try:
    user = User(**external_data)  
except ValidationError as e:
    print(e.errors())

user.model_dump()


{'id': 123,
 'name': 'Yu Ye',
 'signup_ts': datetime.datetime(2025, 6, 1, 0, 0),
 'tastes': {'wine': 9, 'cheese': 7, 'cabbage': 1}}

# Autogluon dataset

In [6]:
from autogluon.tabular import TabularDataset, TabularPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
data_url = 'https://raw.githubusercontent.com/mli/ag-docs/main/knot_theory/'
train_data = TabularDataset(f'{data_url}train.csv')
train_data.head()

Unnamed: 0.1,Unnamed: 0,chern_simons,cusp_volume,hyperbolic_adjoint_torsion_degree,hyperbolic_torsion_degree,injectivity_radius,longitudinal_translation,meridinal_translation_imag,meridinal_translation_real,short_geodesic_imag_part,short_geodesic_real_part,Symmetry_0,Symmetry_D3,Symmetry_D4,Symmetry_D6,Symmetry_D8,Symmetry_Z/2 + Z/2,volume,signature
0,70746,0.09053,12.226322,0,10,0.507756,10.685555,1.144192,-0.519157,-2.760601,1.015512,0.0,0.0,0.0,0.0,0.0,1.0,11.393225,-2
1,240827,0.232453,13.800773,0,14,0.413645,10.453156,1.320249,-0.158522,-3.013258,0.827289,0.0,0.0,0.0,0.0,0.0,1.0,12.742782,0
2,155659,-0.144099,14.76103,0,14,0.436928,13.405199,1.101142,0.768894,2.233106,0.873856,0.0,0.0,0.0,0.0,0.0,0.0,15.236505,2
3,239963,-0.171668,13.738019,0,22,0.249481,27.819496,0.493827,-1.188718,-2.042771,0.498961,0.0,0.0,0.0,0.0,0.0,0.0,17.27989,-8
4,90504,0.235188,15.896359,0,10,0.389329,15.330971,1.036879,0.722828,-3.056138,0.778658,0.0,0.0,0.0,0.0,0.0,0.0,16.749298,4


In [8]:
from autogluon.tabular import TabularDataset, TabularPredictor

df = TabularDataset("https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv")
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,178478,Bachelors,13,Never-married,Tech-support,Own-child,White,Female,0,0,40,United-States,<=50K
1,23,State-gov,61743,5th-6th,3,Never-married,Transport-moving,Not-in-family,White,Male,0,0,35,United-States,<=50K
2,46,Private,376789,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,15,United-States,<=50K
3,55,?,200235,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,50,United-States,>50K
4,36,Private,224541,7th-8th,4,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,El-Salvador,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39068,54,Private,83103,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,67,United-States,<=50K
39069,23,Private,172232,HS-grad,9,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,53,United-States,<=50K
39070,37,Local-gov,165883,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
39071,26,Self-emp-not-inc,67240,HS-grad,9,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,35,United-States,<=50K


In [9]:
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'class_']
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class_
0,25,Private,178478,Bachelors,13,Never-married,Tech-support,Own-child,White,Female,0,0,40,United-States,<=50K
1,23,State-gov,61743,5th-6th,3,Never-married,Transport-moving,Not-in-family,White,Male,0,0,35,United-States,<=50K
2,46,Private,376789,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,15,United-States,<=50K
3,55,?,200235,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,50,United-States,>50K
4,36,Private,224541,7th-8th,4,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,El-Salvador,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39068,54,Private,83103,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,67,United-States,<=50K
39069,23,Private,172232,HS-grad,9,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,53,United-States,<=50K
39070,37,Local-gov,165883,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
39071,26,Self-emp-not-inc,67240,HS-grad,9,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,35,United-States,<=50K


In [10]:
for c in df.columns:
    print('***', c)
    print(df[c].unique())

*** age
[25 23 46 55 36 51 33 18 43 41 22 39 44 37 40 24 61 19 27 47 34 29 45 31
 59 35 63 49 21 69 38 42 58 48 20 26 56 72 53 50 65 64 66 30 90 17 79 54
 28 57 68 62 70 32 52 71 60 74 67 81 75 76 77 83 73 78 85 80 82 84 88 86
 87 89]
*** workclass
[' Private' ' State-gov' ' ?' ' Local-gov' ' Self-emp-not-inc'
 ' Self-emp-inc' ' Federal-gov' ' Never-worked' ' Without-pay']
*** fnlwgt
[178478  61743 376789 ... 300777  83103 165883]
*** education
[' Bachelors' ' 5th-6th' ' HS-grad' ' 7th-8th' ' Some-college'
 ' Assoc-voc' ' Masters' ' Assoc-acdm' ' 11th' ' Prof-school' ' Doctorate'
 ' 12th' ' 10th' ' 9th' ' Preschool' ' 1st-4th']
*** education_num
[13  3  9  4 10 11 14 12  7 15 16  8  6  5  1  2]
*** marital_status
[' Never-married' ' Married-civ-spouse' ' Divorced' ' Separated'
 ' Widowed' ' Married-spouse-absent' ' Married-AF-spouse']
*** occupation
[' Tech-support' ' Transport-moving' ' Other-service' ' ?'
 ' Handlers-cleaners' ' Sales' ' Craft-repair' ' Adm-clerical'
 ' Exec-manageri

In [11]:
from pydantic import NonNegativeInt, Field
from typing import Annotated, Literal


class Person(BaseModel):
    age: Annotated[PositiveInt, Field(strict=True, lt=150)]
    workclass: Literal[' Private', ' State-gov', ' Local-gov', ' Self-emp-not-inc', 
                       ' Self-emp-inc', ' Federal-gov', ' Never-worked', ' Without-pay'] | None
    fnlwgt: PositiveInt 
    education: Literal[' Bachelors', ' 5th-6th', ' HS-grad', ' 7th-8th', ' Some-college',
                       ' Assoc-voc', ' Masters', ' Assoc-acdm', ' 11th', ' Prof-school',
                       ' Doctorate', ' 12th', ' 10th', ' 9th', ' Preschool', ' 1st-4th']
    education_num : Annotated[PositiveInt, Field(lt=20)] # if strict is not set, automatic conversion will be done
    marital_status : Literal[' Never-married', ' Married-civ-spouse', ' Divorced', ' Separated',
                             ' Widowed', ' Married-spouse-absent', ' Married-AF-spouse']
    occupation: str | None
    relationship: Literal[' Own-child', ' Not-in-family', ' Husband', ' Wife', ' Unmarried',' Other-relative']
    race : Literal[' White', ' Asian-Pac-Islander', ' Other', ' Black', ' Amer-Indian-Eskimo']
    sex : Literal[' Female', ' Male']
    capital_gain: NonNegativeInt
    capital_loss : NonNegativeInt
    hours_per_week: Annotated[PositiveInt, Field(strict=True, lt=100)] # if strict = true, automatic conversion will not be done
    native_country : str | None
    class_ : Literal[' <=50K', ' >50K']


In [12]:
external_data = {'age': 25, 
                 'workclass': ' Local-gov', 
                 'fnlwgt': '178478', 
                 'education': ' Bachelors', 
                 'education_num': '13', 
                 'marital_status': ' Never-married', 
                 'occupation': ' Tech-support', 
                 'relationship': ' Own-child', 
                 'race': ' White', 
                 'sex': ' Female', 
                 'capital_gain': 0, 
                 'capital_loss': 0, 
                 'hours_per_week': 40, 
                 'native_country': ' United-States', 
                 'class_': ' <=50K'}

try:
    person = Person(**external_data)  
    print('ok.')
    print(repr(person))
except ValidationError as e:
    print(e.errors())



ok.
Person(age=25, workclass=' Local-gov', fnlwgt=178478, education=' Bachelors', education_num=13, marital_status=' Never-married', occupation=' Tech-support', relationship=' Own-child', race=' White', sex=' Female', capital_gain=0, capital_loss=0, hours_per_week=40, native_country=' United-States', class_=' <=50K')


In [13]:
external_data = {'age': 25, 
                 'workclass': ' Local-gov', 
                 'fnlwgt': '178478', 
                 'education': ' Bachelors', 
                 'education_num': '13', 
                 'marital_status': ' Never-married', 
                 'occupation': ' Tech-support', 
                 'relationship': ' Own-child', 
                 'race': ' White', 
                 'sex': ' Fema', 
                 'capital_gain': 0, 
                 'capital_loss': 0, 
                 'hours_per_week': 40, 
                 'native_country': ' United-States', 
                 'class_': ' <=50K'}

try:
    person = Person(**external_data)  
    print('ok.')
    print(repr(person))
except ValidationError as e:
    print(e.errors())



[{'type': 'literal_error', 'loc': ('sex',), 'msg': "Input should be ' Female' or ' Male'", 'input': ' Fema', 'ctx': {'expected': "' Female' or ' Male'"}, 'url': 'https://errors.pydantic.dev/2.11/v/literal_error'}]


In [14]:
df_replaced_none = df.replace(' ?', None)
df_replaced_none

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class_
0,25,Private,178478,Bachelors,13,Never-married,Tech-support,Own-child,White,Female,0,0,40,United-States,<=50K
1,23,State-gov,61743,5th-6th,3,Never-married,Transport-moving,Not-in-family,White,Male,0,0,35,United-States,<=50K
2,46,Private,376789,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,15,United-States,<=50K
3,55,,200235,HS-grad,9,Married-civ-spouse,,Husband,White,Male,0,0,50,United-States,>50K
4,36,Private,224541,7th-8th,4,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,El-Salvador,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39068,54,Private,83103,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,67,United-States,<=50K
39069,23,Private,172232,HS-grad,9,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,53,United-States,<=50K
39070,37,Local-gov,165883,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
39071,26,Self-emp-not-inc,67240,HS-grad,9,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,35,United-States,<=50K


In [15]:
import pandas as pd
from pydantic import BaseModel, ValidationError


# Validate each row
validated_rows = []
errors = []

for index, row in df_replaced_none.iterrows():
    try:
        person = Person(**row.to_dict())
        validated_rows.append(person)
    except ValidationError as e:
        errors.append((index, e))

# Results
print("Valid rows:", len(validated_rows))
print("Errors:", len(errors))


Valid rows: 39073
Errors: 0


In [16]:
validated_rows[5]

Person(age=51, workclass=' Private', fnlwgt=178054, education=' Some-college', education_num=10, marital_status=' Married-civ-spouse', occupation=' Sales', relationship=' Husband', race=' White', sex=' Male', capital_gain=0, capital_loss=0, hours_per_week=40, native_country=None, class_=' >50K')

# Download datasets from openml

In [2]:
import openml

df_openml = openml.datasets.list_datasets(output_format='dataframe')
df_openml

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
2,2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,39.0,898.0,898.0,22175.0,6.0,33.0
3,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
4,4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,17.0,57.0,56.0,326.0,8.0,9.0
5,5,arrhythmia,1,1,active,ARFF,245.0,13.0,2.0,13.0,280.0,452.0,384.0,408.0,206.0,74.0
6,6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,17.0,20000.0,0.0,0.0,16.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47038,47038,001,1,51974,active,arff,,,,,25.0,1716.0,0.0,0.0,19.0,0.0
47039,47039,criteo-uplift-balanced,1,30703,active,arff,500000.0,,154478.0,4.0,14.0,1366544.0,0.0,0.0,12.0,2.0
47040,47040,seeds_dataset,1,52160,active,arff,,,,,8.0,209.0,0.0,0.0,8.0,0.0
47041,47041,Wine,9,52297,active,arff,,,,,13.0,6500.0,0.0,0.0,12.0,0.0


In [11]:
df_openml_with_missing = df_openml[df_openml['NumberOfMissingValues'] > 0]
df_openml_with_missing[df_openml_with_missing['did'] > 46904]

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
46908,46908,APSFailure,2,34097,active,arff,74625.0,,1375.0,2.0,171.0,76000.0,75244.0,1078695.0,170.0,1.0
46909,46909,ASP-POTASSCO,1,34097,active,arff,240.0,,21.0,11.0,139.0,1212.0,218.0,18258.0,138.0,1.0
46920,46920,customer_satisfaction_in_airline,1,34097,active,arff,71087.0,,58793.0,2.0,22.0,129880.0,393.0,393.0,5.0,17.0
46922,46922,Diabetes130US,4,34097,active,arff,65225.0,,6293.0,2.0,48.0,71518.0,70601.0,137663.0,8.0,40.0
46927,46927,Fitness_Club,1,34097,active,arff,1046.0,,454.0,2.0,7.0,1500.0,20.0,20.0,3.0,4.0
46929,46929,GiveMeSomeCredit,1,34097,active,arff,139974.0,,10026.0,2.0,11.0,150000.0,29731.0,33655.0,10.0,1.0
46935,46935,HR_Analytics_Job_Change_of_Data_Scientists,1,34097,active,arff,14381.0,,4777.0,2.0,13.0,19158.0,65.0,65.0,2.0,10.0
46939,46939,kddcup09_appetency,2,34097,active,arff,49110.0,,890.0,2.0,213.0,50000.0,50000.0,6727959.0,174.0,39.0
46940,46940,Marketing_Campaign,1,34097,active,arff,1906.0,,334.0,2.0,26.0,2240.0,24.0,24.0,16.0,9.0
46943,46943,MIC,2,34097,active,arff,1428.0,,12.0,8.0,123.0,1699.0,1699.0,6513.0,17.0,106.0


In [15]:
df_openml[df_openml['did'] == 46927] #46939 all vars

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
46927,46927,Fitness_Club,1,34097,active,arff,1046.0,,454.0,2.0,7.0,1500.0,20.0,20.0,3.0,4.0


In [3]:
dataset = openml.datasets.get_dataset(46927)
dataset

OpenML Dataset
Name.........: Fitness_Club
Version......: 1
Format.......: arff
Upload Date..: 2025-04-30 20:16:53
Licence......: Public Domain
Download URL.: https://api.openml.org/data/v1/download/22125238/Fitness_Club.arff
OpenML URL...: https://www.openml.org/d/46927
# of features: None

In [4]:
# Get the data itself as a dataframe (or otherwise)
X, y, categorical_indicator, attribute_names = dataset.get_data(dataset_format="dataframe")
X

Unnamed: 0,months_as_member,weight,days_before,day_of_week,time,category,attended
0,15,65.47,6,Wed,AM,HIIT,Yes
1,18,77.85,8,Thu,AM,Strength,Yes
2,13,67.26,10,Fri,AM,Cycling,No
3,7,86.70,12,Sat,AM,HIIT,No
4,5,135.18,8,Thu,AM,HIIT,No
...,...,...,...,...,...,...,...
1495,15,77.20,10,Fri,AM,Cycling,No
1496,17,85.26,6,Wed,AM,Yoga,No
1497,6,95.66,3,Tue,PM,Cycling,No
1498,13,73.31,12,Sat,PM,Cycling,No


In [18]:
X.to_csv('./exampleSparseDataset.csv', index=False)

In [7]:
X.dtypes

Var1          float64
Var2          float64
Var3          float64
Var4          float64
Var5          float64
               ...   
Var226       category
Var227       category
Var228       category
Var229       category
appetency    category
Length: 213, dtype: object

In [21]:
y

In [22]:
categorical_indicator

[True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 True]

In [9]:
attribute_names

['family',
 'product-type',
 'steel',
 'carbon',
 'hardness',
 'temper_rolling',
 'condition',
 'formability',
 'strength',
 'non-ageing',
 'surface-finish',
 'surface-quality',
 'enamelability',
 'bc',
 'bf',
 'bt',
 'bw_me',
 'bl',
 'm',
 'chrom',
 'phos',
 'cbond',
 'marvi',
 'exptl',
 'ferro',
 'corr',
 'blue_bright_varn_clean',
 'lustre',
 'jurofm',
 's',
 'p',
 'shape',
 'thick',
 'width',
 'len',
 'oil',
 'bore',
 'packing',
 'classes']

## Easy example

In [None]:
import pandas as pd
from pydantic import create_model
from enum import Enum
from typing import Optional
import datetime
import numpy as np


# Sample unknown dataset
data = {
    "name": ["Alice", "Bob", "Charlie", "Alex"],
    "age": [30, 28, 'nan', 25],
    "score": [85.5, 88, 90.0, None],
    "passed": [True, True, False, True],
    "registered_on": [pd.Timestamp("2023-01-01"), pd.Timestamp("2023-01-05"), pd.Timestamp("2023-01-02"), pd.Timestamp("2023-01-03")],
    "role": pd.Series(["admin", "Student", "user", "nan"], dtype="category")
}

df = pd.DataFrame(data)


df = df.replace('nan', None)
df = df.infer_objects()

# Build fields for Pydantic model
fields = {
    col: (map_dtype(col, dtype, df), ...)
    for col, dtype in df.dtypes.items()
}

# Create dynamic Pydantic model
DynamicModel = create_model("DynamicModel", **fields)

df_replaced_nan_for_none = df.astype(object).where(pd.notnull(df), None)

# Validate each row
validated = [DynamicModel(**row.to_dict()) for _, row in df_replaced_nan_for_none.iterrows()]

# Print validated models
for item in validated:
    print(item)



In [None]:
df

Unnamed: 0,name,age,score,passed,registered_on,role
0,Alice,30.0,85.5,True,2023-01-01,admin
1,Bob,28.0,88.0,True,2023-01-05,Student
2,Charlie,,90.0,False,2023-01-02,user
3,Alex,25.0,,True,2023-01-03,


In [None]:
df_replaced_nan_for_none

Unnamed: 0,name,age,score,passed,registered_on,role
0,Alice,30.0,85.5,True,2023-01-01 00:00:00,admin
1,Bob,28.0,88.0,True,2023-01-05 00:00:00,Student
2,Charlie,,90.0,False,2023-01-02 00:00:00,user
3,Alex,25.0,,True,2023-01-03 00:00:00,


In [None]:
external_data = {'name': 'Alice', 'age': None, 'score': 85.5, 'passed': True, 'registered_on': '2023-01-01 05:00:00', 'role': 'user'}
try:
    dynamicModel = DynamicModel(**external_data)  
    print('ok.')
    print(repr(dynamicModel))
except ValidationError as e:
    print(e.errors())

ok.
DynamicModel(name='Alice', age=None, score=85.5, passed=True, registered_on=datetime.datetime(2023, 1, 1, 5, 0), role=<RoleEnum.user: 'user'>)


In [None]:
fields

{'name': (str, Ellipsis),
 'age': (typing.Optional[float], Ellipsis),
 'score': (typing.Optional[float], Ellipsis),
 'passed': (bool, Ellipsis),
 'registered_on': (datetime.datetime, Ellipsis),
 'role': (<enum 'RoleEnum'>, Ellipsis)}

## Tabarena example

In [None]:
for col in X.select_dtypes(include='category').columns:
    nunique = X[col].nunique()
    X[col] = X[col].astype("str").replace('nan', None)
    if nunique > CATEGORY_UNIQUE_VALUES_THRESHOLD:
        #X[col] = X[col].astype('string')
        #converted_columns.append(col)
        print(col, 'has', nunique, 'unique values. Changed to str dtype.')
    else:
        X[col] = X[col].astype("category")

X = X.replace('nan', None)
X = X.infer_objects()

# Build fields for Pydantic model
fields = {
    col: (map_dtype(col, dtype, X), ...)
    for col, dtype in X.dtypes.items()
}

# Create dynamic Pydantic model
DynamicModel = create_model("DynamicModel", **fields)

# Replace the nan values for None
X_replaced_nan_for_none = X.astype(object).where(pd.notnull(X), None)

# Validate each row
validated = [DynamicModel(**row.to_dict()) for _, row in X_replaced_nan_for_none.iterrows()]

# Print validated models
for item in validated:
    print(item)

family=<FamilyEnum.not_applicable: 'not_applicable'> product-type=<Product-typeEnum.C: 'C'> steel=<SteelEnum.M: 'M'> carbon=0 hardness=0 temper_rolling=<Temper_rollingEnum.not_applicable: 'not_applicable'> condition=<ConditionEnum.not_applicable: 'not_applicable'> formability=<FormabilityEnum.not_applicable: 'not_applicable'> strength=350 non-ageing=<Non-ageingEnum.not_applicable: 'not_applicable'> surface-finish=<Surface-finishEnum.not_applicable: 'not_applicable'> surface-quality=<Surface-qualityEnum.G: 'G'> enamelability=<EnamelabilityEnum.not_applicable: 'not_applicable'> bc=<BcEnum.not_applicable: 'not_applicable'> bf=<BfEnum.not_applicable: 'not_applicable'> bt=<BtEnum.not_applicable: 'not_applicable'> bw_me=<Bw_meEnum.not_applicable: 'not_applicable'> bl=<BlEnum.not_applicable: 'not_applicable'> m=<MEnum.not_applicable: 'not_applicable'> chrom=<ChromEnum.not_applicable: 'not_applicable'> phos=<PhosEnum.not_applicable: 'not_applicable'> cbond=<CbondEnum.not_applicable: 'not_appli

In [None]:
fields

{'family': (<enum 'FamilyEnum'>, Ellipsis),
 'product-type': (<enum 'Product-typeEnum'>, Ellipsis),
 'steel': (<enum 'SteelEnum'>, Ellipsis),
 'carbon': (typing.Annotated[int, Ge(ge=0), FieldInfo(annotation=NoneType, required=True, metadata=[Lt(lt=255)])],
  Ellipsis),
 'hardness': (typing.Annotated[int, Ge(ge=0), FieldInfo(annotation=NoneType, required=True, metadata=[Lt(lt=255)])],
  Ellipsis),
 'temper_rolling': (<enum 'Temper_rollingEnum'>, Ellipsis),
 'condition': (<enum 'ConditionEnum'>, Ellipsis),
 'formability': (<enum 'FormabilityEnum'>, Ellipsis),
 'strength': (int, Ellipsis),
 'non-ageing': (<enum 'Non-ageingEnum'>, Ellipsis),
 'surface-finish': (<enum 'Surface-finishEnum'>, Ellipsis),
 'surface-quality': (<enum 'Surface-qualityEnum'>, Ellipsis),
 'enamelability': (<enum 'EnamelabilityEnum'>, Ellipsis),
 'bc': (<enum 'BcEnum'>, Ellipsis),
 'bf': (<enum 'BfEnum'>, Ellipsis),
 'bt': (<enum 'BtEnum'>, Ellipsis),
 'bw_me': (<enum 'Bw_meEnum'>, Ellipsis),
 'bl': (<enum 'BlEnum'>,

In [None]:
external_data = {'family': 'not_applicable', 
                 'product-type': 'C', 
                 'steel': 'M', 
                 'carbon': 0, 
                 'hardness': 0, 
                 'temper_rolling': 'not_applicable', 
                 'condition': 'not_applicable', 
                 'formability': 'not_applicable', 
                 'strength': 350, 
                 'non-ageing': 'not_applicable', 
                 'surface-finish': 'not_applicable', 
                 'surface-quality': 'G', 
                 'enamelability': 'not_applicable', 
                 'bc': 'not_applicable', 
                 'bf': 'not_applicable', 
                 'bt': 'not_applicable', 
                 'bw_me': 'not_applicable', 
                 'bl': 'not_applicable', 
                 'm': 'not_applicable', 
                 'chrom': 'not_applicable', 
                 'phos': 'not_applicable', 
                 'cbond': 'not_applicable', 
                 'marvi': 'not_applicable', 
                 'exptl': 'not_applicable', 
                 'ferro': 'not_applicable', 
                 'corr': 'not_applicable', 
                 'blue_bright_varn_clean': 'not_applicable', 
                 'lustre': 'not_applicable', 
                 'jurofm': 'not_applicable', 
                 's': 'not_applicable', 'p': 'not_applicable', 'shape': 'COIL', 'thick': 1.601, 'width': 609.9, 'len': 0, 'oil': 'not_applicable', 'bore': '0', 'packing': 'not_applicable', 'classes': '3'}


try:
    dynamicModel = DynamicModel(**external_data)  
    print('ok.')
    print(repr(dynamicModel))
except ValidationError as e:
    print(e.errors())

[{'type': 'enum', 'loc': ('steel',), 'msg': "Input should be 'A', 'K', 'M', 'R', 'S', 'V', 'W' or 'not_applicable'", 'input': None, 'ctx': {'expected': "'A', 'K', 'M', 'R', 'S', 'V', 'W' or 'not_applicable'"}, 'url': 'https://errors.pydantic.dev/2.11/v/enum'}]


In [None]:
DynamicModel.model_json_schema()

{'$defs': {'BcEnum': {'enum': ['Y', 'not_applicable'],
   'title': 'BcEnum',
   'type': 'string'},
  'BfEnum': {'enum': ['Y', 'not_applicable'],
   'title': 'BfEnum',
   'type': 'string'},
  'BlEnum': {'enum': ['Y', 'not_applicable'],
   'title': 'BlEnum',
   'type': 'string'},
  'Blue_bright_varn_cleanEnum': {'enum': ['B', 'C', 'V', 'not_applicable'],
   'title': 'Blue_bright_varn_cleanEnum',
   'type': 'string'},
  'BoreEnum': {'enum': ['0', '500', '600'],
   'title': 'BoreEnum',
   'type': 'string'},
  'BtEnum': {'enum': ['Y', 'not_applicable'],
   'title': 'BtEnum',
   'type': 'string'},
  'Bw_meEnum': {'enum': ['B', 'M', 'not_applicable'],
   'title': 'Bw_meEnum',
   'type': 'string'},
  'CbondEnum': {'enum': ['Y', 'not_applicable'],
   'title': 'CbondEnum',
   'type': 'string'},
  'ChromEnum': {'enum': ['C', 'not_applicable'],
   'title': 'ChromEnum',
   'type': 'string'},
  'ClassesEnum': {'enum': ['1', '2', '3', '5', 'U'],
   'title': 'ClassesEnum',
   'type': 'string'},
  'Con

In [None]:
fields

{'family': (<enum 'FamilyEnum'>, Ellipsis),
 'product-type': (<enum 'Product-typeEnum'>, Ellipsis),
 'steel': (<enum 'SteelEnum'>, Ellipsis),
 'carbon': (typing.Annotated[int, Ge(ge=0), FieldInfo(annotation=NoneType, required=True, metadata=[Lt(lt=255)])],
  Ellipsis),
 'hardness': (typing.Annotated[int, Ge(ge=0), FieldInfo(annotation=NoneType, required=True, metadata=[Lt(lt=255)])],
  Ellipsis),
 'temper_rolling': (<enum 'Temper_rollingEnum'>, Ellipsis),
 'condition': (<enum 'ConditionEnum'>, Ellipsis),
 'formability': (<enum 'FormabilityEnum'>, Ellipsis),
 'strength': (int, Ellipsis),
 'non-ageing': (<enum 'Non-ageingEnum'>, Ellipsis),
 'surface-finish': (<enum 'Surface-finishEnum'>, Ellipsis),
 'surface-quality': (<enum 'Surface-qualityEnum'>, Ellipsis),
 'enamelability': (<enum 'EnamelabilityEnum'>, Ellipsis),
 'bc': (<enum 'BcEnum'>, Ellipsis),
 'bf': (<enum 'BfEnum'>, Ellipsis),
 'bt': (<enum 'BtEnum'>, Ellipsis),
 'bw_me': (<enum 'Bw_meEnum'>, Ellipsis),
 'bl': (<enum 'BlEnum'>,

# Try all datasets from tabarena (openML)

In [None]:
import pandas as pd
df_tabarena = pd.read_csv('https://raw.githubusercontent.com/TabArena/tabarena_dataset_curation/refs/heads/main/dataset_creation_scripts/metadata/tabarena_dataset_metadata.csv')
df_tabarena

Unnamed: 0,dataset_id,task_id,target_feature,is_classification,dataset_name,openml_dataset_name,problem_type,num_features,num_instances,num_classes,...,openml_num_repeats,tabarena_num_repeats,can_run_tabpfnv2,can_run_tabicl,reference,data_source,domain,year,licence,original_data_url
0,46904,363612,scaled-sound-pressure,False,airfoil_self_noise,airfoil_self_noise,regression,6.0,1503.0,,...,10,10,True,False,brooks1989airfoil,UCI,physics & astronomy,2014,CC BY 4.0,https://doi.org/10.24432/C5VW2C
1,46905,363613,ResourceApproved,True,Amazon_employee_access,Amazon_employee_access,binary,10.0,32769.0,2.0,...,10,3,False,True,hamner2013amazon,Kaggle,business & marketing,2010,Public Domain,https://www.kaggle.com/c/amazon-employee-acces...
2,46906,363614,classes,True,anneal,anneal,multiclass,39.0,898.0,5.0,...,10,10,True,True,uci1990annealing,UCI,chemistry & material science,1990,CC BY 4.0,https://doi.org/10.24432/C5RW2F
3,46907,363615,price,False,Another-Dataset-on-used-Fiat-500,Another-Dataset-on-used-Fiat-500,regression,8.0,1538.0,,...,10,10,True,False,paolocons2020fiat,Kaggle,technology & internet,2020,CC0: Public Domain,https://www.kaggle.com/datasets/paolocons/anot...
4,46908,363616,AirPressureSystemFailure,True,APSFailure,APSFailure,binary,171.0,76000.0,2.0,...,10,3,False,True,ida2016challenge,UCI,industry & manufacturing,2016,CC BY 4.0,https://doi.org/10.24432/C5V60Q
5,46910,363618,SubscribeTermDeposit,True,bank-marketing,bank-marketing,binary,14.0,45211.0,2.0,...,10,3,False,True,"moro2011using,moro2014data",UCI,finance,2012,CC BY 4.0,https://doi.org/10.24432/C5K306
6,46911,363619,churn,True,Bank_Customer_Churn,Bank_Customer_Churn,binary,11.0,10000.0,2.0,...,10,3,True,True,topre2022churn,Kaggle,finance,2020,Public,https://www.kaggle.com/datasets/gauravtopre/ba...
7,46912,363620,MoleculeElicitsResponse,True,Bioresponse,Bioresponse,binary,1777.0,3751.0,2.0,...,10,3,False,False,hamner2012bioresponse,Kaggle,biology & life sciences,2012,Public Domain,https://www.kaggle.com/c/bioresponse
8,46913,363621,DonatedBloodInMarch2007,True,blood-transfusion-service-center,blood-transfusion-service-center,binary,5.0,748.0,2.0,...,10,10,True,True,yeh2009knowledge,UCI,medical & healthcare,2008,CC BY 4.0,https://doi.org/10.24432/C5GS39
9,46915,363623,CustomerChurned,True,churn,churn,binary,20.0,5000.0,2.0,...,10,3,True,True,marcoulides2005discovering,OpenML,business & marketing,2005,MIT License,https://github.com/EpistasisLab/pmlb/tree/mast...


In [None]:
print(df_tabarena.iloc[0].dataset_id)

46904


In [None]:

# Create Enum from category values
def create_enum_from_categories(name, categories):
    return Enum(name, {str(cat): str(cat) for cat in categories})

CATEGORY_UNIQUE_VALUES_THRESHOLD = 50

# Map pandas dtype to Python type
def map_dtype(col, dtype, df):
    dtype_str = str(dtype)
    if dtype_str == "int64":
        return Optional[int] if df[col].isnull().any() else int
    elif dtype_str == "uint8":
        return Optional[Annotated[NonNegativeInt, Field(lt=255)]] if df[col].isnull().any() else Annotated[NonNegativeInt, Field(lt=255)] 
    elif dtype_str == "float64":
        return Optional[float] if df[col].isnull().any() else float
    elif dtype_str == "bool":
        return Optional[bool] if df[col].isnull().any() else bool
    elif dtype_str == "object":
        return Optional[str] if df[col].isnull().any() else str
    elif dtype_str == "datetime64[ns]":
        return Optional[datetime.datetime] if df[col].isnull().any() else datetime.datetime
    elif dtype_str == "timedelta64[ns]":
        return Optional[datetime.timedelta] if df[col].isnull().any() else datetime.timedelta
    elif dtype_str == "category":
        enum_type = create_enum_from_categories(f"{col.capitalize()}Enum", df[col].cat.categories)
        return Optional[enum_type] if df[col].isnull().any() else enum_type
    else:
        return Optional[str]
    

def checkAllTabarenaDatasets():
    # Iterare all datasets
    for id in df_tabarena['dataset_id']:
        print(id)

        # Get the dataset by ID
        dataset = openml.datasets.get_dataset(id)
        X, y, categorical_indicator, attribute_names = dataset.get_data(dataset_format="dataframe")
        print('Number of rows:', len(X))

        # Check if the number unique values of a category column exceeds the threshold -> if so, change it to str type
        # Also replace 'nan' (in str) to None
        for col in X.select_dtypes(include='category').columns:
            nunique = X[col].nunique()
            X[col] = X[col].astype("str").replace('nan', None)
            if nunique > CATEGORY_UNIQUE_VALUES_THRESHOLD:
                print(col, 'has', nunique, 'unique values. Changed to str dtype.')
            else:
                X[col] = X[col].astype("category")

        # Replace all 'nan' (as string) values to None and infer the dtypes of each column
        X = X.replace('nan', None)
        X = X.infer_objects()
        
        # Build fields for Pydantic model
        fields = {
            col: (map_dtype(col, dtype, X), ...)
            for col, dtype in X.dtypes.items()
        }

        # Create dynamic Pydantic model
        DynamicModel = create_model("DynamicModel", **fields)

        # Replace the nan values for None
        X_replaced_nan_for_none = X.astype(object).where(pd.notnull(X), None)

        # Validate each row
        validated = [DynamicModel(**row.to_dict()) for _, row in X_replaced_nan_for_none.iterrows()]
        print('Number of validated rows:', len(validated))
        print(repr(validated[0]))
        print()
        # Print validated models
        #for item in validated:
        #    print(item)


In [None]:
checkAllTabarenaDatasets()

46904
Number of rows: 1503
Number of validated rows: 1503
DynamicModel(frequency=8000, attack-angle=<Attack-angleEnum.1.5: '1.5'>, chord-length=0.3048, free-stream-velocity=71.3, suction-side-displacement-thickness=0.00336729, scaled-sound-pressure=115.372)

46905
Number of rows: 32769
RESOURCE has 7518 unique values. Changed to str dtype.
MGR_ID has 4243 unique values. Changed to str dtype.
ROLE_ROLLUP_1 has 128 unique values. Changed to str dtype.
ROLE_ROLLUP_2 has 177 unique values. Changed to str dtype.
ROLE_DEPTNAME has 449 unique values. Changed to str dtype.
ROLE_TITLE has 343 unique values. Changed to str dtype.
ROLE_FAMILY_DESC has 2358 unique values. Changed to str dtype.
ROLE_FAMILY has 67 unique values. Changed to str dtype.
ROLE_CODE has 343 unique values. Changed to str dtype.
Number of validated rows: 32769
DynamicModel(ResourceApproved=<ResourceapprovedEnum.Yes: 'Yes'>, RESOURCE='37793', MGR_ID='81744', ROLE_ROLLUP_1='117902', ROLE_ROLLUP_2='117903', ROLE_DEPTNAME='1187

In [None]:
fields

{'Var1': (typing.Optional[float], Ellipsis),
 'Var2': (typing.Optional[float], Ellipsis),
 'Var3': (typing.Optional[float], Ellipsis),
 'Var4': (typing.Optional[float], Ellipsis),
 'Var5': (typing.Optional[float], Ellipsis),
 'Var6': (typing.Optional[float], Ellipsis),
 'Var7': (typing.Optional[float], Ellipsis),
 'Var9': (typing.Optional[float], Ellipsis),
 'Var10': (typing.Optional[float], Ellipsis),
 'Var11': (typing.Optional[float], Ellipsis),
 'Var12': (typing.Optional[float], Ellipsis),
 'Var13': (typing.Optional[float], Ellipsis),
 'Var14': (typing.Optional[float], Ellipsis),
 'Var16': (typing.Optional[float], Ellipsis),
 'Var17': (typing.Optional[float], Ellipsis),
 'Var18': (typing.Optional[float], Ellipsis),
 'Var19': (typing.Optional[float], Ellipsis),
 'Var21': (typing.Optional[float], Ellipsis),
 'Var22': (typing.Optional[float], Ellipsis),
 'Var23': (typing.Optional[float], Ellipsis),
 'Var24': (typing.Optional[float], Ellipsis),
 'Var25': (typing.Optional[float], Ellipsi

In [None]:

# Create Enum from category values
def create_enum_from_categories(name, categories):
    return Enum(name, {str(cat): str(cat) for cat in categories})

CATEGORY_UNIQUE_VALUES_THRESHOLD = 50

# Map pandas dtype to Python type
def map_dtype(col, dtype, df):
    dtype_str = str(dtype)
    if dtype_str == "int64":
        return Optional[int] if df[col].isnull().any() else int
    elif dtype_str == "uint8":
        return Optional[Annotated[NonNegativeInt, Field(lt=255)]] if df[col].isnull().any() else Annotated[NonNegativeInt, Field(lt=255)] 
    elif dtype_str == "float64":
        return Optional[float] if df[col].isnull().any() else float
    elif dtype_str == "bool":
        return Optional[bool] if df[col].isnull().any() else bool
    elif dtype_str == "object":
        return Optional[str] if df[col].isnull().any() else str
    elif dtype_str == "datetime64[ns]":
        return Optional[datetime.datetime] if df[col].isnull().any() else datetime.datetime
    elif dtype_str == "timedelta64[ns]":
        return Optional[datetime.timedelta] if df[col].isnull().any() else datetime.timedelta
    elif dtype_str == "category":

        # If the number of unique values exceeds the threshold, set the type as str
        #if df[col].nunique() > CATEGORY_UNIQUE_VALUES_THRESHOLD:
        #    return Optional[str] if df[col].isnull().any() else str
        
        enum_type = create_enum_from_categories(f"{col.capitalize()}Enum", df[col].cat.categories)
        return Optional[enum_type] if df[col].isnull().any() else enum_type
    else:
        return Optional[str]
    

def checkAllTabarenaDatasets():
    # Iterare all datasets
    for id in df_tabarena['dataset_id']:
        print(id)

        # Get the dataset by ID
        dataset = openml.datasets.get_dataset(id)
        X, y, categorical_indicator, attribute_names = dataset.get_data(dataset_format="dataframe")
        print('Number of rows:', len(X))

        # Check if the number unique values of a category column exceeds the threshold -> if so, change it to str type
        # Also replace 'nan' (in str) to None
        for col in X.select_dtypes(include='category').columns:
            nunique = X[col].nunique()
            X[col] = X[col].astype("str").replace('nan', None)
            if nunique > CATEGORY_UNIQUE_VALUES_THRESHOLD:
                print(col, 'has', nunique, 'unique values. Changed to str dtype.')
            else:
                X[col] = X[col].astype("category")

        # Replace all 'nan' values to None and infer the dtypes of each column
        X = X.replace('nan', None)
        X = X.infer_objects()
        
        # Build fields for Pydantic model
        fields = {
            col: (map_dtype(col, dtype, X), ...)
            for col, dtype in X.dtypes.items()
        }

        # Create dynamic Pydantic model
        DynamicModel = create_model("DynamicModel", **fields)

        # Replace the nan values for None
        X_replaced_nan_for_none = X.astype(object).where(pd.notnull(X), None)

        # Validate each row
        validated = [DynamicModel(**row.to_dict()) for _, row in X_replaced_nan_for_none.iterrows()]
        print('Number of validated rows:', len(validated))
        print(repr(validated[0]))
        print()
        # Print validated models
        #for item in validated:
        #    print(item)


# Import iterators and create_model function

In [1]:
from TabArenaIterator import TabArenaIterator
from pydantic import BaseModel, ValidationError
import pydantic_create_model as pdcm

pydantic_models: dict[str, BaseModel] = {}
tabArenaURL = 'https://raw.githubusercontent.com/TabArena/tabarena_dataset_curation/refs/heads/main/dataset_creation_scripts/metadata/tabarena_dataset_metadata.csv'
iterator = TabArenaIterator(tabArenaURL)
for row, df in iterator:
    model = pdcm.create_pydantic_model(str(row.dataset_id), df)
    pydantic_models[str(row.dataset_id)] = model

46904
46905
RESOURCE has 7518 unique values. Changed to str dtype.
MGR_ID has 4243 unique values. Changed to str dtype.
ROLE_ROLLUP_1 has 128 unique values. Changed to str dtype.
ROLE_ROLLUP_2 has 177 unique values. Changed to str dtype.
ROLE_DEPTNAME has 449 unique values. Changed to str dtype.
ROLE_TITLE has 343 unique values. Changed to str dtype.
ROLE_FAMILY_DESC has 2358 unique values. Changed to str dtype.
ROLE_FAMILY has 67 unique values. Changed to str dtype.
ROLE_CODE has 343 unique values. Changed to str dtype.
46906
46907
46908
46910
46911
46912
46913
46915
state has 51 unique values. Changed to str dtype.
46916
46917
46918
46919
46920
46921
46922
medical_specialty has 70 unique values. Changed to str dtype.
diag_1 has 696 unique values. Changed to str dtype.
diag_2 has 725 unique values. Changed to str dtype.
diag_3 has 758 unique values. Changed to str dtype.
46923
46924
46927
46928
Delivery_person_ID has 1320 unique values. Changed to str dtype.
46929
46930
46931
46932
46

In [5]:
pydantic_models

{'46904': pydantic_create_model.46904,
 '46905': pydantic_create_model.46905,
 '46906': pydantic_create_model.46906,
 '46907': pydantic_create_model.46907,
 '46908': pydantic_create_model.46908,
 '46910': pydantic_create_model.46910,
 '46911': pydantic_create_model.46911,
 '46912': pydantic_create_model.46912,
 '46913': pydantic_create_model.46913,
 '46915': pydantic_create_model.46915,
 '46916': pydantic_create_model.46916,
 '46917': pydantic_create_model.46917,
 '46918': pydantic_create_model.46918,
 '46919': pydantic_create_model.46919,
 '46920': pydantic_create_model.46920,
 '46921': pydantic_create_model.46921,
 '46922': pydantic_create_model.46922,
 '46923': pydantic_create_model.46923,
 '46924': pydantic_create_model.46924,
 '46927': pydantic_create_model.46927,
 '46928': pydantic_create_model.46928,
 '46929': pydantic_create_model.46929,
 '46930': pydantic_create_model.46930,
 '46931': pydantic_create_model.46931,
 '46932': pydantic_create_model.46932,
 '46933': pydantic_create

In [6]:
type(pydantic_models.get('46906'))

pydantic._internal._model_construction.ModelMetaclass

In [10]:
X.dtypes

family                    category
product-type              category
steel                     category
carbon                       uint8
hardness                     uint8
temper_rolling            category
condition                 category
formability               category
strength                     int64
non-ageing                category
surface-finish            category
surface-quality           category
enamelability             category
bc                        category
bf                        category
bt                        category
bw_me                     category
bl                        category
m                         category
chrom                     category
phos                      category
cbond                     category
marvi                     category
exptl                     category
ferro                     category
corr                      category
blue_bright_varn_clean    category
lustre                    category
jurofm              

In [10]:
external_data = {'family': 'not_applicable', 
                 'product-type': 'g', 
                 'steel': 'M', 
                 'carbon': -5, 
                 'hardness': 0, 
                 'temper_rolling': 'not_applicable', 
                 'condition': 'not_applicable', 
                 'formability': 'not_applicable', 
                 'strength': 350, 
                 'non-ageing': 'not_applicable', 
                 'surface-finish': 'not_applicable', 
                 'surface-quality': 'G', 
                 'enamelability': 'not_applicable', 
                 'bc': 'not_applicable', 
                 'bf': 'not_applicable', 
                 'bt': 'not_applicable', 
                 'bw_me': 'not_applicable', 
                 'bl': 'not_applicable', 
                 'm': 'not_applicable', 
                 'chrom': 'not_applicable', 
                 'phos': 'not_applicable', 
                 'cbond': 'not_applicable', 
                 'marvi': 'not_applicable', 
                 'exptl': 'not_applicable', 
                 'ferro': 'not_applicable', 
                 'corr': 'not_applicable', 
                 'blue_bright_varn_clean': 'not_applicable', 
                 'lustre': 'not_applicable', 
                 'jurofm': 'not_applicable', 
                 's': 'not_applicable', 'p': 'not_applicable', 'shape': 'COIL', 'thick': 1.601, 'width': 609.9, 'len': 0, 'oil': 'not_applicable', 'bore': '0', 'packing': 'not_applicable', 'classes': '3'}


try:
    dynamicModel = pydantic_models.get('46906')(**external_data)  
    print('ok.')
    print(repr(dynamicModel))
except ValidationError as e:
    print(e.errors())

[{'type': 'enum', 'loc': ('product-type',), 'msg': "Input should be 'C'", 'input': 'g', 'ctx': {'expected': "'C'"}, 'url': 'https://errors.pydantic.dev/2.11/v/enum'}, {'type': 'greater_than_equal', 'loc': ('carbon',), 'msg': 'Input should be greater than or equal to 0', 'input': -5, 'ctx': {'ge': 0}, 'url': 'https://errors.pydantic.dev/2.11/v/greater_than_equal'}]


In [None]:
pydantic_models.get('46906').model_json_schema()

{'$defs': {'BcEnum': {'enum': ['Y', 'not_applicable'],
   'title': 'BcEnum',
   'type': 'string'},
  'BfEnum': {'enum': ['Y', 'not_applicable'],
   'title': 'BfEnum',
   'type': 'string'},
  'BlEnum': {'enum': ['Y', 'not_applicable'],
   'title': 'BlEnum',
   'type': 'string'},
  'Blue_bright_varn_cleanEnum': {'enum': ['B', 'C', 'V', 'not_applicable'],
   'title': 'Blue_bright_varn_cleanEnum',
   'type': 'string'},
  'BoreEnum': {'enum': ['0', '500', '600'],
   'title': 'BoreEnum',
   'type': 'string'},
  'BtEnum': {'enum': ['Y', 'not_applicable'],
   'title': 'BtEnum',
   'type': 'string'},
  'Bw_meEnum': {'enum': ['B', 'M', 'not_applicable'],
   'title': 'Bw_meEnum',
   'type': 'string'},
  'CbondEnum': {'enum': ['Y', 'not_applicable'],
   'title': 'CbondEnum',
   'type': 'string'},
  'ChromEnum': {'enum': ['C', 'not_applicable'],
   'title': 'ChromEnum',
   'type': 'string'},
  'ClassesEnum': {'enum': ['1', '2', '3', '5', 'U'],
   'title': 'ClassesEnum',
   'type': 'string'},
  'Con

In [None]:
import pandas as pd
series = [
    ('a@a.com','Bill', 'Schneider', 123, 321, 20190502),
    ('a@a.com', 'Damian', 'Schneider', 124, 231, 20190502),
    ('b@b.com', 'Bill', 'Schneider',164, 313, 20190503),
    ('a@a.com','Bill', 'Schneider', 123, 321, 20190502),
    ('b@b.com', 'Bill', 'Schneider',164, 313, 20190503),
    ]

# Create a DataFrame object
df = pd.DataFrame(series, columns=['email', 'first name', 'last name', 'C_ID', 'A_ID', 'CreatedDate'])

# Find duplicate rows
df_duplicates = df[df.duplicated()]
print(df_duplicates)