## Creating a Lilac dataset


In [1]:
from IPython.display import display
import lilac as ll

  from .autonotebook import tqdm as notebook_tqdm


### From HuggingFace


In [3]:
# Source: the `ax` config of the `glue` dataset on HuggingFace.
hf_source = ll.HuggingFaceSource(dataset_name='glue', config_name='ax')
config = ll.DatasetConfig(namespace='local', name='glue', source=hf_source)
# Read the source and create the dataset under the `local` namespace.
dataset = ll.create_dataset(config)

Found cached dataset glue (/Users/dsmilkov/.cache/huggingface/datasets/glue/ax/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 1/1 [00:00<00:00, 428.60it/s]
Reading from source huggingface...: 100%|██████████| 1104/1104 [00:00<00:00, 33616.55it/s]

Dataset "glue" written to ./data/datasets/local/glue





### From CSV


In [6]:
# The CSV source is pointed directly at a remote URL.
url = 'https://storage.googleapis.com/lilac-data/datasets/the_movies_dataset/the_movies_dataset.csv'
csv_source = ll.CSVSource(filepaths=[url])
config = ll.DatasetConfig(namespace='local', name='the_movies_dataset', source=csv_source)
dataset = ll.create_dataset(config)

Downloading from url https://storage.googleapis.com/lilac-data/datasets/the_movies_dataset/the_movies_dataset.csv to /tmp/./data/local_cache/7134d22e156b4fceb0000b16b61c7666


Reading from source csv...: 100%|██████████| 45460/45460 [00:00<00:00, 50050.20it/s]


Dataset "the_movies_dataset" written to ./data/datasets/local/the_movies_dataset


### From JSON


In [4]:
# The JSON source is pointed at a remote `.jsonl` file.
headlines_url = (
  'https://raw.githubusercontent.com/explosion/prodigy-recipes/master/example-datasets/news_headlines.jsonl'
)
json_source = ll.JSONSource(filepaths=[headlines_url])
config = ll.DatasetConfig(namespace='local', name='news_headlines', source=json_source)
dataset = ll.create_dataset(config)

Downloading from url https://raw.githubusercontent.com/explosion/prodigy-recipes/master/example-datasets/news_headlines.jsonl to /tmp/./data/local_cache/2e1090cdca7e4afda99eabcf576874f6


Reading from source json...: 100%|██████████| 200/200 [00:00<00:00, 118149.41it/s]

Dataset "news_headlines" written to ./data/datasets/local/news_headlines





### From Pandas


In [5]:
import pandas as pd

# Load the CSV into an in-memory DataFrame first.
url = 'https://storage.googleapis.com/lilac-data-us-east1/datasets/csv_datasets/the_movies_dataset/the_movies_dataset.csv'
df = pd.read_csv(url, low_memory=False)

# Hand the DataFrame to Lilac as the dataset source.
pandas_source = ll.PandasSource(df)
config = ll.DatasetConfig(namespace='local', name='the_movies_dataset2', source=pandas_source)
dataset = ll.create_dataset(config)

Reading from source pandas...: 100%|██████████| 45466/45466 [00:00<00:00, 54764.93it/s]


Dataset "the_movies_dataset2" written to ./data/datasets/local/the_movies_dataset2


## Visualize the data

Now that we have imported a few datasets, let's visualize them to see what they look like.


In [None]:
# Start the Lilac web server so the imported datasets can be browsed in the UI.
# The server logs the address it is listening on once startup completes.
ll.start_server()

INFO:     Started server process [47521]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:5432 (Press CTRL+C to quit)


### Stopping the server


In [21]:
# `stop_server` is awaited, so this cell relies on the notebook's top-level `await` support.
await ll.stop_server()

INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.


## Query a dataset


In [7]:
# Fetch the previously created movies dataset and preview three of its columns.
dataset = ll.get_dataset('local', 'the_movies_dataset')
r = dataset.select_rows(['title', 'budget', 'overview'], limit=5)
print('Total number of rows', r.total_num_rows)
# Use rich display (as the search cells further down do) so the frame renders as a
# table instead of a wrapped plain-text dump.
display(r.df())

Total number of rows 45460
                         title    budget  \
0                    Toy Story  30000000   
1                      Jumanji  65000000   
2             Grumpier Old Men         0   
3            Waiting to Exhale  16000000   
4  Father of the Bride Part II         0   

                                            overview  
0  Led by Woody, Andy's toys live happily in his ...  
1  When siblings Judy and Peter discover an encha...  
2  A family wedding reignites the ancient feud be...  
3  Cheated on, mistreated and stepped on, the wom...  
4  Just when George Banks has recovered from his ...  


In [9]:
# Select with the '*' wildcard and preview the first five rows.
r = dataset.select_rows(['*'], limit=5)
# Rich display keeps the wide frame in one table; `print` wraps it into several text blocks.
display(r.df())

   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                               None  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3  False                                               None  16000000   
4  False  {'id': 96871, 'name': 'Father of the Bride Col...         0   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   
3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
4                     [{'id': 35, 'name': 'Comedy'}]   

                               homepage     id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story    862  tt0114709                en   
1                                  None   8844  tt0113497         

In [8]:
# The manifest reports the dataset's namespace, name and field schema.
manifest = dataset.manifest()
print(manifest)

namespace='local' dataset_name='the_movies_dataset' data_schema={
  "fields": {
    "adult": {
      "dtype": "boolean"
    },
    "belongs_to_collection": {
      "dtype": "string"
    },
    "budget": {
      "dtype": "int64"
    },
    "genres": {
      "dtype": "string"
    },
    "homepage": {
      "dtype": "string"
    },
    "id": {
      "dtype": "int64"
    },
    "imdb_id": {
      "dtype": "string"
    },
    "original_language": {
      "dtype": "string"
    },
    "original_title": {
      "dtype": "string"
    },
    "overview": {
      "dtype": "string"
    },
    "popularity": {
      "dtype": "float64"
    },
    "poster_path": {
      "dtype": "string"
    },
    "production_companies": {
      "dtype": "string"
    },
    "production_countries": {
      "dtype": "string"
    },
    "release_date": {
      "dtype": "timestamp"
    },
    "revenue": {
      "dtype": "int64"
    },
    "runtime": {
      "dtype": "float64"
    },
    "spoken_languages": {
      "dtype"

## Enriching an unstructured field with metadata


In [9]:
# Compute the PII signal over the `overview` field.
pii_signal = ll.PIISignal()
dataset.compute_signal(pii_signal, 'overview')

Computing pii on local/the_movies_dataset:('overview',): 100%|██████████| 45460/45460 [00:45<00:00, 1002.44it/s]


Computing signal "pii" on local/the_movies_dataset:('overview',) took 45.390s.
Wrote signal output to ./data/datasets/local/the_movies_dataset/overview/pii


In [10]:
# Compute the language-detection signal over the `overview` field.
lang_detection_signal = ll.LangDetectionSignal()
dataset.compute_signal(lang_detection_signal, 'overview')

Computing lang_detection: 100%|██████████| 45460/45460 [01:31<00:00, 494.44it/s]


Computing signal "lang_detection" took 91.983s.
Wrote signal output to ./data/datasets/local/the_movies_dataset/overview/lang_detection


In [11]:
# Compute the near-duplicate signal over the `overview` field.
near_dup_signal = ll.NearDuplicateSignal()
dataset.compute_signal(near_dup_signal, 'overview')

Fingerprinting...: 44506it [00:06, 7055.80it/s]0:00<?, ?it/s]
Computing hash collisions...: 100%|██████████| 5/5 [00:01<00:00,  4.31it/s]
Clustering...: 100%|██████████| 21/21 [00:00<00:00, 154.89it/s]
Computing near_dup: 100%|██████████| 45460/45460 [00:07<00:00, 5837.09it/s]


Computing signal "near_dup" took 7.824s.
Wrote signal output to ./data/datasets/local/the_movies_dataset/overview/near_dup


## Searching


### Compute embedding to enable advanced search

Let's compute the `SBERT` embedding on device for the `overview` field.


In [12]:
# Compute the `sbert` embedding for `overview`; the semantic and concept searches
# later in this notebook refer to the embedding by this name.
embedding_name, embedded_field = 'sbert', 'overview'
dataset.compute_embedding(embedding_name, embedded_field)

Computing sbert: 100%|██████████| 45460/45460 [01:47<00:00, 422.10it/s]


Computing signal "sbert" took 107.781s.
Wrote signal output to ./data/datasets/local/the_movies_dataset/overview/sbert


### Keyword search


In [13]:
# Keyword search for 'Aliens' in the `overview` field, limited to five rows.
query = ll.KeywordSearch(path='overview', query='Aliens')
keyword_searches = [query]
r = dataset.select_rows(['overview'], searches=keyword_searches, limit=5)
keyword_results = r.df()
display(keyword_results)

Computing signal "substring_search" on local/the_movies_dataset:('overview',) took 0.001s.


Unnamed: 0,overview,overview.substring_search(query=Aliens)
0,When Environmental Protection Agency inspector...,"[{'__value__': {'start': 422, 'end': 428}}]"
1,Aliens who've come to earth to spawn deep bene...,"[{'__value__': {'start': 0, 'end': 6}}]"
2,Marcus is a kid on Manhattan's mean streets. H...,"[{'__value__': {'start': 133, 'end': 139}}]"
3,"Two hundred years after Lt. Ripley died, a gro...","[{'__value__': {'start': 168, 'end': 174}}]"
4,"In California, an old man (Bela Lugosi) grieve...","[{'__value__': {'start': 605, 'end': 611}}]"


### Semantic search


In [14]:
# Semantic search over `overview`, using the `sbert` embedding.
query = ll.SemanticSearch(
  path='overview',
  query='Aliens have invaded the earth',
  embedding='sbert',
)
semantic_searches = [query]
r = dataset.select_rows(['overview'], searches=semantic_searches, limit=5)
semantic_results = r.df()
display(semantic_results)

Loading vector store "hnsw" for local/the_movies_dataset:('overview',) with embedding "sbert" took 0.410s.
Computing topk on local/the_movies_dataset:('overview',) with embedding "sbert" and vector store "hnsw" took 0.738s.
Computing signal "semantic_similarity" on local/the_movies_dataset:('overview',) took 0.001s.


Unnamed: 0,overview,"overview.semantic_similarity(embedding=sbert,query=Aliens have invaded the earth)"
0,"The Earth is invaded by alien parasites aka ""s...","[{'__value__': {'start': 0, 'end': 165}, 'scor..."
1,"Aliens invade, this time delivering a clear ul...","[{'__value__': {'start': 0, 'end': 186}, 'scor..."
2,Aliens pretending to be friendly come to Earth...,"[{'__value__': {'start': 0, 'end': 285}, 'scor..."
3,The nations of the Earth unite in a common cau...,"[{'__value__': {'start': 0, 'end': 90}, 'score..."
4,"Aliens have landed and are hiding on Earth, bu...","[{'__value__': {'start': 0, 'end': 114}, 'scor..."


### Conceptual search


In [15]:
# Concept search: query `overview` with the `lilac/profanity` concept and the `sbert` embedding.
query = ll.ConceptSearch(
  path='overview',
  concept_namespace='lilac',
  concept_name='profanity',
  embedding='sbert',
)
concept_searches = [query]
r = dataset.select_rows(['overview'], searches=concept_searches, limit=5)
concept_results = r.df()
display(concept_results)

Computing embeddings for "lilac/profanity/sbert" took 3.171s.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has alread

Unnamed: 0,overview,overview.lilac/profanity/labels,overview.lilac/profanity/sbert
0,Mick O'Brien is a young Chicago street thug to...,,"[{'__value__': {'start': 0, 'end': 294}, 'scor..."
1,Rin Okumura is raised by a famous exorcist nam...,,"[{'__value__': {'start': 0, 'end': 315}, 'scor..."
2,Ass-breaker Dingus Magee is looking for a gold...,,"[{'__value__': {'start': 0, 'end': 398}, 'scor..."
3,While stopped at a roadside phone booth for tr...,,"[{'__value__': {'start': 0, 'end': 393}, 'scor..."
4,"Venom regulars Philip Kwok, Chiang Sheng, and ...",,"[{'__value__': {'start': 0, 'end': 397}, 'scor..."


## Downloading the enriched dataset


In [3]:
# Export the dataset to a JSON file (relative path).
json_path = 'the_movies_dataset.json'
dataset.to_json(json_path)

Dataset exported to the_movies_dataset.json


In [5]:
# Export the dataset to a Parquet file (relative path).
parquet_path = 'the_movies_dataset.parquet'
dataset.to_parquet(parquet_path)

Dataset exported to the_movies_dataset.parquet


In [6]:
# Export the dataset to a CSV file (relative path).
csv_path = 'the_movies_dataset.csv'
dataset.to_csv(csv_path)

Dataset exported to the_movies_dataset.csv


In [9]:
# Convert the dataset to a pandas DataFrame and show its first five rows.
dataset.to_pandas().head(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,__line_number__,__rowid__
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415,0,21f248b70d4e4efbbeebe41027f1acf7
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413,1,5b3df6d6824646ecb91941971930e625
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92,2,3f2051460298402e817ed859fe425528
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34,3,f83112ea28fc4a41b44bbd2784ccb130
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173,4,bffb8f13612a46d1b995ac405b82f27c


## Using concepts


### Use the positive-sentiment concept


In [16]:
# Score a sample sentence against the `positive-sentiment` concept in the `lilac` namespace.
signal = ll.signals.ConceptSignal(
  namespace='lilac',
  concept_name='positive-sentiment',
  embedding='gte-small',
)

sample_texts = ['This product is amazing, thank you!']
# Materialise the result of `compute` into a list so it can be printed.
result = [item for item in signal.compute(sample_texts)]

print(result)


[[{'__value__': {'start': 0, 'end': 35}, 'score': 0.9302721936202625}]]


### Create a positive product reviews concept


In [17]:
db = ll.DiskConceptDB()

concepts = db.list()
# Only create the concept when it is not already listed.
already_exists = any(
  c.namespace == 'local' and c.name == 'positive-product-reviews' for c in concepts)
if not already_exists:
  db.create('local', 'positive-product-reviews')

#### Add a few training examples


In [6]:
# (label, text) pairs: two negative and two positive examples.
labeled_texts = [
  (False, 'The quick brown fox jumps over the lazy dog.'),
  (False, 'This is a random sentence.'),
  (True, 'This product is amazing!'),
  (True, 'Thank you for your awesome work on this UI.'),
]
train_data = [ll.ExampleIn(label=label, text=text) for label, text in labeled_texts]
# Insert the examples; the updated concept is the cell's displayed result.
db.edit('local', 'positive-product-reviews', ll.ConceptUpdate(insert=train_data))

Concept(namespace='local', concept_name='positive-product-reviews', type=text, data={'b5213e639f1d45dfacbb12e127529f2d': Example(label=False, text='The quick brown fox jumps over the lazy dog.', img=None, origin=None, draft='main', id='b5213e639f1d45dfacbb12e127529f2d'), '6b7492e786fd4bf4ac3c84fa999d5fab': Example(label=False, text='This is a random sentence.', img=None, origin=None, draft='main', id='6b7492e786fd4bf4ac3c84fa999d5fab'), '733d90b4fa88496a9fe636a2dd3c8337': Example(label=True, text='This product is amazing!', img=None, origin=None, draft='main', id='733d90b4fa88496a9fe636a2dd3c8337'), 'f3d5059fcffd4a9a863c8c4ce8e74d89': Example(label=True, text='Thank you for your awesome work on this UI.', img=None, origin=None, draft='main', id='f3d5059fcffd4a9a863c8c4ce8e74d89')}, version=1, tags=[], description=None)

#### Show the examples in the concept


In [7]:
concept = db.get('local', 'positive-product-reviews')

# Only print when the lookup returned a concept.
if concept:
  examples_by_id = concept.data
  print(examples_by_id)

{'b5213e639f1d45dfacbb12e127529f2d': Example(label=False, text='The quick brown fox jumps over the lazy dog.', img=None, origin=None, draft='main', id='b5213e639f1d45dfacbb12e127529f2d'), '6b7492e786fd4bf4ac3c84fa999d5fab': Example(label=False, text='This is a random sentence.', img=None, origin=None, draft='main', id='6b7492e786fd4bf4ac3c84fa999d5fab'), '733d90b4fa88496a9fe636a2dd3c8337': Example(label=True, text='This product is amazing!', img=None, origin=None, draft='main', id='733d90b4fa88496a9fe636a2dd3c8337'), 'f3d5059fcffd4a9a863c8c4ce8e74d89': Example(label=True, text='Thank you for your awesome work on this UI.', img=None, origin=None, draft='main', id='f3d5059fcffd4a9a863c8c4ce8e74d89')}


#### Remove examples


In [None]:
# Example ids are generated when an example is inserted, so a hard-coded id will not
# match any example on a fresh run. Insert a throwaway example, look up the id it was
# given, and remove it again; the original training examples are left untouched.
throwaway_text = 'This example will be removed.'
concept = db.edit(
  'local', 'positive-product-reviews',
  ll.ConceptUpdate(insert=[ll.ExampleIn(label=False, text=throwaway_text)]))
ids_to_remove = [
  example_id for example_id, example in concept.data.items() if example.text == throwaway_text
]
db.edit('local', 'positive-product-reviews', ll.ConceptUpdate(remove=ids_to_remove))

#### Use the new concept


In [8]:
# Score a sample sentence against the `local/positive-product-reviews` concept.
signal = ll.signals.ConceptSignal(
  namespace='local',
  concept_name='positive-product-reviews',
  embedding='gte-small',
)

sample_texts = ['This product is amazing, thank you!']
# Materialise the result of `compute` into a list so it can be printed.
result = [item for item in signal.compute(sample_texts)]

print(result)

Computing embeddings for "local/positive-product-reviews/gte-small" took 2.169s.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after par

#### Concept metrics

To compute metrics for a concept, we first have to instantiate a concept model.


In [11]:
# The concept-model database is constructed from a concept database.
concept_db = ll.DiskConceptDB()
model_db = ll.DiskConceptModelDB(concept_db)

model = model_db.get('local', 'positive-product-reviews', embedding_name='gte-small')

# Only report metrics when the lookup returned a model.
if model:
  metrics = model.get_metrics()
  print(metrics)



f1=0.5555555555555556 precision=0.5 recall=1.0 roc_auc=0.0 overall=<OverallScore.OK: 'ok'>


#### Remove the concept


In [12]:
# Clean up: remove the concept created in this section.
concept_namespace, concept_name = 'local', 'positive-product-reviews'
db.remove(concept_namespace, concept_name)