## Creating a Lilac dataset


In [1]:
from IPython.display import display
import lilac as ll

  from .autonotebook import tqdm as notebook_tqdm


### From HuggingFace


In [3]:
# Describe the source: the `ax` configuration of the HuggingFace `glue` dataset.
source_config = ll.HuggingFaceDataset(dataset_name='glue', config_name='ax')
# Import it as the Lilac dataset `glue` under the `local` namespace.
dataset = ll.create_dataset('local', 'glue', source_config)

Found cached dataset glue (/Users/dsmilkov/.cache/huggingface/datasets/glue/ax/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 1/1 [00:00<00:00, 518.65it/s]
Reading from source huggingface...: 100%|██████████| 1104/1104 [00:00<00:00, 37553.62it/s]

Dataset "glue" written to ./data/datasets/local/glue





### From CSV


In [5]:
# Remote CSV file; the recorded output shows it being downloaded to a local cache first.
url = 'https://storage.googleapis.com/lilac-data-us-east1/datasets/csv_datasets/the_movies_dataset/the_movies_dataset.csv'
# `filepaths` takes a list, so a single URL is wrapped in one.
source_config = ll.CSVDataset(filepaths=[url])
# Import it as the Lilac dataset `the_movies_dataset` under the `local` namespace.
dataset = ll.create_dataset('local', 'the_movies_dataset', source_config)

Downloading from url https://storage.googleapis.com/lilac-data-us-east1/datasets/csv_datasets/the_movies_dataset/the_movies_dataset.csv to /tmp/./data/local_cache/1f52e406f4594992b7a8b60776234f1d


Reading from source csv...: 100%|██████████| 45460/45460 [00:00<00:00, 51091.33it/s]


Dataset "the_movies_dataset" written to ./data/datasets/local/the_movies_dataset


### From JSON


In [5]:
# Describe the source: a remote JSON Lines (`.jsonl`) file passed as a one-element list.
source_config = ll.JSONDataset(filepaths=[
  'https://raw.githubusercontent.com/explosion/prodigy-recipes/master/example-datasets/news_headlines.jsonl'
])
# Import it as the Lilac dataset `news_headlines` under the `local` namespace.
dataset = ll.create_dataset('local', 'news_headlines', source_config)

Downloading from url https://raw.githubusercontent.com/explosion/prodigy-recipes/master/example-datasets/news_headlines.jsonl to /tmp/./data/local_cache/452137e2b28c444095efaba00674e4e5


Reading from source json...: 100%|██████████| 200/200 [00:00<00:00, 126907.84it/s]

Dataset "news_headlines" written to ./data/datasets/local/news_headlines





### From Pandas


In [4]:
import pandas as pd

# Same movies CSV as in the "From CSV" section, this time loaded through pandas.
url = 'https://storage.googleapis.com/lilac-data-us-east1/datasets/csv_datasets/the_movies_dataset/the_movies_dataset.csv'
# NOTE(review): `low_memory=False` presumably avoids pandas' chunked mixed-dtype
# inference on this file -- confirm it is still needed.
df = pd.read_csv(url, low_memory=False)
source_config = ll.PandasDataset(df)
# Written under a different name (`the_movies_dataset2`) so it does not replace the CSV import.
# NOTE(review): the recorded outputs show 45466 rows here vs 45460 for the CSV
# source -- confirm which parser's row count is the expected one.
dataset = ll.create_dataset('local', 'the_movies_dataset2', source_config)

Reading from source pandas...: 100%|██████████| 45466/45466 [00:00<00:00, 56193.51it/s]


Dataset "the_movies_dataset2" written to ./data/datasets/local/the_movies_dataset2


## Visualize the data

Now that we have imported a few datasets, let's visualize them to see what they look like.


In [4]:
# Start the Lilac web server; the recorded output shows Uvicorn listening on
# http://0.0.0.0:5432.
ll.start_server()

INFO:     Started server process [94705]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:5432 (Press CTRL+C to quit)


### Stopping the server


In [6]:
# `stop_server()` is awaited, so this cell relies on the notebook's support for
# top-level `await`.
await ll.stop_server()

INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.


## Query a dataset


In [2]:
# Re-fetch the CSV-based dataset by namespace and name: the `dataset` variable was
# last bound to `the_movies_dataset2` in the Pandas section.
dataset = ll.get_dataset('local', 'the_movies_dataset')
# Select three columns and cap the result at 5 rows.
r = dataset.select_rows(['title', 'budget', 'overview'], limit=5)
# `total_num_rows` printed 45460 in the recorded output even though `limit=5`.
print('Total number of rows', r.total_num_rows)
# Render the selected rows as a DataFrame.
display(r.df())

Total number of rows 45460


Unnamed: 0,title,budget,overview,__rowid__
0,Toy Story,30000000,"Led by Woody, Andy's toys live happily in his ...",21f248b70d4e4efbbeebe41027f1acf7
1,Jumanji,65000000,When siblings Judy and Peter discover an encha...,5b3df6d6824646ecb91941971930e625
2,Grumpier Old Men,0,A family wedding reignites the ancient feud be...,3f2051460298402e817ed859fe425528
3,Waiting to Exhale,16000000,"Cheated on, mistreated and stepped on, the wom...",f83112ea28fc4a41b44bbd2784ccb130
4,Father of the Bride Part II,0,Just when George Banks has recovered from his ...,bffb8f13612a46d1b995ac405b82f27c


## Enriching an unstructured field with metadata


In [5]:
# Compute the PII signal over the `overview` field; per the recorded output the
# result is written under the dataset directory at `overview/pii`.
dataset.compute_signal(ll.PIISignal(), 'overview')

Computing pii: 100%|██████████| 45460/45460 [00:45<00:00, 995.15it/s] 


Computing signal "pii" took 45.727s.
Wrote signal output to ./data/datasets/local/the_movies_dataset/overview/pii


In [6]:
# Compute the language-detection signal over the `overview` field; per the recorded
# output the result is written under the dataset directory at `overview/lang_detection`.
dataset.compute_signal(ll.LangDetectionSignal(), 'overview')

Computing lang_detection: 100%|██████████| 45460/45460 [01:31<00:00, 494.44it/s]


Computing signal "lang_detection" took 91.983s.
Wrote signal output to ./data/datasets/local/the_movies_dataset/overview/lang_detection


In [7]:
# Compute the near-duplicate signal over the `overview` field; the recorded output
# shows fingerprinting, hash-collision and clustering stages before the result is
# written under the dataset directory at `overview/near_dup`.
dataset.compute_signal(ll.NearDuplicateSignal(), 'overview')

Fingerprinting...: 44506it [00:06, 7055.80it/s]0:00<?, ?it/s]
Computing hash collisions...: 100%|██████████| 5/5 [00:01<00:00,  4.31it/s]
Clustering...: 100%|██████████| 21/21 [00:00<00:00, 154.89it/s]
Computing near_dup: 100%|██████████| 45460/45460 [00:07<00:00, 5837.09it/s]


Computing signal "near_dup" took 7.824s.
Wrote signal output to ./data/datasets/local/the_movies_dataset/overview/near_dup


## Searching


### Compute embedding to enable advanced search

Let's compute the `SBERT` embedding on device for the `overview` field.


In [None]:
# Compute the `sbert` embedding for `overview`. The semantic and concept searches
# below pass `embedding='sbert'`, so this cell must have run first.
dataset.compute_embedding('sbert', 'overview')

Computing sbert: 100%|██████████| 45460/45460 [01:47<00:00, 422.10it/s]


Computing signal "sbert" took 107.781s.
Wrote signal output to ./data/datasets/local/the_movies_dataset/overview/sbert


### Keyword search


In [14]:
# Keyword query for the literal string 'Aliens'.
query = ll.KeywordQuery(search='Aliens')
# Run the search against `overview`; the recorded result has an extra
# `substring_search(...)` column holding the start/end offsets of each match.
r = dataset.select_rows(['overview'], searches=[ll.Search(path='overview', query=query)], limit=5)
display(r.df())

Computing signal "substring_search" took 0.001s.


Unnamed: 0,overview,__rowid__,substring_search(query=Aliens)(overview)
0,When Environmental Protection Agency inspector...,255f04de78f54fb9a437b83fa06919b9,"[{'__value__': {'start': 422, 'end': 428}}]"
1,"With enormous cone-shaped heads, robotlike wal...",e2529b1b087b480386ed4e5a5b3a7630,"[{'__value__': {'start': 83, 'end': 89}}]"
2,Aliens who've come to earth to spawn deep bene...,2d00f6d7407b43c89b34a0d6fb7fe632,"[{'__value__': {'start': 0, 'end': 6}}]"
3,A team from the intergalactic fast food chain ...,8842f2f053e94e1489d2063fa4df77ea,"[{'__value__': {'start': 435, 'end': 441}}, {'..."
4,Marcus is a kid on Manhattan's mean streets. H...,e152cad5c7fc4a0e8acaa7f417605392,"[{'__value__': {'start': 133, 'end': 139}}]"


### Semantic search


In [7]:
# Semantic query: uses the `sbert` embedding computed earlier for `overview`.
query = ll.SemanticQuery(search='Aliens have invaded the earth', embedding='sbert')
# The recorded result has an extra `semantic_similarity(...)` column of scores.
r = dataset.select_rows(['overview'], searches=[ll.Search(path='overview', query=query)], limit=5)
display(r.df())

Computing signal "semantic_similarity" took 0.002s.


Unnamed: 0,overview,__rowid__,semantic_similarity(query=Aliens have invaded the earth)(overview.sbert.*.embedding)
0,"The Earth is invaded by alien parasites aka ""s...",2f322658c8b240709ed6350731c977cd,[0.7876087948679924]
1,"Aliens invade, this time delivering a clear ul...",a6bc9ca99dfe47668268695614e74292,[0.7808258235454559]
2,Aliens pretending to be friendly come to Earth...,63b6d34b48194b1eb355cb257859b54e,[0.7718495875597]
3,The nations of the Earth unite in a common cau...,5071f369239848ca9daa9b2c92907782,[0.7678595408797264]
4,"Aliens have landed and are hiding on Earth, bu...",b09339247bd946ea8b0e081c2218dc9e,[0.7546965628862381]


### Conceptual search


In [11]:
# Concept query: the `lilac/profanity` concept, scored with the `sbert` embedding.
query = ll.ConceptQuery(concept_namespace='lilac', concept_name='profanity', embedding='sbert')
# The recorded result has two extra columns: the concept's labels and its scores.
r = dataset.select_rows(['overview'], searches=[ll.Search(path='overview', query=query)], limit=5)
display(r.df())

Computing signal "concept_labels" took 0.011s.
Computing signal "concept_score" took 0.022s.


Unnamed: 0,overview,__rowid__,lilac/profanity/labels(overview),lilac/profanity(overview.sbert.*.embedding)
0,A traumatized young man abducts Korean leaders...,c924a9408c6547e9b65706740d3e4925,,"[0.1425706569142622, 0.9767540489817912]"
1,The story centers around a graduating class of...,bbdf5894d8c74e1db9f1759e975358b0,,"[0.0009874361053396775, 0.9702729196295821]"
2,What happens when a generation's ultimate anti...,40d80c411fbb4e959a8be7233eab1300,,"[0.46421371760421426, 0.9675712519471154]"
3,"Welcome to T &amp; A High, where the entire st...",1e2e233f68674e1498576442f64443ed,,[0.9675146942396857]
4,Baby Bink couldn't ask for more; he has adorin...,06680c9be7d74c92b01f0a975ac862b4,,"[0.23914685418353973, 0.9597516982156834]"


## Downloading the enriched dataset


In [3]:
# Export the dataset to a JSON file; the path is relative to the working directory.
dataset.to_json('the_movies_dataset.json')

Dataset exported to the_movies_dataset.json


In [5]:
# Export the dataset to a Parquet file; the path is relative to the working directory.
dataset.to_parquet('the_movies_dataset.parquet')

Dataset exported to the_movies_dataset.parquet


In [6]:
# Export the dataset to a CSV file; the path is relative to the working directory.
dataset.to_csv('the_movies_dataset.csv')

Dataset exported to the_movies_dataset.csv


In [9]:
# Materialize the dataset as a DataFrame and show only the first five rows.
dataset.to_pandas()[:5]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,__line_number__,__rowid__
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415,0,21f248b70d4e4efbbeebe41027f1acf7
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413,1,5b3df6d6824646ecb91941971930e625
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92,2,3f2051460298402e817ed859fe425528
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34,3,f83112ea28fc4a41b44bbd2784ccb130
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173,4,bffb8f13612a46d1b995ac405b82f27c


## End to end example


1. I have a CSV dataset.
2. I want to compute toxicity on the `text` field.
3. I want to download the result.


## Using concepts


### Use the positive-sentiment concept


In [8]:
# Build a signal from the `lilac/positive-sentiment` concept, scored with the
# `gte-small` embedding.
signal = ll.signals.ConceptSignal(
  namespace='lilac', concept_name='positive-sentiment', embedding='gte-small')

# `compute` is given a list of texts; its result is wrapped in `list(...)` before printing.
result = list(signal.compute(['This product is amazing, thank you!']))

# Per the recorded output: one list per input text, each holding a span and its score.
print(result)


[[{'__value__': {'start': 0, 'end': 35}, 'score': 0.9302721936202625}]]


### Create a positive product reviews concept


In [3]:
db = ll.DiskConceptDB()

concepts = db.list()
# Don't create the concept twice.
# `any(...)` is used rather than `not filter(...)`: a `filter` object is always
# truthy, so that guard could never fire. The namespace is checked on
# `c.namespace` and the concept name on `c.name`.
if not any(c.namespace == 'local' and c.name == 'positive-product-reviews' for c in concepts):
  db.create('local', 'positive-product-reviews')

#### Add a few training examples


In [11]:
# (label, text) pairs: one negative example followed by two positive ones.
labeled_texts = [
  (False, 'The quick brown fox jumps over the lazy dog.'),
  (True, 'This product is amazing!'),
  (True, 'Thank you for your awesome work on this UI.'),
]
train_data = [ll.ExampleIn(label=label, text=text) for label, text in labeled_texts]
# Insert the examples; the updated concept returned by `edit` is the cell's output.
db.edit('local', 'positive-product-reviews', ll.ConceptUpdate(insert=train_data))

Concept(namespace='local', concept_name='positive-product-reviews', type=text, data={'d86e4cb53c70443b8d8782a6847f4752': Example(label=False, text='The quick brown fox jumps over the lazy dog.', img=None, origin=None, draft='main', id='d86e4cb53c70443b8d8782a6847f4752'), '7d12d83bb06747698381d6326fc7fd27': Example(label=True, text='This product is amazing!', img=None, origin=None, draft='main', id='7d12d83bb06747698381d6326fc7fd27'), 'ad14145642004f0da0cdb079e38d74f0': Example(label=True, text='Thank you for your awesome work on this UI.', img=None, origin=None, draft='main', id='ad14145642004f0da0cdb079e38d74f0'), '9dbfedbc6ecf4a3dba1ef495acd2d579': Example(label=False, text='The quick brown fox jumps over the lazy dog.', img=None, origin=None, draft='main', id='9dbfedbc6ecf4a3dba1ef495acd2d579'), '1537f725228a49879ff811b9a8b8f82e': Example(label=True, text='This product is amazing!', img=None, origin=None, draft='main', id='1537f725228a49879ff811b9a8b8f82e'), 'c4cf7a874d91466aade2e70

#### Show the examples in the concept


In [8]:
# Look up the concept by namespace and name.
concept = db.get('local', 'positive-product-reviews')

# Guard against a falsy result before printing (presumably `None` when the concept
# does not exist -- confirm against the `DiskConceptDB.get` contract).
if concept:
  # Per the recorded output, `data` maps example id -> Example.
  print(concept.data)

{'d86e4cb53c70443b8d8782a6847f4752': Example(label=False, text='The quick brown fox jumps over the lazy dog.', img=None, origin=None, draft='main', id='d86e4cb53c70443b8d8782a6847f4752'), '7d12d83bb06747698381d6326fc7fd27': Example(label=True, text='This product is amazing!', img=None, origin=None, draft='main', id='7d12d83bb06747698381d6326fc7fd27'), 'c4cf7a874d91466aade2e705dea6cc79': Example(label=True, text='Thank you for your awesome work on this UI.', img=None, origin=None, draft='main', id='c4cf7a874d91466aade2e705dea6cc79'), '5cfab7b6ac5d4ef49cb2561f7fd90ba5': Example(label=False, text='We publicly share our dataset and code for future research.', img=None, origin=None, draft='main', id='5cfab7b6ac5d4ef49cb2561f7fd90ba5')}


#### Remove examples


In [None]:
# Remove one example by id; in the recorded output above this id belongs to the
# 'The quick brown fox ...' example. Ids are generated per insert, so this literal
# only matches the run that produced that output.
db.edit('local', 'positive-product-reviews',
        ll.ConceptUpdate(remove=['d86e4cb53c70443b8d8782a6847f4752']))

#### Use the new concept


In [13]:
# Same call as for the built-in positive-sentiment concept, but pointing at the
# user-created `local/positive-product-reviews` concept.
signal = ll.signals.ConceptSignal(
  namespace='local', concept_name='positive-product-reviews', embedding='gte-small')

# Score a single text; the result of `compute` is wrapped in `list(...)` before printing.
result = list(signal.compute(['This product is amazing, thank you!']))

print(result)

[[{'__value__': {'start': 0, 'end': 35}, 'score': 0.6601540385288754}]]


#### Concept metrics

To compute metrics for a concept, we first have to instantiate a concept model.


In [4]:
# The model DB is constructed on top of a concept DB.
model_db = ll.DiskConceptModelDB(ll.DiskConceptDB())

# Fetch the model for this concept under the `gte-small` embedding.
model = model_db.get('local', 'positive-product-reviews', embedding_name='gte-small')

# Only print metrics when a model was returned (falsy result is skipped).
if model:
  # Per the recorded output: f1, precision, recall, roc_auc and an overall rating.
  print(model.get_metrics())


f1=0.9183673469387754 precision=0.9 recall=1.0 roc_auc=0.9595959595959596 overall=<OverallScore.VERY_GOOD: 'very_good'>


#### Remove the concept


In [9]:
# Clean up: delete the concept created in this section.
db.remove('local', 'positive-product-reviews')