In [43]:
%load_ext autoreload
%autoreload 2
from expressiveness_benchmark.types import Plan, Task, Language, SourceRange, Program, load_all_programs
from code_widget.example import CodeWidget
from dataclasses import replace
import json
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
# Notebook parameters -- CHANGE ME when reusing this notebook for another task.
AUTHOR = "will"                               # passed as `author=` to every Program built below
TASK_ID = "documents_with_infrequent_words"   # used as the Task id and as each Program's `task`

In [209]:
# Sample corpus: document id -> space-separated text.
sample_texts_by_id = {
    1: "Hello world",
    2: "Hello friend",
    3: "friend of the world",
    4: "Hola",
}
# Records in the {"id": ..., "text": ...} shape the programs below consume.
sample_documents = [
    {"id": doc_id, "text": text}
    for doc_id, text in sample_texts_by_id.items()
]

# Task specification. The expected output lists documents 3 and 4: they are
# the only ones containing a word that occurs exactly once in the corpus
# ("of" / "the" in document 3, "Hola" in document 4).
task = Task(
    id=TASK_ID,
    category="Strings",
    description="Documents with infrequent words",
    plan=[],
    sample_input={"documents": sample_documents},
    sample_output=[3, 4],
)
task.save()

# Template Program; each language-specific cell derives its own copy from this
# with dataclasses.replace, overriding `language` and `source`.
prototype = Program(task=TASK_ID, author=AUTHOR, language='')

In [206]:
# Imperative Python variant.
# The embedded program makes three passes:
#   1. split every document's text on single spaces, remembering the word list
#      per document id and tallying each word's total count across documents;
#   2. collect the words whose total count is exactly 1;
#   3. walk the documents again and record the id of each one that contains
#      such a word (`break` ensures an id is appended at most once).
# NOTE(review): the program uses `defaultdict` without importing it --
# presumably the benchmark harness provides it; confirm.
python_imperative = replace(prototype,
    language='python-imperative',
    source='''
def documents_with_infrequent_words(documents):
  words = {}
  freq = defaultdict(int)
  for doc in documents:
    words[doc["id"]] = doc["text"].split(" ")
    for word in words[doc["id"]]:
      freq[word] += 1
      
  infrequent_words = set()
  for word, count in freq.items():
    if count == 1:
      infrequent_words.add(word)
      
  infrequent_docs = []
  for doc in documents:
    for word in words[doc["id"]]:
      if word in infrequent_words:
        infrequent_docs.append(doc["id"])
        break
        
  return infrequent_docs
'''
).load_plan()
# Run the program on `task` (presumably validated against the task's sample
# data -- confirm in Program.execute), then save it.
python_imperative.execute(task)
python_imperative.save()

In [47]:
# Functional Python variant.
# The embedded program builds everything with comprehensions: per-document
# word lists, a flattened word list, a word -> count mapping, the set of words
# with count 1, and finally the ids of documents whose word set intersects it.
# Note: `words_flat.count(word)` rescans the whole flattened list once per
# distinct word.
python_functional = replace(prototype,
    language='python-functional',
    source='''
def documents_with_infrequent_words(documents):
  words = [doc["text"].split(" ") for doc in documents]
  words_flat = [w for ws in words for w in ws]
  freq = {word: words_flat.count(word) for word in set(words_flat)}
  infrequent_words = set([word for word, count in freq.items() if count == 1])
  infrequent_docs = [documents[i]["id"] for i, ws in enumerate(words) if len(set(ws) & infrequent_words) > 0]
  return infrequent_docs
''').load_plan()
# Run the program on `task` (presumably validated against the task's sample
# data -- confirm in Program.execute), then save it.
python_functional.execute(task)
python_functional.save()

In [86]:
# Pandas variant.
# The embedded program splits `text` into one column per word position
# (`expand=True`), stacks those columns to count every word, keeps the words
# counted exactly once, and selects the documents whose word grid contains any
# of them; `.unique()` then removes repeated ids.
# NOTE(review): `documents[...]` is indexed with the 2-D boolean array returned
# by np.isin over `words.values`; this appears to rely on pandas selecting a
# row once per True cell (hence the `.unique()`) -- confirm on the pandas
# version in use.
# NOTE(review): `documents` is accessed as a DataFrame (`.text`, `.id`) and
# `np` is used without an import inside the program -- presumably both are
# arranged by the benchmark harness; confirm.
python_pandas = replace(prototype,
    language='python-pandas',
    source='''
def documents_with_infrequent_words(documents):
  words = documents.text.str.split(" ", expand=True)
  freq = words.stack().value_counts()
  infrequent_words = freq[freq == 1].index.values
  infrequent_docs = documents[np.isin(words.values, infrequent_words)]
  return infrequent_docs.id.unique().tolist()  
''').load_plan()
# Run the program on `task` (presumably validated against the task's sample
# data -- confirm in Program.execute), then save it.
python_pandas.execute(task)
python_pandas.save()

In [162]:
# SQL variant.
# The embedded script:
#   1. creates an FTS4 full-text index (`doc_index`) over the `documents`
#      table using the `simple` tokenizer, and rebuilds it;
#   2. exposes the index's term statistics through an fts4aux table (`words`);
#   3. takes the terms with `occurrences = 1` and cross-joins them with
#      `documents`, keeping ids whose lower-cased text contains the term as a
#      whole space-delimited word (middle, start, end, or entire text).
# NOTE(review): per the SQLite FTS docs the `simple` tokenizer folds ASCII to
# lower case, so this variant looks case-insensitive whereas the Python
# variants compare words case-sensitively -- confirm that is intended.
# NOTE(review): `id` is also declared as a column of the FTS table, so id
# values presumably show up as terms in `words` too -- verify this cannot
# produce false matches for texts containing numeric words.
sql = replace(prototype,
    language='sql',
    source='''
CREATE VIRTUAL TABLE doc_index USING fts4(text, id, content=documents, tokenize=simple);    
INSERT INTO doc_index(doc_index) VALUES('rebuild');
CREATE VIRTUAL TABLE words USING fts4aux(doc_index);    

SELECT DISTINCT id
FROM 
  documents
  CROSS JOIN
  (SELECT DISTINCT term
   FROM words
   WHERE occurrences = 1) unique_words
WHERE
  (LOWER(text) LIKE '% ' || term || ' %') OR
  (LOWER(text) LIKE term || ' %') OR
  (LOWER(text) LIKE '% ' || term) OR
  (LOWER(text) LIKE term)
''').load_plan()
# Run the script on `task` (presumably validated against the task's sample
# data -- confirm in Program.execute), then save it.
sql.execute(task)
sql.save()

In [210]:
# Datalog variant.
# The embedded program tokenizes by brute force:
#   * `substrs(Text, Idx, Len)` enumerates every (start, length) window that
#     fits inside a document text: start from (0, 1), grow the length at
#     index 0, then slide each window right while it still fits;
#   * `token` keeps the windows that are delimited on both sides by a space or
#     a string boundary and contain no space themselves, i.e. whole words;
#   * the final rule selects document ids that have a token whose word appears
#     in exactly one `token` tuple overall (`1 = count : ...`).
# NOTE(review): the `documents` and `documents_with_infrequent_words`
# relations are used without a `.decl` here -- presumably the benchmark
# harness declares them; confirm.
datalog = replace(prototype,
    language='datalog',
    source='''
.decl substrs(text:symbol, idx:number, len:number)
substrs(Text, 0, 1) :- documents(_, Text), strlen(Text) > 0.
substrs(Text, 0, Len+1) :- substrs(Text, 0, Len), Len + 1 <= strlen(Text).
substrs(Text, Idx+1, Len) :- substrs(Text, Idx, Len), Idx + Len + 1 <= strlen(Text).

.decl token(docid:number, text:symbol, idx:number, word:symbol)
token(Docid, Text, Idx, Word) :-
  documents(Docid, Text),
  substrs(Text, Idx, Len),
  Prev = Idx - 1, Next = Idx + Len,
  (Prev < 0; " " = substr(Text, Prev, 1)),
  (Next = strlen(Text); " " = substr(Text, Next, 1)),
  Word = substr(Text, Idx, Len),
  !contains(" ", Word).

documents_with_infrequent_words(Id) :-
  documents(Id, _),
  token(Id, _, _, Word),
  1 = count : token(_, _, _, Word).
''').load_plan()
# Run the program on `task` (presumably validated against the task's sample
# data -- confirm in Program.execute), then save it.
datalog.execute(task)
datalog.save()