In [60]:
# Toy corpus used to try out the index functions below: document id -> document text.
documents = {
    1: "Ein Film über Sterne mit Harrison Ford.",
    2: "Ein Film über Sterne von Antoine de Saint-Exupéry.",
    3: "In diesem Film gewinnt James Bond gegen Dr. No.",
}

def tokenize(text):
    """Yield the lower-cased, whitespace-separated tokens of ``text``.

    Leading and trailing ASCII punctuation (``string.punctuation``) is
    stripped from every token; punctuation inside a token is kept, so
    "saint-exupéry" stays a single token. Tokens that consist of
    punctuation only (for example a free-standing "-") are skipped instead
    of being yielded as empty strings.

    Args:
        text: The string to tokenize.

    Yields:
        str: Each non-empty, normalized token in order of appearance.
    """
    import string
    for raw_token in text.lower().split():
        token = raw_token.strip(string.punctuation)
        if token:  # a punctuation-only token strips down to ""
            yield token

from tqdm.auto import tqdm
 
def build_text_index(dataset, index=None):
    """Create (or extend) an inverted full-text index for a dataset.

    Args:
        dataset: Iterable of ``(id, text)`` pairs.
        index: Optional existing index to add to. It is updated in place
            and returned; a new dict is created when omitted.

    Returns:
        dict: Maps every token produced by ``tokenize`` to the set of ids
        of the documents it occurs in.
    """
    postings = {} if index is None else index
    for doc_id, text in tqdm(dataset):
        for term in tokenize(text):
            if term not in postings:
                postings[term] = set()
            postings[term].add(doc_id)
    return postings

def query_index(index, query):
    """Return the set of document ids indexed under a single query term.

    The tokenizers in this notebook lower-case every token, so a query such
    as "Ford" can never match an index key literally. The term is therefore
    looked up exactly as given first and, if that misses, again in lower
    case.

    Args:
        index: Mapping of token -> set of document ids.
        query: A single search term.

    Returns:
        set: The ids stored for the term (the index's own set, not a copy),
        or an empty set when the term is not indexed.
    """
    if query in index:
        return index[query]
    return index.get(query.lower(), set())



In [62]:
# Print the tokens of the second sample document, one per line.
for t in tokenize(documents[2]):
    print(t)

ein
film
über
sterne
von
antoine
de
saint-exupéry


In [18]:
# Index the toy corpus; documents.items() supplies the (id, text) pairs.
index = build_text_index(documents.items())
# NOTE(review): the result of this lookup is not shown, because only the
# last expression of a cell is displayed.
query_index(index, "Ford")
index

  0%|          | 0/3 [00:00<?, ?it/s]

{'ein': {1, 2},
 'film': {1, 2, 3},
 'über': {1, 2},
 'sterne': {1, 2},
 'mit': {1},
 'harrison': {1},
 'ford': {1},
 'von': {2},
 'antoine': {2},
 'de': {2},
 'saint': {2},
 'exupéry': {2},
 'in': {3},
 'diesem': {3},
 'gewinnt': {3},
 'james': {3},
 'bond': {3},
 'gegen': {3},
 'dr': {3},
 'no': {3}}

In [20]:
def read_movie_csv(filename, encoding='utf-8'):
    """Yield ``(url, text)`` pairs from a two-column CSV file.

    Args:
        filename: Path of the CSV file to read.
        encoding: Text encoding of the file. Given explicitly so that
            non-ASCII text (e.g. umlauts) is decoded the same way on every
            platform instead of with the locale's default encoding.

    Yields:
        tuple: ``(url, text)`` for every non-blank row.

    Raises:
        ValueError: If a non-blank row does not have exactly two columns.
    """
    import csv
    # newline='' lets the csv module handle line endings itself, which keeps
    # newlines inside quoted fields intact.
    with open(filename, 'r', newline='', encoding=encoding) as csvfile:
        for row in csv.reader(csvfile):
            if not row:  # csv.reader returns [] for a blank line
                continue
            url, text = row
            yield url, text

In [41]:
# Build the full-text index over the (url, text) rows of data/movies_1985.csv.
movie_idx = build_text_index(read_movie_csv('data/movies_1985.csv'))

0it [00:00, ?it/s]

In [50]:
# Look up the ids (here: URLs) indexed under the token 'delorean'.
query_index(movie_idx, 'delorean')

{'https://de.wikipedia.org/wiki/Zur%C3%BCck_in_die_Zukunft_(Film)'}

In [51]:
import operator

# Number of (url, text) pairs that read_movie_csv yields for the corpus.
n = sum(1 for _ in read_movie_csv('data/movies_1985.csv'))
cutoff = 0.5

# (word, share of documents containing it) for every indexed word whose
# share exceeds `cutoff`, ordered from most to least frequent.
freqs = sorted(
    ((word, len(docs) / n) for word, docs in movie_idx.items() if len(docs) / n > cutoff),
    key=operator.itemgetter(1),
    reverse=True,
)

# Words that frequent are treated as stop words.
stop_words = {word for word, _ratio in freqs}
stop_words

def tokenize(text, stop_words=stop_words):
    r"""Yield the lower-cased ``\w+`` tokens of ``text`` that are not stop words.

    This redefinition replaces the earlier whitespace-based ``tokenize``;
    ``build_text_index`` looks the name up at call time and therefore uses
    this version from now on.

    A token is dropped when either its original spelling or its lower-cased
    form is in ``stop_words``. Checking the lower-cased form matters because
    the stop words are lower case, so a capitalised occurrence such as "Der"
    would otherwise slip through and be emitted as "der".

    Args:
        text: The string to tokenize.
        stop_words: Collection of tokens to leave out. The default is the
            module-level ``stop_words`` as it was when this function was
            defined.

    Yields:
        str: Each remaining token, lower-cased, in order of appearance.
    """
    import re
    # Raw string: "\w" in a normal string literal is an invalid escape sequence.
    for raw_token in re.findall(r"\w+", text):
        token = raw_token.lower()
        if raw_token in stop_words or token in stop_words:
            continue
        yield token
    

In [52]:
# Rebuild the movie index. build_text_index looks up tokenize at call time,
# so this run uses the stop-word-aware definition.
movie_idx = build_text_index(read_movie_csv('data/movies_1985.csv'))

0it [00:00, ?it/s]

In [47]:
# NOTE(review): 'der' is presumably one of the computed stop words and so
# absent from the rebuilt index — confirm with a fresh Restart & Run All.
query_index(movie_idx, 'der')

set()

In [48]:
# A rare term such as 'delorean' should still be found after stop-word
# filtering — confirm with a fresh Restart & Run All.
query_index(movie_idx, 'delorean')

{'https://de.wikipedia.org/wiki/Zur%C3%BCck_in_die_Zukunft_(Film)'}