In [None]:
import re
import random

# Build the vocabulary used to generate random search queries: every distinct
# lower-cased word found in the ledes of all stories.
ledes = ' '.join(Story.objects.values_list('lede', flat=True))
# `\w` also matches "_", so underscores are stripped before tokenising.
# sorted() instead of list(set(...)): the iteration order of a set of strings
# changes between interpreter runs (hash randomisation), which would make
# random.sample() pick different words for the same random seed.
words = sorted(set(re.findall(r'\w+', ledes.lower().replace('_', ''))))

def random_query(length=2):
    """Return a search string made of `length` distinct random words.

    The words are drawn without replacement from the module-level `words`
    list and joined with single spaces.
    """
    picked = random.sample(words, length)
    return ' '.join(picked)

In [None]:
# Sanity check: run 50 random single-word queries through the manager's
# search and print each query next to its number of hits.
for _ in range(50):
    query = random_query(1)
    hit_count = Story.objects.search(query).count()
    print(query, hit_count)

In [None]:
from django.contrib.postgres.indexes import GinIndex
from django.contrib.postgres.search import ( SearchQuery, SearchRank, SearchVector, SearchVectorField, TrigramSimilarity, TrigramBase )
from django.db.models import ( Case, ExpressionWrapper, F, Func, Model, QuerySet, Value, When, )
from django.db.models import FloatField, CharField
from django.db.models.functions import Concat
from django.utils.timezone import now

class TrigramWordSimilarity(Func):
    """Float-valued ``WORD_SIMILARITY(string, expression)`` SQL expression.

    Args:
        expression: expression passed as the SQL function's second argument.
        string: value passed as the first argument. Anything that is not
            already an expression (i.e. has no ``resolve_expression``) is
            wrapped in ``Value``.
        **extra: forwarded unchanged to ``Func.__init__``.
    """

    function = 'WORD_SIMILARITY'
    output_field = FloatField()

    def __init__(self, expression, string, **extra):
        # Note the swap: the search string goes first in the SQL call.
        needle = string if hasattr(string, 'resolve_expression') else Value(string)
        super().__init__(needle, expression, **extra)
        

class LogAge(Func):
    """Base-10 logarithm of the hours between ``when`` and a datetime column.

    The SQL template expects three keyword arguments: ``when`` (the reference
    time), ``table`` and ``timefield`` (the column to compare against).

    NOTE(review): these values are substituted into the SQL text itself
    rather than bound as query parameters, so they must never carry
    untrusted input.
    """

    # Lower bound for the age, in hours. Keeps the logarithm away from zero
    # and from the very large negative values produced by tiny inputs.
    MIN_AGE = 2.0
    output_field = FloatField()

    # The SQL, read from the inside out:
    #
    #   TIMESTAMP '<when>' - <table>.<timefield>
    #       interval between the reference time and the column
    #   extract(epoch FROM ...)
    #       that interval expressed as a total number of seconds
    #   abs(...)
    #       drop the sign, so it does not matter which side is later
    #   ... / (60 * 60)
    #       seconds -> hours
    #   greatest(MIN_AGE, ...)
    #       clamp to the minimum age
    #   log(...)
    #       base-10 logarithm
    #   ::real
    #       PostgreSQL cast to `real`, a 32-bit floating point number
    #
    # NOTE(review): the literal is a plain TIMESTAMP; confirm that is the
    # intended type when `when` is a timezone-aware datetime.
    template = (
        f"log(greatest({MIN_AGE},"
        "abs(extract(epoch FROM "
        "(TIMESTAMP '%(when)s' - %(table)s.%(timefield)s)"
        "))"
        "/ (60 * 60)"
        "))::real"
    )

In [None]:
from functools import reduce
from operator import and_

# NOTE: a smoke-test call to FullTextSearchQuerySet used to sit here, but the
# class is only defined further down in this cell, so on a fresh kernel it
# raised NameError. The same call runs after the class definition instead.

class FullTextSearchQuerySet(QuerySet):
    """QuerySet with PostgreSQL full text and trigram search for the Story model.

    `search()` is the main entry point. The remaining methods are the
    building blocks it is composed from and can also be chained directly.
    """

    # Default text search configuration. Also used to parse the search query.
    config = 'norwegian'
    # Per-row configuration: rows whose `language` is 'en' use 'english',
    # every other row uses `config`.
    case_config = Case(
        When(language='en', then=Value('english')), default=Value(config)
    )
    # Weighted search vector: titles and keywords (A), lede (B), body (C).
    vector = (
        SearchVector(
            'working_title', 'title', 'kicker', 'theme_word',
            weight='A', config=case_config,
        )
        + SearchVector('lede', weight='B', config=case_config)
        + SearchVector('bodytext_markup', weight='C', config=case_config)
    )

    def with_age(self, field='created', when=None):
        """Annotate each row with `age`, the `LogAge` of a datetime column.

        Args:
            field: name of the datetime column to measure from.
            when: reference time; defaults to the current time.

        Returns:
            The queryset with an extra float annotation named `age`.
        """
        if when is None:
            when = now()
        return self.annotate(
            age=LogAge(
                when=when, timefield=field, table=self.model._meta.db_table
            )
        )

    def update_search_vector(self):
        """Calculate and store the search vector in the database.

        Returns:
            Whatever `QuerySet.update()` returns for the affected rows.
        """
        return self.update(search_vector=self.vector)

    def search(self, query):
        """Search stories, preferring full text search over trigram matching.

        Queries longer than five characters are first run as a full text
        search, ranked by `search_rank / age`. If that yields nothing, or the
        query is short, the trigram lookup is used instead.

        Args:
            query: the search string.

        Returns:
            A lazy queryset annotated with `age` and `rank`.

        Raises:
            ValueError: if `query` is not a `str`.
        """
        if not isinstance(query, str):
            msg = f'expected query to be str, got {type(query)}, {query!r}'
            raise ValueError(msg)
        aged = self.with_age('created')
        if len(query) > 5:
            ranked = (
                aged.with_search_rank(query)
                .annotate(
                    rank=ExpressionWrapper(
                        F('search_rank') / F('age'), FloatField()
                    )
                )
                .order_by('-rank')
            )
            # exists() only probes for a single row. Testing the queryset's
            # truthiness instead would load every matching story into memory
            # just to find out whether there is at least one.
            if ranked.exists():
                return ranked
        # Fall back to trigram search.
        # NOTE(review): this order_by replaces the '-rank' ordering applied
        # inside trigram_search, and because `age` grows with elapsed time,
        # '-age' lists the oldest stories first. Confirm this is intended.
        return aged.trigram_search(query).order_by('-age')

    def with_search_rank(self, query):
        """Annotate with `search_rank` and keep only rows ranked above 0.2.

        NOTE(review): the query is always parsed with `config`, while
        `vector` switches configuration per row; confirm this is intended.
        """
        search = SearchQuery(query, config=self.config)
        ranked = self.annotate(
            search_rank=SearchRank(F('search_vector'), search),
        )
        return ranked.filter(search_rank__gt=0.2)

    def trigram_search(self, query, cutoff=None):
        """Perform postgresql trigram similarity lookup
        https://docs.djangoproject.com/en/2.0/ref/contrib/postgres/search/

        The query is compared against kicker, title and lede joined by
        spaces.

        Args:
            query: the search string.
            cutoff: minimum similarity to keep. Defaults to a value that
                depends on the query length: 0.5 for five or more
                characters, rising by 0.1 for each character fewer.

        Returns:
            The queryset annotated with `rank`, ordered by descending rank.
        """
        if cutoff is None:
            cutoff = 1 - min(5, len(query)) / 10
        head = Concat(
            F('kicker'), Value(' '), F('title'), Value(' '), F('lede')
        )
        return self.annotate(
            rank=TrigramWordSimilarity(head, query),
        ).exclude(rank__lt=cutoff).order_by('-rank')


# Smoke test: number of hits for a fixed two-word query.
(
    FullTextSearchQuerySet(model=Story)
    .search('iseln nybø')
    .count()
)

In [None]:
%timeit -n30 -r3 FullTextSearchQuerySet(model=Story).search(random_query(1))
%timeit -n30 -r3 FullTextSearchQuerySet(model=Story).search(random_query(2))
%timeit -n30 -r3 FullTextSearchQuerySet(model=Story).search(random_query(3))