In [118]:
import nltk
from nltk.corpus import wordnet as wn
import re

In [119]:
query = "What are the ins/outs of writing equipment purchases off as business expenses in a home based business?"
synonym_dict = {}
query

'What are the ins/outs of writing equipment purchases off as business expenses in a home based business?'

In [120]:
#Performs basic data cleaning on query: standardize capitalization to lower, remove punctuations, remove redundant whitespaces
def basic_cleaning(query):
    query = str(query)
    query = query.lower()
    query = re.sub(r'[^\w\s]','',query)
    query = ' '.join(query.split())
    return query

In [121]:
def nouns_only(query):
    try:
        tagged_text = nltk.tag.pos_tag(query.split())
        nouns_list = [word for word,tag in tagged_text if  tag == 'NNP' or tag == 'NNPS' or tag=="NN" or tag=="NNS"]
        return list(set(nouns_list))
    except:
        return []

In [122]:
cleaned_query = basic_cleaning(query)
query_nouns_list = nouns_only(cleaned_query)
query_nouns

['expenses', 'equipment', 'insouts', 'purchases', 'home', 'business']

In [123]:
def query_noun_mapping(query_nouns):
    for query_noun in query_nouns:
        try:
            closest_noun = wn.synsets(query_noun)[0].lemmas()[1].name()
            synonym_dict[query_noun] = closest_noun
        except:
            pass
    return synonym_dict

In [124]:
synonym_dict

{'expenses': 'disbursal', 'home': 'place', 'business': 'concern'}

In [126]:
def query_expansion(query):
    clean_query = basic_cleaning(query)
    query_nouns_list = nouns_only(clean_query)
    if len(query_nouns_list)==0:
        return query
    else:
        synonym_dict = query_noun_mapping(query_nouns_list)
    if len(synonym_dict.keys()) == 0:
        return query
    else:
        for k,v in synonym_dict.items():
            idx = query.lower().index(k)
            query = query[:idx] + f"and {v}" + query[idx:]
    return query