In [280]:
import requests
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util

pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

prepositions_and_conjunctions = [
    "about", "above", "across", "after", "against", "along", "among", "around", "at",
    "before", "behind", "below", "beneath", "beside", "between", "beyond", "by",
    "during", "for", "from", "in", "inside", "into", "near", "of", "off", "on",
    "out", "outside", "over", "through", "throughout", "to", "toward", "under",
    "until", "up", "with", "within", "without", "and", "but", "or", "for", "nor",
    "so", "yet", "although", "because", "as", "since", "unless", "while", "when",
    "where", "after", "before"
]
pattern = r'\b(?:' + '|'.join(prepositions_and_conjunctions) + r')\b'

def remove_prepositions_and_conjunctions(text):
    cleaned_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    return re.sub(r'\s+', ' ', cleaned_text).strip()

def get_important_words(document):
    document = remove_prepositions_and_conjunctions(document)
    document = [document]

    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform(document)

    feature_names = vectorizer.get_feature_names_out()
    first_document_vector = tfidf_matrix[0]
    important_words = sorted(
        [(feature_names[i], first_document_vector[0, i]) for i in first_document_vector.nonzero()[1]],
        key=lambda x: x[1], reverse=True
    )
    return important_words

def count_matching_words(query_terms, important_words):
    matching_count = 0
    important_words_list = [word[0] for word in important_words]
    for term in query_terms:
        matching_count += important_words_list.count(term)
    return matching_count
    
def count_word_occurrences(query_terms, text):
    matching_count = 0
    
    # Projděte seznam slov
    for word in query_terms:
        # Spočítejte výskyty každého slova v textu
        matching_count += text.lower().count(word.lower())
    
    return matching_count

def proportion_of_key_terms(query_terms, deal_text):
    deal_text_lower = deal_text.lower()
    matching_count = sum(term.lower() in deal_text_lower for term in query_terms)
    proportion = matching_count / len(query_terms) if query_terms else 0
    
    return proportion

def calculate_distance(query_terms, deal_text):
    query_terms = [" ".join(query_terms)]
    text_embedding = model.encode(deal_text, convert_to_tensor=True)
    word_embeddings = model.encode(query_terms, convert_to_tensor=True)

    cos_similarities = util.pytorch_cos_sim(text_embedding, word_embeddings)
    distances = 1 - cos_similarities.cpu().numpy()
    
    return distances[0][0]
    
def average_word_length(text):
    words = text.split()
    
    if not words:
        return 0.0
    
    total_length = sum(len(word) for word in words)
    average_length = total_length / len(words)
    return average_length



In [281]:
get_url = 'http://localhost:9999/search?q=synthetic%20oil&limit=1000'
response = requests.get(get_url)

if response.status_code == 200:
    data = response.json()
    post_url = 'http://localhost:9999/title'
    post_response = requests.post(post_url, json=data)

    if post_response.status_code == 200:
        deals = []
        for deal in post_response.json():
            deals.append(deal)
        deals_df_ = pd.DataFrame(deals)
    else:
        print("HTTP POST error:", post_response.status_code)

else:
    print("Error status code:", response.status_code)

#def compute_tfidf(text_series):
#    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
#    tfidf_matrix = vectorizer.fit_transform(text_series)
#    return tfidf_matrix, vectorizer.get_feature_names_out()

In [289]:
deals_df = deals_df_[["id", "title", "title_general", "merchant_name"]]
deals_df.loc[:, "comb_text"] = (deals_df['title'].astype(str) + ' ' + 
                                deals_df['title_general'].astype(str) + ' ' + 
                                deals_df['merchant_name'].astype(str)).str.lower()
deals_df = deals_df[["id", "comb_text"]]
deals_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deals_df.loc[:, "comb_text"] = (deals_df['title'].astype(str) + ' ' +


Unnamed: 0,id,comb_text
0,224-massage-palatine-17,"experience relaxation at 224 massage palatine with couples massage options including hot stone and essential oil, up to 27% off 224 massage"
1,224-massage-palatine-23,"experience relaxation at 224 massage palatine with a 60min swedish massage, including hot stone and oil, up to 0% 224 massage"
2,2nd-chance-auto-services,"ensure vehicle safety with 2nd chance auto services' mobile brake pad replacement and oil change, up to 50% off 2nd chance auto services"
3,4-season-massage,60 min couples body massage with chinese relief oil & hot stone for two at four seasons massage (up to 50% off) 4 season massage
4,4-season-massage-3,"4 season massage offers therapeutic massage options including cupping, moxibustion, and cbd oil for up to 28% off. 4 season massage"


#### features prosessing

In [291]:
query_terms = ['oil', 'massage']
q_bigrams = ['massage oil', 'oil massage']

average_query_length = average_word_length(" ".join(query_terms))                                                 # proporcionality of search term
deals_df['mc'] = deals_df['comb_text'].apply(lambda row: count_word_occurrences(query_terms, row))                # simple match count of query x text
deals_df['imp_w'] = deals_df['comb_text'].apply(get_important_words)                                              # match count query x text
deals_df['mc_impo'] = deals_df['imp_w'].apply(lambda row: count_matching_words(query_terms, row))                 # match count query x important unigrams and bigrams
deals_df['mc_impo_bigr'] = deals_df['imp_w'].apply(lambda row: count_matching_words(q_bigrams, row))              # match count query bigrams x important unigrams and bigrams
deals_df['mc_bigr'] = deals_df['comb_text'].apply(lambda row: count_word_occurrences(q_bigrams, row))             # match count query bigrams x text
deals_df['pro_of_term'] = deals_df['comb_text'].apply(lambda row: proportion_of_key_terms(query_terms, row))      # proporcionality of text
deals_df['we_dist'] = deals_df['comb_text'].apply(lambda row: calculate_distance(query_terms, row))               # word embedings cos distance
deals_df['avg_w_len'] = deals_df['comb_text'].apply(average_word_length)                                          # average word length
deals_df['avg_q_len'] = average_query_length                                                                      # average query length

deals_df.head(100)

Unnamed: 0,id,comb_text,imp_w,mc,mc_impo,mc_impo_bigr,mc_bigr,pro_of_term,we_dist,avg_w_len,avg_q_len
0,224-massage-palatine-17,"experience relaxation at 224 massage palatine with couples massage options including hot stone and essential oil, up to 27% off 224 massage","[(massage, 0.5388159060803247), (experience, 0.1796053020267749), (relaxation, 0.1796053020267749), (palatine, 0.1796053020267749), (couples, 0.1796053020267749), (options, 0.1796053020267749), (including, 0.1796053020267749), (hot, 0.1796053020267749), (stone, 0.1796053020267749), (essential, 0.1796053020267749), (oil, 0.1796053020267749), (experience relaxation, 0.1796053020267749), (relaxation massage, 0.1796053020267749), (massage palatine, 0.1796053020267749), (palatine couples, 0.1796053020267749), (couples massage, 0.1796053020267749), (massage options, 0.1796053020267749), (options including, 0.1796053020267749), (including hot, 0.1796053020267749), (hot stone, 0.1796053020267749), (stone essential, 0.1796053020267749), (essential oil, 0.1796053020267749), (oil massage, 0.1796053020267749)]",4,2,1,0,1.0,0.26459,5.363636,5.0
1,224-massage-palatine-23,"experience relaxation at 224 massage palatine with a 60min swedish massage, including hot stone and oil, up to 0% 224 massage","[(massage, 0.5570860145311556), (experience, 0.18569533817705186), (relaxation, 0.18569533817705186), (palatine, 0.18569533817705186), (min, 0.18569533817705186), (swedish, 0.18569533817705186), (including, 0.18569533817705186), (hot, 0.18569533817705186), (stone, 0.18569533817705186), (oil, 0.18569533817705186), (experience relaxation, 0.18569533817705186), (relaxation massage, 0.18569533817705186), (massage palatine, 0.18569533817705186), (palatine min, 0.18569533817705186), (min swedish, 0.18569533817705186), (swedish massage, 0.18569533817705186), (massage including, 0.18569533817705186), (including hot, 0.18569533817705186), (hot stone, 0.18569533817705186), (stone oil, 0.18569533817705186), (oil massage, 0.18569533817705186)]",4,2,1,0,1.0,0.293621,5.0,5.0
2,2nd-chance-auto-services,"ensure vehicle safety with 2nd chance auto services' mobile brake pad replacement and oil change, up to 50% off 2nd chance auto services","[(nd, 0.2917299829957891), (chance, 0.2917299829957891), (auto, 0.2917299829957891), (services, 0.2917299829957891), (nd chance, 0.2917299829957891), (chance auto, 0.2917299829957891), (auto services, 0.2917299829957891), (ensure, 0.14586499149789456), (vehicle, 0.14586499149789456), (safety, 0.14586499149789456), (mobile, 0.14586499149789456), (brake, 0.14586499149789456), (pad, 0.14586499149789456), (replacement, 0.14586499149789456), (oil, 0.14586499149789456), (change, 0.14586499149789456), (ensure vehicle, 0.14586499149789456), (vehicle safety, 0.14586499149789456), (safety nd, 0.14586499149789456), (services mobile, 0.14586499149789456), (mobile brake, 0.14586499149789456), (brake pad, 0.14586499149789456), (pad replacement, 0.14586499149789456), (replacement oil, 0.14586499149789456), (oil change, 0.14586499149789456), (change nd, 0.14586499149789456)]",1,1,0,0,0.5,0.697636,4.956522,5.0
3,4-season-massage,60 min couples body massage with chinese relief oil & hot stone for two at four seasons massage (up to 50% off) 4 season massage,"[(massage, 0.50709255283711), (min, 0.1690308509457033), (couples, 0.1690308509457033), (body, 0.1690308509457033), (chinese, 0.1690308509457033), (relief, 0.1690308509457033), (oil, 0.1690308509457033), (hot, 0.1690308509457033), (stone, 0.1690308509457033), (two, 0.1690308509457033), (four, 0.1690308509457033), (seasons, 0.1690308509457033), (season, 0.1690308509457033), (min couples, 0.1690308509457033), (couples body, 0.1690308509457033), (body massage, 0.1690308509457033), (massage chinese, 0.1690308509457033), (chinese relief, 0.1690308509457033), (relief oil, 0.1690308509457033), (oil hot, 0.1690308509457033), (hot stone, 0.1690308509457033), (stone two, 0.1690308509457033), (two four, 0.1690308509457033), (four seasons, 0.1690308509457033), (seasons massage, 0.1690308509457033), (massage season, 0.1690308509457033), (season massage, 0.1690308509457033)]",4,2,0,0,1.0,0.337149,4.16,5.0
4,4-season-massage-3,"4 season massage offers therapeutic massage options including cupping, moxibustion, and cbd oil for up to 28% off. 4 season massage","[(massage, 0.50709255283711), (season, 0.3380617018914066), (season massage, 0.3380617018914066), (offers, 0.1690308509457033), (therapeutic, 0.1690308509457033), (options, 0.1690308509457033), (including, 0.1690308509457033), (cupping, 0.1690308509457033), (moxibustion, 0.1690308509457033), (cbd, 0.1690308509457033), (oil, 0.1690308509457033), (massage offers, 0.1690308509457033), (offers therapeutic, 0.1690308509457033), (therapeutic massage, 0.1690308509457033), (massage options, 0.1690308509457033), (options including, 0.1690308509457033), (including cupping, 0.1690308509457033), (cupping moxibustion, 0.1690308509457033), (moxibustion cbd, 0.1690308509457033), (cbd oil, 0.1690308509457033), (oil season, 0.1690308509457033)]",4,2,0,0,1.0,0.348609,5.285714,5.0
5,44-heal-well-6,"experience 60 min massage with cbd oil at 44th street health & wellness, offering up to 89% off for a tranquil escape 44th street health & wellness","[(th, 0.29814239699997197), (street, 0.29814239699997197), (health, 0.29814239699997197), (wellness, 0.29814239699997197), (th street, 0.29814239699997197), (street health, 0.29814239699997197), (health wellness, 0.29814239699997197), (experience, 0.14907119849998599), (min, 0.14907119849998599), (massage, 0.14907119849998599), (cbd, 0.14907119849998599), (oil, 0.14907119849998599), (offering, 0.14907119849998599), (tranquil, 0.14907119849998599), (escape, 0.14907119849998599), (experience min, 0.14907119849998599), (min massage, 0.14907119849998599), (massage cbd, 0.14907119849998599), (cbd oil, 0.14907119849998599), (oil th, 0.14907119849998599), (wellness offering, 0.14907119849998599), (offering tranquil, 0.14907119849998599), (tranquil escape, 0.14907119849998599), (escape th, 0.14907119849998599)]",2,2,0,0,1.0,0.345913,4.481481,5.0
6,a-1-quality-car-care-6,enjoy up to 46% off on a-1 quality car care's full synthetic oil change with a 29-point inspection in palm springs a-1 quality car care,"[(quality, 0.32025630761017426), (car, 0.32025630761017426), (care, 0.32025630761017426), (quality car, 0.32025630761017426), (car care, 0.32025630761017426), (enjoy, 0.16012815380508713), (full, 0.16012815380508713), (synthetic, 0.16012815380508713), (oil, 0.16012815380508713), (change, 0.16012815380508713), (point, 0.16012815380508713), (inspection, 0.16012815380508713), (palm, 0.16012815380508713), (springs, 0.16012815380508713), (enjoy quality, 0.16012815380508713), (care full, 0.16012815380508713), (full synthetic, 0.16012815380508713), (synthetic oil, 0.16012815380508713), (oil change, 0.16012815380508713), (change point, 0.16012815380508713), (point inspection, 0.16012815380508713), (inspection palm, 0.16012815380508713), (palm springs, 0.16012815380508713), (springs quality, 0.16012815380508713)]",1,1,0,0,0.5,0.576673,4.44,5.0
7,a-a-oil-wash-operations-inc-3,"a & a-oil & wash operations inc offers full synthetic oil change with filter replacement, up to 49% off a & a-oil & wash operations inc","[(oil, 0.4375949744936837), (wash, 0.2917299829957891), (operations, 0.2917299829957891), (inc, 0.2917299829957891), (oil wash, 0.2917299829957891), (wash operations, 0.2917299829957891), (operations inc, 0.2917299829957891), (offers, 0.14586499149789456), (full, 0.14586499149789456), (synthetic, 0.14586499149789456), (change, 0.14586499149789456), (filter, 0.14586499149789456), (replacement, 0.14586499149789456), (inc offers, 0.14586499149789456), (offers full, 0.14586499149789456), (full synthetic, 0.14586499149789456), (synthetic oil, 0.14586499149789456), (oil change, 0.14586499149789456), (change filter, 0.14586499149789456), (filter replacement, 0.14586499149789456), (replacement oil, 0.14586499149789456)]",3,1,0,0,0.5,0.657617,4.230769,5.0
8,a-one-auto-body-and-repair,up to 10% off on oil change at a one auto body and repair a1 auto body & repair,"[(auto, 0.3849001794597505), (body, 0.3849001794597505), (repair, 0.3849001794597505), (auto body, 0.3849001794597505), (body repair, 0.3849001794597505), (oil, 0.19245008972987526), (change, 0.19245008972987526), (one, 0.19245008972987526), (oil change, 0.19245008972987526), (change one, 0.19245008972987526), (one auto, 0.19245008972987526), (repair auto, 0.19245008972987526)]",1,1,0,0,0.5,0.610099,3.210526,5.0
9,a-plus-massage-12,rejuvenate with 60 minutes deep tissue massage with cbd oil or couples massage (up to 28% off) a plus massage,"[(massage, 0.5773502691896257), (rejuvenate, 0.19245008972987526), (minutes, 0.19245008972987526), (deep, 0.19245008972987526), (tissue, 0.19245008972987526), (cbd, 0.19245008972987526), (oil, 0.19245008972987526), (couples, 0.19245008972987526), (plus, 0.19245008972987526), (rejuvenate minutes, 0.19245008972987526), (minutes deep, 0.19245008972987526), (deep tissue, 0.19245008972987526), (tissue massage, 0.19245008972987526), (massage cbd, 0.19245008972987526), (cbd oil, 0.19245008972987526), (oil couples, 0.19245008972987526), (couples massage, 0.19245008972987526), (massage plus, 0.19245008972987526), (plus massage, 0.19245008972987526)]",4,2,0,0,1.0,0.309439,4.5,5.0


('zde', 'je'): 1
('je', 'příklad'): 1
('příklad', 'textu'): 1
('textu', 'který'): 1
('který', 'obsahuje'): 1
('obsahuje', 'některá'): 1
('některá', 'slova'): 1
('slova', 'toto'): 1
('toto', 'slovo'): 1
('slovo', 'bude'): 1
('bude', 'analyzováno'): 1
('analyzováno', 'a'): 1
('a', 'budete'): 1
('budete', 'hledat'): 1
('hledat', 'bigramy'): 1
('bigramy', 'kolem'): 1
('kolem', 'něj'): 1
('něj', 'můžete'): 1
('můžete', 'také'): 1
('také', 'přidat'): 1
('přidat', 'další'): 1
('další', 'text'): 1
('text', 'k'): 1
('k', 'analýze'): 1


[nltk_data] Downloading package punkt to /Users/zphilipp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
