In [239]:
import requests
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

prepositions_and_conjunctions = [
    "about", "above", "across", "after", "against", "along", "among", "around", "at",
    "before", "behind", "below", "beneath", "beside", "between", "beyond", "by",
    "during", "for", "from", "in", "inside", "into", "near", "of", "off", "on",
    "out", "outside", "over", "through", "throughout", "to", "toward", "under",
    "until", "up", "with", "within", "without", "and", "but", "or", "for", "nor",
    "so", "yet", "although", "because", "as", "since", "unless", "while", "when",
    "where", "after", "before"
]
pattern = r'\b(?:' + '|'.join(prepositions_and_conjunctions) + r')\b'

def remove_prepositions_and_conjunctions(text):
    cleaned_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    return re.sub(r'\s+', ' ', cleaned_text).strip()

def get_important_words(document):
    document = remove_prepositions_and_conjunctions(document)
    document = [document]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(document)

    feature_names = vectorizer.get_feature_names_out()
    first_document_vector = tfidf_matrix[0]
    important_words = sorted(
        [(feature_names[i], first_document_vector[0, i]) for i in first_document_vector.nonzero()[1]],
        key=lambda x: x[1], reverse=True
    )
    return important_words

def count_matching_words(query_terms, important_words):
    matching_count = 0
    important_words_list = [word[0] for word in important_words]
    for term in query_terms:
        matching_count += important_words_list.count(term)
    return matching_count

In [211]:
get_url = 'http://localhost:9999/search?q=synthetic%20oil&limit=1000'
response = requests.get(get_url)

if response.status_code == 200:
    data = response.json()
    post_url = 'http://localhost:9999/title'
    post_response = requests.post(post_url, json=data)

    if post_response.status_code == 200:
        deals = []
        for deal in post_response.json():
            deals.append(deal)
        deals_df_ = pd.DataFrame(deals)
    else:
        print("HTTP POST error:", post_response.status_code)

else:
    print("Error status code:", response.status_code)

def compute_tfidf(text_series):
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform(text_series)
    return tfidf_matrix, vectorizer.get_feature_names_out()

Unnamed: 0,id,title,title_general,med_image,value,price,discount_percent,rating_count,rating_value,merchant_name
0,224-massage-palatine-17,One 60-Minute Couples Massage w/ Hot Stone and Essential Oil,"Experience relaxation at 224 Massage Palatine with couples massage options including hot stone and essential oil, up to 27% off",https://img.grouponcdn.com/bynder/H5kwJHxCyTfvBMA3xg6PPjD8XGm/H5-2048x1229/v1/t300x182.jpg,18900.0,18900.0,0.0,104,4.6,224 Massage Palatine
1,224-massage-palatine-23,One 60-Minute Swedish Massage with Hot Stone & Oil,"Experience relaxation at 224 Massage Palatine with a 60min Swedish massage, including hot stone and oil, up to 0%",https://img.grouponcdn.com/deal/2HU9v1gcCc9GGoVU1vqNwoWBa1M9/2H-2048x1229/v1/t300x182.jpg,8500.0,8500.0,0.0,104,4.6,224 Massage Palatine


In [237]:
deals_df = deals_df_[["id", "title", "title_general", "merchant_name"]]
deals_df.loc[:, "combined_text"] = (deals_df['title'].astype(str) + ' ' + 
                                    deals_df['title_general'].astype(str) + ' ' + 
                                    deals_df['merchant_name'].astype(str)).str.lower()
deals_df = deals_df[["id", "combined_text"]]
deals_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deals_df.loc[:, "combined_text"] = (deals_df['title'].astype(str) + ' ' +


Unnamed: 0,id,combined_text
0,224-massage-palatine-17,"one 60-minute couples massage w/ hot stone and essential oil experience relaxation at 224 massage palatine with couples massage options including hot stone and essential oil, up to 27% off 224 massage palatine"
1,224-massage-palatine-23,"one 60-minute swedish massage with hot stone & oil experience relaxation at 224 massage palatine with a 60min swedish massage, including hot stone and oil, up to 0% 224 massage palatine"
2,2nd-chance-auto-services,"premium mobile front & back brake pad replacement & oil change ensure vehicle safety with 2nd chance auto services' mobile brake pad replacement and oil change, up to 50% off 2nd chance auto services"
3,4-season-massage,60-minute couples therapeutic body massage with chinese pain-relief oil and hot-stone treatment for two 60 min couples body massage with chinese relief oil & hot stone for two at four seasons massage (up to 50% off) four seasons massage
4,44-heal-well-6,"60-minute massage with cbd oil experience 60 min massage with cbd oil at 44th street health & wellness, offering up to 89% off for a tranquil escape 44th street health & wellness"


In [240]:
query_terms = ['oli', 'massage', 'massage oil']

deals_df['important_words'] = deals_df['combined_text'].apply(get_important_words)
deals_df['matching_count'] = deals_df['important_words'].apply(lambda row: count_matching_words(query_terms, row))



deals_df.head()

Unnamed: 0,id,combined_text,important_words,matching_count
0,224-massage-palatine-17,"one 60-minute couples massage w/ hot stone and essential oil experience relaxation at 224 massage palatine with couples massage options including hot stone and essential oil, up to 27% off 224 massage palatine","[(massage, 0.5897678246195885), (couples, 0.29488391230979427), (hot, 0.29488391230979427), (stone, 0.29488391230979427), (essential, 0.29488391230979427), (oil, 0.29488391230979427), (palatine, 0.29488391230979427), (one, 0.14744195615489714), (minute, 0.14744195615489714), (experience, 0.14744195615489714), (relaxation, 0.14744195615489714), (options, 0.14744195615489714), (including, 0.14744195615489714)]",1
1,224-massage-palatine-23,"one 60-minute swedish massage with hot stone & oil experience relaxation at 224 massage palatine with a 60min swedish massage, including hot stone and oil, up to 0% 224 massage palatine","[(massage, 0.6172133998483676), (swedish, 0.3086066999241838), (hot, 0.3086066999241838), (stone, 0.3086066999241838), (oil, 0.3086066999241838), (palatine, 0.3086066999241838), (one, 0.1543033499620919), (minute, 0.1543033499620919), (experience, 0.1543033499620919), (relaxation, 0.1543033499620919), (min, 0.1543033499620919), (including, 0.1543033499620919)]",1
2,2nd-chance-auto-services,"premium mobile front & back brake pad replacement & oil change ensure vehicle safety with 2nd chance auto services' mobile brake pad replacement and oil change, up to 50% off 2nd chance auto services","[(mobile, 0.29488391230979427), (brake, 0.29488391230979427), (pad, 0.29488391230979427), (replacement, 0.29488391230979427), (oil, 0.29488391230979427), (change, 0.29488391230979427), (nd, 0.29488391230979427), (chance, 0.29488391230979427), (auto, 0.29488391230979427), (services, 0.29488391230979427), (premium, 0.14744195615489714), (front, 0.14744195615489714), (back, 0.14744195615489714), (ensure, 0.14744195615489714), (vehicle, 0.14744195615489714), (safety, 0.14744195615489714)]",0
3,4-season-massage,60-minute couples therapeutic body massage with chinese pain-relief oil and hot-stone treatment for two 60 min couples body massage with chinese relief oil & hot stone for two at four seasons massage (up to 50% off) four seasons massage,"[(massage, 0.5121475197315839), (couples, 0.25607375986579195), (body, 0.25607375986579195), (chinese, 0.25607375986579195), (relief, 0.25607375986579195), (oil, 0.25607375986579195), (hot, 0.25607375986579195), (stone, 0.25607375986579195), (two, 0.25607375986579195), (four, 0.25607375986579195), (seasons, 0.25607375986579195), (minute, 0.12803687993289598), (therapeutic, 0.12803687993289598), (pain, 0.12803687993289598), (treatment, 0.12803687993289598), (min, 0.12803687993289598)]",1
4,44-heal-well-6,"60-minute massage with cbd oil experience 60 min massage with cbd oil at 44th street health & wellness, offering up to 89% off for a tranquil escape 44th street health & wellness","[(massage, 0.34299717028501764), (cbd, 0.34299717028501764), (oil, 0.34299717028501764), (th, 0.34299717028501764), (street, 0.34299717028501764), (health, 0.34299717028501764), (wellness, 0.34299717028501764), (minute, 0.17149858514250882), (experience, 0.17149858514250882), (min, 0.17149858514250882), (offering, 0.17149858514250882), (tranquil, 0.17149858514250882), (escape, 0.17149858514250882)]",1
