In [280]:
import requests
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util

pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

prepositions_and_conjunctions = [
    "about", "above", "across", "after", "against", "along", "among", "around", "at",
    "before", "behind", "below", "beneath", "beside", "between", "beyond", "by",
    "during", "for", "from", "in", "inside", "into", "near", "of", "off", "on",
    "out", "outside", "over", "through", "throughout", "to", "toward", "under",
    "until", "up", "with", "within", "without", "and", "but", "or", "for", "nor",
    "so", "yet", "although", "because", "as", "since", "unless", "while", "when",
    "where", "after", "before"
]
pattern = r'\b(?:' + '|'.join(prepositions_and_conjunctions) + r')\b'

def remove_prepositions_and_conjunctions(text):
    cleaned_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    return re.sub(r'\s+', ' ', cleaned_text).strip()

def get_important_words(document):
    document = remove_prepositions_and_conjunctions(document)
    document = [document]

    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform(document)

    feature_names = vectorizer.get_feature_names_out()
    first_document_vector = tfidf_matrix[0]
    important_words = sorted(
        [(feature_names[i], first_document_vector[0, i]) for i in first_document_vector.nonzero()[1]],
        key=lambda x: x[1], reverse=True
    )
    return important_words

def count_matching_words(query_terms, important_words):
    matching_count = 0
    important_words_list = [word[0] for word in important_words]
    for term in query_terms:
        matching_count += important_words_list.count(term)
    return matching_count
    
def count_word_occurrences(query_terms, text):
    matching_count = 0
    
    # Projděte seznam slov
    for word in query_terms:
        # Spočítejte výskyty každého slova v textu
        matching_count += text.lower().count(word.lower())
    
    return matching_count

def proportion_of_key_terms(query_terms, deal_text):
    deal_text_lower = deal_text.lower()
    matching_count = sum(term.lower() in deal_text_lower for term in query_terms)
    proportion = matching_count / len(query_terms) if query_terms else 0
    
    return proportion

def calculate_distance(query_terms, deal_text):
    query_terms = [" ".join(query_terms)]
    text_embedding = model.encode(deal_text, convert_to_tensor=True)
    word_embeddings = model.encode(query_terms, convert_to_tensor=True)

    cos_similarities = util.pytorch_cos_sim(text_embedding, word_embeddings)
    distances = 1 - cos_similarities.cpu().numpy()
    
    return distances[0][0]
    
def average_word_length(text):
    words = text.split()
    
    if not words:
        return 0.0
    
    total_length = sum(len(word) for word in words)
    average_length = total_length / len(words)
    return average_length



In [281]:
get_url = 'http://localhost:9999/search?q=synthetic%20oil&limit=1000'
response = requests.get(get_url)

if response.status_code == 200:
    data = response.json()
    post_url = 'http://localhost:9999/title'
    post_response = requests.post(post_url, json=data)

    if post_response.status_code == 200:
        deals = []
        for deal in post_response.json():
            deals.append(deal)
        deals_df_ = pd.DataFrame(deals)
    else:
        print("HTTP POST error:", post_response.status_code)

else:
    print("Error status code:", response.status_code)

#def compute_tfidf(text_series):
#    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
#    tfidf_matrix = vectorizer.fit_transform(text_series)
#    return tfidf_matrix, vectorizer.get_feature_names_out()

In [282]:
deals_df = deals_df_[["id", "title", "title_general", "merchant_name"]]
deals_df.loc[:, "combined_text"] = (deals_df['title'].astype(str) + ' ' + 
                                    deals_df['title_general'].astype(str) + ' ' + 
                                    deals_df['merchant_name'].astype(str)).str.lower()
deals_df = deals_df[["id", "combined_text"]]
deals_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deals_df.loc[:, "combined_text"] = (deals_df['title'].astype(str) + ' ' +


Unnamed: 0,id,combined_text
0,224-massage-palatine-17,"experience relaxation at 224 massage palatine with couples massage options including hot stone and essential oil, up to 27% off 224 massage"
1,224-massage-palatine-23,"experience relaxation at 224 massage palatine with a 60min swedish massage, including hot stone and oil, up to 0% 224 massage"
2,2nd-chance-auto-services,"ensure vehicle safety with 2nd chance auto services' mobile brake pad replacement and oil change, up to 50% off 2nd chance auto services"
3,4-season-massage,60 min couples body massage with chinese relief oil & hot stone for two at four seasons massage (up to 50% off) 4 season massage
4,4-season-massage-3,"4 season massage offers therapeutic massage options including cupping, moxibustion, and cbd oil for up to 28% off. 4 season massage"


#### features prosessing

In [None]:
query_terms = ['oil', 'massage']
q_bigrams = ['massage oil', 'oil massage']

average_query_length = average_word_length(" ".join(query_terms))
deals_df['important_words'] = deals_df['combined_text'].apply(get_important_words)
deals_df['matching_count_important'] = deals_df['important_words'].apply(lambda row: count_matching_words(query_terms, row))
deals_df['matching_count_important_bigrams'] = deals_df['important_words'].apply(lambda row: count_matching_words(q_bigrams, row))
deals_df['matching_count'] = deals_df['combined_text'].apply(lambda row: count_word_occurrences(query_terms, row))
deals_df['matching_count_bigram'] = deals_df['combined_text'].apply(lambda row: count_word_occurrences(q_bigrams, row))
deals_df['proportion_of_key_terms'] = deals_df['combined_text'].apply(lambda row: proportion_of_key_terms(query_terms, row))
deals_df['word_embeding_distance'] = deals_df['combined_text'].apply(lambda row: calculate_distance(query_terms, row))
deals_df['average_word_lengths'] = deals_df['combined_text'].apply(average_word_length)
deals_df['average_word_query_lengths'] = average_query_length

deals_df.head(100)

('zde', 'je'): 1
('je', 'příklad'): 1
('příklad', 'textu'): 1
('textu', 'který'): 1
('který', 'obsahuje'): 1
('obsahuje', 'některá'): 1
('některá', 'slova'): 1
('slova', 'toto'): 1
('toto', 'slovo'): 1
('slovo', 'bude'): 1
('bude', 'analyzováno'): 1
('analyzováno', 'a'): 1
('a', 'budete'): 1
('budete', 'hledat'): 1
('hledat', 'bigramy'): 1
('bigramy', 'kolem'): 1
('kolem', 'něj'): 1
('něj', 'můžete'): 1
('můžete', 'také'): 1
('také', 'přidat'): 1
('přidat', 'další'): 1
('další', 'text'): 1
('text', 'k'): 1
('k', 'analýze'): 1


[nltk_data] Downloading package punkt to /Users/zphilipp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
