In [1]:
pd.set_option('display.max_colwidth', -1) # damit der komplette Output im Notebook angezeigt wird

NameError: name 'pd' is not defined

## Parrot Libary Test (Pretrained Model)

In [1]:
from parrot import Parrot
import torch
import warnings
warnings.filterwarnings("ignore")

In [2]:
def random_state(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

random_state(1234)

In [3]:
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5", use_gpu=True)

KeyboardInterrupt: 

In [5]:
phrases = ["a suit for a casual meeting", "beautiful dresses", "a t-shirt with logo print"]

In [6]:
for phrase in phrases:
    print("\n", "-"*100)
    print("Input_phrase: ", phrase)
    print("-"*100)
    para_phrases = parrot.augment(input_phrase=phrase)
    for para_phrase in para_phrases:
        print(para_phrase)


 ----------------------------------------------------------------------------------------------------
Input_phrase:  a suit for a casual meeting
----------------------------------------------------------------------------------------------------
('dress for casual meetings', 18)
('a suit for a casual meeting', 12)

 ----------------------------------------------------------------------------------------------------
Input_phrase:  beautiful dresses
----------------------------------------------------------------------------------------------------
('wonderful dresses', 17)
('pretty dresses', 17)

 ----------------------------------------------------------------------------------------------------
Input_phrase:  a t-shirt with logo print
----------------------------------------------------------------------------------------------------
('a t-shirt with the logo printed', 18)
('a t-shirt with logo print', 12)


## BART selbst trainieren

In [2]:
import os
from datetime import datetime
import logging

import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs

In [3]:
import warnings

import pandas as pd


def load_data(
    file_path, input_text_column, target_text_column, label_column, keep_label=1
):
    df = pd.read_csv(file_path, sep="\t", error_bad_lines=False)
    df = df.loc[df[label_column] == keep_label]
    df = df.rename(
        columns={input_text_column: "input_text", target_text_column: "target_text"}
    )
    df = df[["input_text", "target_text"]]
    df["prefix"] = "paraphrase"

    return df


def clean_unnecessary_spaces(out_string):
    if not isinstance(out_string, str):
        warnings.warn(f">>> {out_string} <<< is not a string.")
        out_string = str(out_string)
    out_string = (
        out_string.replace(" .", ".")
        .replace(" ?", "?")
        .replace(" !", "!")
        .replace(" ,", ",")
        .replace(" ' ", "'")
        .replace(" n't", "n't")
        .replace(" 'm", "'m")
        .replace(" 's", "'s")
        .replace(" 've", "'ve")
        .replace(" 're", "'re")
    )
    return out_string


In [4]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

In [5]:
# Google Data
train_df = pd.read_csv("data/train.tsv", sep="\t").astype(str)
eval_df = pd.read_csv("data/dev.tsv", sep="\t").astype(str)

train_df = train_df.loc[train_df["label"] == "1"]
eval_df = eval_df.loc[eval_df["label"] == "1"]

train_df = train_df.rename(
    columns={"sentence1": "input_text", "sentence2": "target_text"}
)
eval_df = eval_df.rename(
    columns={"sentence1": "input_text", "sentence2": "target_text"}
)

train_df = train_df[["input_text", "target_text"]]
eval_df = eval_df[["input_text", "target_text"]]

train_df["prefix"] = "paraphrase"
eval_df["prefix"] = "paraphrase"

In [6]:
print(train_df.shape)
print(eval_df.shape)

(21829, 3)
(3539, 3)


In [7]:
print(train_df.iloc[:1]['input_text'])
print(train_df.iloc[:1]['target_text'])
print(eval_df.iloc[:1]['input_text'])
print(eval_df.iloc[:1]['target_text'])

1    The NBA season of 1975 -- 76 was the 30th seas...
Name: input_text, dtype: object
1    The 1975 -- 76 season of the National Basketba...
Name: target_text, dtype: object
1    They were there to enjoy us and they were ther...
Name: input_text, dtype: object
1    They were there for us to enjoy and they were ...
Name: target_text, dtype: object


In [8]:
# MSRP Data
train_df = pd.concat(
    [
        train_df,
        load_data("data/msr_paraphrase_train.txt", "#1 String", "#2 String", "Quality"),
    ]
)
eval_df = pd.concat(
    [
        eval_df,
        load_data("data/msr_paraphrase_test.txt", "#1 String", "#2 String", "Quality"),
    ]
)

b'Skipping line 102: expected 5 fields, saw 6\nSkipping line 656: expected 5 fields, saw 6\nSkipping line 867: expected 5 fields, saw 6\nSkipping line 880: expected 5 fields, saw 6\nSkipping line 980: expected 5 fields, saw 6\nSkipping line 1439: expected 5 fields, saw 6\nSkipping line 1473: expected 5 fields, saw 6\nSkipping line 1822: expected 5 fields, saw 6\nSkipping line 1952: expected 5 fields, saw 6\nSkipping line 2009: expected 5 fields, saw 6\nSkipping line 2230: expected 5 fields, saw 6\nSkipping line 2506: expected 5 fields, saw 6\nSkipping line 2523: expected 5 fields, saw 6\nSkipping line 2809: expected 5 fields, saw 6\nSkipping line 2887: expected 5 fields, saw 6\nSkipping line 2920: expected 5 fields, saw 6\nSkipping line 2944: expected 5 fields, saw 6\nSkipping line 3241: expected 5 fields, saw 6\nSkipping line 3358: expected 5 fields, saw 6\nSkipping line 3459: expected 5 fields, saw 6\nSkipping line 3491: expected 5 fields, saw 6\nSkipping line 3643: expected 5 fields

In [9]:
# Quora Data

# The Quora Dataset is not separated into train/test, so we do it manually the first time.
# df = load_data(
#     "data/quora_duplicate_questions.tsv", "question1", "question2", "is_duplicate"
# )
# q_train, q_test = train_test_split(df)

# q_train.to_csv("data/quora_train.tsv", sep="\t")
# q_test.to_csv("data/quora_test.tsv", sep="\t")

In [10]:
q_train = pd.read_csv("data/quora_train.tsv", sep="\t")
q_test = pd.read_csv("data/quora_test.tsv", sep="\t")

In [11]:
train_df = pd.concat([train_df, q_train])
eval_df = pd.concat([eval_df, q_test])

train_df = train_df[["prefix", "input_text", "target_text"]]
eval_df = eval_df[["prefix", "input_text", "target_text"]]

In [12]:
train_df = train_df.dropna()
eval_df = eval_df.dropna()

In [13]:
train_df["input_text"] = train_df["input_text"].apply(clean_unnecessary_spaces)
train_df["target_text"] = train_df["target_text"].apply(clean_unnecessary_spaces)

eval_df["input_text"] = eval_df["input_text"].apply(clean_unnecessary_spaces)
eval_df["target_text"] = eval_df["target_text"].apply(clean_unnecessary_spaces)

In [14]:
print(train_df)

            prefix                                         input_text  \
1       paraphrase  The NBA season of 1975 -- 76 was the 30th seas...   
3       paraphrase  When comparable rates of flow can be maintaine...   
4       paraphrase  It is the seat of Zerendi District in Akmola R...   
5       paraphrase  William Henry Henry Harman was born on 17 Febr...   
7       paraphrase  With a discrete amount of probabilities Formul...   
...            ...                                                ...   
111942  paraphrase  What was the craziest dream that you've ever had?   
111943  paraphrase             How do I increase height at age of 16?   
111944  paraphrase  If superconductors have infinite permeability ...   
111945  paraphrase  How can I contact someone on Quora and send pr...   
111946  paraphrase  Why should Jayalalittha be awarded by Bharat R...   

                                              target_text  
1       The 1975 -- 76 season of the National Basketba...  
3  

In [15]:
model_args = Seq2SeqArgs()
model_args.eval_batch_size = 16
model_args.evaluate_during_training = True
model_args.evaluate_during_training_steps = 2500
model_args.evaluate_during_training_verbose = True
model_args.fp16 = False
model_args.learning_rate = 5e-5
model_args.max_seq_length = 128
model_args.num_train_epochs = 2
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.save_eval_checkpoints = False
model_args.save_steps = -1
model_args.train_batch_size = 4
model_args.use_multiprocessing = False

model_args.do_sample = True
model_args.num_beams = None
model_args.num_return_sequences = 3
model_args.max_length = 128
model_args.top_k = 30
model_args.top_p = 0.95

model_args.wandb_project = "Paraphrasing with BART"
model_args.WANDB_NOTEBOOK_NAME = "Paraphrasing for Semantic Search"


model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-large",
    args=model_args,
)

In [33]:
import gc
import torch
gc.collect()
# torch.cuda.empty_cache()

4299

In [None]:
model.train_model(train_df, eval_data=eval_df)

In [None]:
to_predict = [
    prefix + ": " + str(input_text)
    for prefix, input_text in zip(
        eval_df["prefix"].tolist(), eval_df["input_text"].tolist()
    )
]
truth = eval_df["target_text"].tolist()

preds = model.predict(to_predict)

# Saving the predictions if needed
os.makedirs("predictions", exist_ok=True)

with open(f"predictions/predictions_{datetime.now()}.txt", "w") as f:
    for i, text in enumerate(eval_df["input_text"].tolist()):
        f.write(str(text) + "\n\n")

        f.write("Truth:\n")
        f.write(truth[i] + "\n\n")

        f.write("Prediction:\n")
        for pred in preds[i]:
            f.write(str(pred) + "\n")
        f.write(
            "________________________________________________________________________________\n"
        )


## Pretrained Sentence Transformers

In [1]:
import json
import pandas as pd
import re

In [2]:
with open(r"E:\Users\Lucas xD\Downloads\Products_Q_US_edited.json", encoding="utf8") as json_file:
    data = json.load(json_file)

In [3]:
df = pd.json_normalize(data)
df = df.drop(columns=['brand', 'colors', 'gender', 'productId', 'sizes', 'styleName', 'variants', 'image', 'id', 'longDescription'])

df.head()

Unnamed: 0,name,shortDescription
0,Stretch Cotton V-Neck T-Shirt | Dredosos,This t-shirt by HUGO is crafted from cotton wi...
1,Leather belt with embossed detail,Upgrade your everyday collection with this tim...
2,Italian Leather Derby Dress Shoe | Prindo,Crafted from fine Italian calfskin with a prin...
3,"Virgin Wool Tuxedo, Regular Fit | Stars/Glamour",This regular fit tuxedo by BOSS is crafted in ...
4,"Italian Virgin Wool Suit, Slim Fit | Huge/Genius",Our best-selling suit just got better with an ...


In [4]:
df = df.stack().reset_index() # all in one column
df = df.drop(columns=['level_0', 'level_1'])
df.columns = ['sentences']
df.head()

Unnamed: 0,sentences
0,Stretch Cotton V-Neck T-Shirt | Dredosos
1,This t-shirt by HUGO is crafted from cotton wi...
2,Leather belt with embossed detail
3,Upgrade your everyday collection with this tim...
4,Italian Leather Derby Dress Shoe | Prindo


In [5]:
def pre_process(text):
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("</?.*?>"," <> ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    return text

In [6]:
df['sentences'] = df['sentences'].apply(lambda x:pre_process(x))
df.to_csv('./data/processedData.csv', index=False)
df.head()

Unnamed: 0,sentences
0,stretch cotton v neck t shirt dredosos
1,this t shirt by hugo is crafted from cotton wi...
2,leather belt with embossed detail
3,upgrade your everyday collection with this tim...
4,italian leather derby dress shoe prindo


In [7]:
df = pd.read_csv('./data/processedData.csv', )
df['sentences'] = df['sentences'].astype(str)
df.head()

Unnamed: 0,sentences
0,stretch cotton v neck t shirt dredosos
1,this t shirt by hugo is crafted from cotton wi...
2,leather belt with embossed detail
3,upgrade your everyday collection with this tim...
4,italian leather derby dress shoe prindo


### mit Tensorflow

In [19]:
from tensorflow.keras.losses import cosine_similarity
from tensorflow import math
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')
corpus = df['sentences'].tolist()
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# queries = ['white shoes that are confortable to wear', 
#             'a black suit for a formal event',
#             'a dress for a summer evening walk on the beach']
queries = ['shoes that are comfortable to wear']
top_k = 3
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    cos_scores = math.abs(cosine_similarity(
        query_embedding.cpu(),
        corpus_embeddings.cpu(),
        axis=-1))
    top_results = math.top_k(cos_scores, k=5)

    print("\n======================")
    print("Query:", query)
    print("Top 5 most similar sentences in product data:")

    for score, idx in zip(top_results.values.numpy(), top_results.indices.numpy()):
        if(score > 0.5):
            print(corpus[idx][:67], "(Score: {:.4f})".format(score))
  


Query: shoes that are comfortable to wear
Top 5 most similar sentences in product data:
low top sneakers in mixed materials (Score: 0.7513)
running style sneakers in tonal nappa leather and mesh (Score: 0.7508)
modern sneakers by boss crafted with uppers in nappa leather suede  (Score: 0.7499)
leather oxford shoes with modern broguing (Score: 0.7470)
low top sneakers in hybrid materials (Score: 0.7427)


### mit Torch

In [44]:
embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')

corpus = df['sentences'].tolist()
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# corpus_embeddings = sentence_embeddings

queries = ['white shoes that are confortable to wear', 
            'a black suit for a formal event',
            'a dress for a summer evening walk on the beach']
top_k = 3
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n======================")
    print("Query:", query)
    print("Top 3 most similar sentences in product data:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx][:50], "(Score: {:.4f})".format(score))



Query: white shoes that are confortable to wear
Top 3 most similar sentences in corpus:
modern sneakers by boss crafted with uppers in nap (Score: 0.7399)
low top sneakers in mixed materials (Score: 0.7361)
running style sneakers in tonal nappa leather and  (Score: 0.7302)

Query: a black suit for a formal event
Top 3 most similar sentences in corpus:
a contemporary tuxedo by boss menswear cut to a de (Score: 0.6458)
slim fit tuxedo with silk trims and pocket square (Score: 0.6263)
a two piece tuxedo by boss menswear cut to a narro (Score: 0.6233)

Query: a dress for a summer evening walk on the beach
Top 3 most similar sentences in corpus:
short sleeved dress with sparkly pleated skirt (Score: 0.5366)
long length sleeveless dress in silk with waterfal (Score: 0.5312)
tie neck evening dress in lustrous fabric (Score: 0.5299)


In [10]:
def calcZeit(t):
  return f"{t/60/60} Stunden :("

calcZeit(5400)

'1.5 Stunden :('

In [20]:
import pandas as pd
import re
from tensorflow.keras.losses import cosine_similarity
from tensorflow import math
from sentence_transformers import SentenceTransformer


queries = 'white shoes that are confortable to wear'

def loadProdData():
    df = pd.read_csv('./data/processedData.csv', )
    df['sentences'] = df['sentences'].astype(str)
    return df['sentences'].tolist()


def loadProdTensor():
    tensor = torch.load('./data/corpus_embeddings.pt')
    return tensor


def calc_similarity(query):
    embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    prod_list = loadProdData()
    corpus_embeddings = loadProdTensor()

    query_embedding = embedder.encode(query, convert_to_tensor=True)
    cos_scores = math.abs(cosine_similarity(
        query_embedding.cpu(),
        corpus_embeddings.cpu(),
        axis=-1))
    top_results = math.top_k(cos_scores, k=3)

    relevant_documents = []
    for score, idx in zip(top_results.values.numpy(), top_results.indices.numpy()):
        if(score > 0.7):
            relevant_documents.append({'text': prod_list[idx], 'score': score})

    return relevant_documents

rel = calc_similarity(queries)
for t in rel:
    print(f"der text:{t['text'][:30]}, der score {t['score']}")

der text:modern sneakers by boss crafte, der score 0.739911675453186
der text:low top sneakers in mixed mate, der score 0.7360946536064148
der text:running style sneakers in tona, der score 0.7301977872848511


In [22]:
bla = [{'id': 2, 'score': 2.3},{'id': 3, 'score': 2.3},{'id': 1, 'score': 2.3}]
if not any(d['id'] == 21 for d in bla):
    print('nicht drin')
else:
    print('drin')

nicht drin
