In [1]:
import time
from pathlib import Path
import textwrap

import numpy as np
import torch
import transformers
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig)


from modules.GPTQ_loader import load_quantized
from modules.text_generation import generate_reply
import modules.shared as shared
from modules.model import load_model

import hashlib
import spacy
# Load the English language model
nlp = spacy.load("en_core_web_sm")


shared.model_name = "_Mistral-7b-gptq-4bit-32g-actorder_True"
shared.model_name = "_LLama2-7b-gptq-4bit-32g-actorder_True"

print(f"Loading {shared.model_name}...")
t0 = time.time()


shared.groupsize = 32
shared.wbits = 4
shared.model, shared.tokenizer = load_model(shared.model_name, gptq = True)

print(f"Loaded the model in {(time.time()-t0):.2f} seconds.")


import pandas as pd
df = pd.read_csv('Lancaster_sensorimotor_norms_for_39707_words.csv', header=0)  #the header is in the first row

shared.sensorimotor = df.set_index('Word').T.to_dict('dict')
shared.classes = ['Auditory.mean', 'Gustatory.mean','Haptic.mean','Interoceptive.mean','Olfactory.mean','Visual.mean','Foot_leg.mean','Hand_arm.mean','Head.mean','Mouth.mean','Torso.mean']



from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli",device=-1)

def secure_hash_to_numbers(input_string, range_list):

    
    nlp = spacy.load("en_core_web_sm")

    if (input_string is None):
        return [5,5]
    doc = nlp(input_string)

    core = [token.lemma_ for token in doc if not token.is_stop]

    core_str = " ".join(core)

    input_string = core_str

    

    labels = ['Scientific Concepts',
        'Technical Explanations',
        'Historical Context',
        'Cultural Insights',
        'Environmental Issues',
        'Health and Medicine',
        'Technological Developments',
        'Economic Theories',
        'Political Analysis',
        'Philosophical Concepts',
        'Educational Methods',
        'Psychological Theories',
        'Artistic Movements',
        'Literary Analysis',
        'Global Events',
        'Culinary Traditions',
        'Mathematical Concepts',
        'Physical Principles',
        'Astronomical Discoveries',
        'Geographical Information',
        'Social Dynamics',
        'Legal Interpretations',
        'Business Strategies',
        'Sports and Fitness',
        'Linguistic Features',
        'Techniques in Science and Technology']

    labels2 = ['Science and Technology',
        'Health and Environmental Issues',
        'Arts, Culture, and History',
        'Economic and Political Analysis',
        'Philosophy and Psychology',
        'Education and Learning Methods',
        'Global and Social Dynamics',
        'Legal and Ethical Discussions',
        'Business and Management',
        'Sports, Fitness, and Recreation',
        'Language and Literature']
        

    results = classifier(input_string, labels)
    predicted_label = results['labels'][0]
    label_index = labels.index(predicted_label)

    results2 = classifier(input_string, labels2)
    predicted_label2 = results2['labels'][0]
    label_index2 = labels2.index(predicted_label2)
   
    #print(input_string)
    #print([label_index2, label_index])
    #print("--------------------------")
    return [label_index2, label_index]
    
    
    #return result_numbers

def get_last_sentence(text):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    if sentences:
        return sentences[-1]
    else:
        return None  # Return None if there are no sentences


Loading _LLama2-7b-gptq-4bit-32g-actorder_True...
Loading _LLama2-7b-gptq-4bit-32g-actorder_True...
Auto-assiging --gpu-memory 23 for your GPU to try to prevent out-of-memory errors. You can manually set other values.
The AutoGPTQ params are: {'model_basename': 'model', 'device': 'cuda:0', 'use_triton': False, 'inject_fused_attention': False, 'inject_fused_mlp': False, 'use_safetensors': True, 'trust_remote_code': True, 'max_memory': {0: '23GiB', 'cpu': '64GiB'}, 'quantize_config': None, 'disable_exllama': True}


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Loaded the model in 28.70 seconds.

Loaded the model in 28.70 seconds.


In [3]:

prompts = [\
    #"How can you get from Graz to Vienna?"
    "What is a watermark?"
    #"What is motorsports, is it a real sport?",
]



results = []


for prompt in prompts:
#prompt = "What is the capital of France?"
    question = f'''<|im_start|>system
You are a helpful assistant, who always provide explanation. Don't enumerate anwsers, talk for the user, write links or urls or use numbers.<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start>assistant
'''

    generate_params = {
        'max_new_tokens' : 200,
        'add_bos_token' : False,
        'truncation_length' : 4096,
        'custom_stopping_strings' : ["### Human:", "Human:", "user:", "Q:","<|im_end|>","<|im_start|>system"],
        'ban_eos_token' : False,
        'skip_special_tokens' : False,
        'do_sample': True,
        'temperature': 0.7,
        'top_p': 0.95,
        'typical_p': 1,
        'repetition_penalty': 1.10,
        'encoder_repetition_penalty': 1,
        'top_k': 40,
        'num_beams': 1,
        'penalty_alpha': 0,
        'min_length': 0,
        'length_penalty': 1,
        'no_repeat_ngram_size': 0,
        'early_stopping': False,
        'seed' : 0,
    }

    #no watermark
    shared.delta_char = 0.0
    shared.delta_first = 0.0
    shared.secret_key = [0,0]
    done = False
    reply_base = ""
    reply_current = ""
    current_question = question
    shared.new_sentence = False
    shared.delta_senso = 0
    shared.start = 0
    i = 0
    while (done is not True and i < 4):
        reply_current, done = generate_reply(current_question, generate_params, eos_token='<|im_end|>')

        last_sentence = get_last_sentence(reply_current)
        #print("------------------found end of sentence, last sentence is:")
        #print(f'''[{reply_current}]''')
        shared.secret_key = secure_hash_to_numbers(last_sentence,[(0, 10), (0, 25)])
        shared.new_sentence = True
        shared.start = 0
        reply_base += reply_current
        current_question = f'''{current_question}{reply_current}'''
        i+=1



    #both
    shared.delta_char = 2.5#2.5
    shared.delta_first = 25.0#50
    shared.secret_key = [0,0]
    done = False
    reply_watermark = ""
    reply_current = ""
    current_question = question
    shared.new_sentence = False
    shared.delta_senso = 0
    shared.start = 0
    i = 0
    while (done is not True and i < 4):
        reply_current, done = generate_reply(current_question, generate_params, eos_token='<|im_end|>')

        last_sentence = get_last_sentence(reply_current)
        #print("------------------found end of sentence, last sentence is:")
        #print(f'''[{reply_current}]''')
        shared.secret_key = secure_hash_to_numbers(last_sentence,[(0, 10), (0, 25)])
        shared.new_sentence = True
        shared.start = 0
        reply_watermark += reply_current
        current_question = f'''{current_question}{reply_current}'''
        i+=1



    #results.append([prompt,reply_base,reply_watermark])
    results.append([prompt,reply_base,reply_watermark])


    #reply_watermark = generate_reply(question+reply_watermark, generate_params, eos_token='<|im_end|>')
    print(prompt)
    print(f'''[{reply_base}]''')
    print(f'''[{reply_watermark}]''')

What is a watermark?
[A watermark is a picture, logo or image embedded into your photo (within the photograph). It is used to identify the author of the photo and protect against plagiarism. It also helps to avoid copyright infringement when using others works (for instance, photos on blogs, articles, presentations, websites) without permission from the owner.]
[A watermark is a picture, logo or image embedded into your photo (within the photograph). It is used to identify the author of the photo and protect against plagiarism. But you can put anything you like on your photo: artwork, text message, signature, barcode, graphic design, etc. Many people use funny phrases, quotes or funny images as their watermarks. You can see examples here: http://www.]


In [19]:
import spacy
import hashlib
from scipy.stats import zscore
from scipy.stats import norm
from itertools import permutations
from collections import Counter
import math
import modules.shared as shared

from scipy.stats import binom

def split_into_sentences(text):
    doc = nlp(text)
    sentences = []
    current_sentence = []

    for token in doc:
        current_sentence.append(token.text)
        if token.text in (".", "?", "!"):
            sentences.append(" ".join(current_sentence))
            current_sentence = []

    # Add the last sentence if not followed by a punctuation mark
    if current_sentence:
        sentences.append(" ".join(current_sentence))

    return sentences

def get_words_in_sentence(sentence):
    doc = nlp(sentence)
    words = [token.text for token in doc if not token.is_punct and not token.is_space]
    return words

import pandas as pd
df = pd.read_csv('Lancaster_sensorimotor_norms_for_39707_words.csv', header=0)  #the header is in the first row

shared.sensorimotor = df.set_index('Word').T.to_dict('dict')
shared.classes = ['Auditory.mean', 'Gustatory.mean','Haptic.mean','Interoceptive.mean','Olfactory.mean','Visual.mean','Foot_leg.mean','Hand_arm.mean','Head.mean','Mouth.mean','Torso.mean']

mean_value = [1.51,0.32,1.07,1.03,0.39,2.90,0.81,1.45,2.28,1.26,0.82]
std_deviation = [0.99,0.70,0.93,0.88,0.62,0.90,0.75,0.91,0.72,0.90,0.67]


def calculate_probs(reply):
    z_scores = []
    probabilities = []
    correct_acrosticons = 0
    old_class = 0

    sentences = split_into_sentences(reply)
    for idx, sentence in enumerate(sentences, start=1):
        print(sentence)
        if idx > 1:
            if(chr(ord('A') + generated_numbers[1]) == sentence[:1]):
                
                correct_acrosticons += 1

        range_list = [(0, 10), (0, 25)] 
        generated_numbers = secure_hash_to_numbers(sentence, range_list)

        print("////////////////////////////////////////////////////////////////////////////////")
        print(shared.classes[generated_numbers[0]])
        


        sum_of_word_mean = 0
        valid_words = 0

        color_text = ""

        words = get_words_in_sentence(sentence)
        for word in words:
            if word.upper() in shared.sensorimotor:
                valid_words += 1
                sum_of_word_mean += shared.sensorimotor[word.upper()][shared.classes[old_class]]
                #print(f'''{word} : P={shared.sensorimotor[word.upper()][shared.classes[old_class]]}''')
                
                #color_text += f'''\colorword{{{val}}}{{{word}}} '''
                if shared.sensorimotor[word.upper()][shared.classes[old_class]] > mean_value[old_class]:
                    val = (shared.sensorimotor[word.upper()][shared.classes[old_class]]/5)*100
                    color_text += f'''\colorhighlight{{red}}{{{val}}}{{{word}}}'''
                else:
                    val = (shared.sensorimotor[word.upper()][shared.classes[old_class]]/5)*100
                    color_text += f'''\colorhighlight{{red}}{{{0}}}{{{word}}}'''
            else:
                color_text += f'''\colorhighlight{{red}}{{{0}}}{{{word}}}'''

        #print(color_text)
        
        if(valid_words > 0):
            sentence_value = sum_of_word_mean/valid_words
            print(f'''valid words {valid_words}, sentence_value {sentence_value}''')

            
            # Calculate Z-score for the new data point
            z_score = (sentence_value - mean_value[old_class]) / std_deviation[old_class]
        else:
            z_score = 0

        print(chr(ord('A') + generated_numbers[1]))
        # Calculate the probability using the cumulative distribution function (CDF)
        probabilities.append(1-norm.cdf(z_score))
        z_scores.append(z_score)

        old_class = generated_numbers[0]



    def probability_of_acrosticon(num_sentences_starting_with_A, total_sentences, probability_of_A=1/26):

        # Calculate the binomial probability
        return binom.pmf(num_sentences_starting_with_A, total_sentences, probability_of_A)


    acrosticon_restult = probability_of_acrosticon(correct_acrosticons, len(sentences)-1)
    #print(f"The Acrosticon probability is: {acrosticon_restult} for {correct_acrosticons} correct acrosticons in {len(sentences)-1} valid sentences.")

    # Example usage:
    stouffer_result = 1-norm.cdf(sum(z_scores)/ math.sqrt(len(probabilities)))
    #print(probabilities)
    #print(f"The Stouffer's method combinded z_score is the probability of: {stouffer_result}")
    #print(f"The total probability is: {stouffer_result*acrosticon_restult}")

    #return stouffer_result*acrosticon_restult, len(sentences), correct_acrosticons
    return stouffer_result*acrosticon_restult, stouffer_result, acrosticon_restult, len(sentences), correct_acrosticons

scores = []
for tuple in results:
    print("Base")
    score_base, stouffer_result_base, acrosticon_restult_base, len_base, acros_base = calculate_probs(tuple[1])
    print()
    print("Watermark")
    score_watermark, stouffer_result_watermark, acrosticon_restult_watermark, len_watermark, acros_watermark = calculate_probs(tuple[2])

    scores.append([[score_base,stouffer_result_base, acrosticon_restult_base,len_base,acros_base],[score_watermark, stouffer_result_watermark, acrosticon_restult_watermark, len_watermark, acros_watermark]])

print(scores)

Base
You can take the train ( ÖBB ) from Graz to Vienna , it takes 2 hours and 45 minutes and costs around 35 euro one way .
////////////////////////////////////////////////////////////////////////////////
Foot_leg.mean
valid words 14, sentence_value 1.792890282142857
T
The ÖBB is also offering a discount , if you buy your tickets online in advance .
////////////////////////////////////////////////////////////////////////////////
Head.mean
valid words 13, sentence_value 0.9628076602307694
W

Watermark
You can take the train ( ÖBB ) from Graz to Vienna , it takes 2 hours and 45 minutes and costs around 35 euro one way .
////////////////////////////////////////////////////////////////////////////////
Foot_leg.mean
valid words 14, sentence_value 1.792890282142857
T
The bus ride is faster ( 1h 45 min ) , but you have to change in Klagenfurt and you pay about 30 euros .
////////////////////////////////////////////////////////////////////////////////
Foot_leg.mean
valid words 15, sentence_va