In [20]:
import numpy as np
import pandas as pd
from openai import OpenAI

In [None]:
# Shared OpenAI client; later cells use it for embedding and chat-completion requests.
client = OpenAI()

In [3]:
def cosine_similarity(a, b):
    """
    Return the cosine similarity of two vectors.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length (lists or NumPy arrays).

    Returns
    -------
    float
        dot(a, b) / (norm(a) * norm(b)). If either vector has zero norm the
        ratio is undefined, and 0.0 is returned instead of the `nan` (plus a
        RuntimeWarning) that a plain division would produce.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)

    # Guard the 0/0 case: a zero vector has no direction, so treat it as
    # "not similar" rather than letting nan leak into later sorting.
    if norm_product == 0:
        return 0.0

    return np.dot(a, b) / norm_product

In [7]:
# Load the papers table (text, n_tokens, embedding); the first CSV column becomes the row index.
df = pd.read_csv('papers_embedded.csv', index_col=0)

In [10]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,text,n_tokens,embedding
0,573 \n\nBIT - SERIAL NEURAL NETWORKS \n\nAlan...,7959.0,"[0.007057549431920052, 0.022557897493243217, -..."
1,1 \n\nCONNECTIVITY VERSUS ENTROPY \n\nYaser S...,5220.0,"[0.017669761553406715, -0.02821267582476139, 0..."
2,278 \n\nTHE HOPFIELD MODEL WITH MUL TI-LEVEL N...,4445.0,"[0.019363459199666977, -0.004505184479057789, ..."
3,442 \n\nAlan Lapedes \nRobert Farber \n\nThe...,7942.0,"[0.006703744176775217, -0.002929536160081625, ..."
4,740 \n\nSPATIAL ORGANIZATION OF NEURAL NEn...,7980.0,"[-0.019718386232852936, 0.021891098469495773, ..."


In [14]:
# Drop every row that contains a missing value; this mutates `df` in place.
df.dropna(inplace=True)

In [15]:
# Count missing values per column; all zeros confirms the dropna worked.
df.isna().sum()

text         0
n_tokens     0
embedding    0
dtype: int64

In [16]:
# Turn the stringified embedding lists read from the CSV into NumPy arrays.
# Values that are no longer strings (i.e. already parsed) are passed straight
# to np.array, so re-running this cell does not fail inside eval().
# NOTE(review): eval() executes whatever the CSV contains; this is only safe
# while the file is trusted. Consider ast.literal_eval or json.loads instead.
df['embedding'] = df['embedding'].apply(
    lambda raw: np.array(eval(raw) if isinstance(raw, str) else raw)
)

In [17]:
# Preview the frame again after the embedding column has been converted.
df.head()

Unnamed: 0,text,n_tokens,embedding
0,573 \n\nBIT - SERIAL NEURAL NETWORKS \n\nAlan...,7959.0,"[0.007057549431920052, 0.022557897493243217, -..."
1,1 \n\nCONNECTIVITY VERSUS ENTROPY \n\nYaser S...,5220.0,"[0.017669761553406715, -0.02821267582476139, 0..."
2,278 \n\nTHE HOPFIELD MODEL WITH MUL TI-LEVEL N...,4445.0,"[0.019363459199666977, -0.004505184479057789, ..."
3,442 \n\nAlan Lapedes \nRobert Farber \n\nThe...,7942.0,"[0.006703744176775217, -0.002929536160081625, ..."
4,740 \n\nSPATIAL ORGANIZATION OF NEURAL NEn...,7980.0,"[-0.019718386232852936, 0.021891098469495773, ..."


In [19]:
# Inspect the column dtypes after the conversion.
df.dtypes

text          object
n_tokens     float64
embedding     object
dtype: object

In [74]:
def create_context(
    question, max_len=8000
):
    """
    Assemble a context string for `question` from the texts in the global `df`.

    The question is embedded with 'text-embedding-3-small', every row of `df`
    is scored against that embedding with cosine_similarity, and texts are
    collected from the highest score downwards until the running token count
    would exceed `max_len`. Each text is charged its `n_tokens` plus 4
    (presumably an allowance for the separator -- TODO confirm).

    Side effects: overwrites the `distances` column of the global `df` and
    prints how many texts were selected.

    Returns the selected texts joined by a '###' separator line; an empty
    string when even the best match does not fit.
    """

    # Embed the question.
    embedding_reply = client.embeddings.create(input=question, model='text-embedding-3-small')
    question_vec = embedding_reply.data[0].embedding

    # Despite the column name, these are cosine similarities: higher means closer.
    df['distances'] = df['embedding'].apply(
        lambda doc_vec: cosine_similarity(doc_vec, question_vec)
    )

    # Best matches first.
    ranked = df.sort_values('distances', ascending=False)

    picked = []
    tokens_used = 0
    for _, entry in ranked.iterrows():
        tokens_used += entry['n_tokens'] + 4

        # Stop at the first text that overflows the budget; lower-ranked
        # texts are not tried even if they would still fit.
        if tokens_used > max_len:
            break

        picked.append(entry["text"])

    print(len(picked))

    return "\n\n###\n\n".join(picked)

In [75]:
# Build the context for the CRoSS question with max_len raised to 64000 (the function's default is 8000).
context = create_context("Can you summarize the main findings of 'CRoSS: Diffusion Model Makes Controllable, Robust and Secure Image Steganography'?", max_len=64000)

10


In [76]:
# Show the assembled context text.
print(context)

CRoSS: Diffusion Model Makes
Controllable, Robust and Secure Image Steganography
Jiwen Yu1
Xuanyu Zhang1
Youmin Xu1,2
Jian Zhang1†
1 Peking University Shenzhen Graduate School
2 Peng Cheng Laboratory
Abstract
Current image steganography techniques are mainly focused on cover-based meth-
ods, which commonly have the risk of leaking secret images and poor robustness
against degraded container images. Inspired by recent developments in diffu-
sion models, we discovered that two properties of diffusion models, the ability to
achieve translation between two images without training, and robustness to noisy
data, can be used to improve security and natural robustness in image steganogra-
phy tasks. For the choice of diffusion model, we selected Stable Diffusion, a type
of conditional diffusion model, and fully utilized the latest tools from open-source
communities, such as LoRAs and ControlNets, to improve the controllability and
diversity of container images. In summary, we propose a novel i

In [44]:
# Ad-hoc check: embed a short query directly, outside create_context.
q_embeddings = client.embeddings.create(input="CRoSS: Diffusion Model", model='text-embedding-3-small').data[0].embedding

In [45]:
# Score every stored embedding against the query; the 'distances' column holds cosine similarities (higher = closer).
df['distances'] = df['embedding'].apply(lambda x: cosine_similarity(x, q_embeddings))

In [None]:
sorted = df.sort_values('distances', ascending=False)

In [54]:
sorted.head()

Unnamed: 0,text,n_tokens,embedding,distances
1154,W$21\nCË\n)CJ\nYK|\n7u\n¢¶F¡\nU 6Ó\n...,7645.0,"[0.008818204514682293, -0.009038213640451431, ...",0.078187
22493,HAN\nJoMoLD\nVALOR\nVALOR++\nMethods\n0\n10\n2...,7977.0,"[0.006200759205967188, -0.01920808106660843, -...",0.078344
940,\nß\né\nã\nð\n\nï\n\nä\nã\nð\ná\n\nß\nü\nß...,6141.0,"[0.020025920122861862, 0.008722320199012756, -...",0.078858
20837,REALTIME QA: What’s the Answer Right Now?\nJun...,7920.0,"[0.0003461475716903806, 0.02710149809718132, 0...",0.079549
20168,Makes 4 servings (about 1 ½ cups each) Recipe ...,7857.0,"[0.019085267558693886, 0.018926342949271202, -...",0.080477


In [68]:
def answer_question(
    model="gpt-4o-mini",
    question="Can you summarize the main findings of 'CRoSS: Diffusion Model Makes Controllable, Robust and Secure Image Steganography'?",
    max_len=8000,
    debug=False,
    max_tokens=150,
    stop_sequence=None
):
    """
    Answer a question based on the most similar context from the dataframe texts.

    Parameters
    ----------
    model : str
        Chat model name passed to the chat-completions request.
    question : str
        The question to answer; also used to retrieve the context.
    max_len : int
        Token budget handed to create_context when assembling the context.
    debug : bool
        When True, print the retrieved context before calling the model.
    max_tokens : int
        Accepted for interface compatibility but NOT forwarded to the API.
        NOTE(review): forwarding it with the current default of 150 would start
        truncating answers for every existing caller; change the default to
        None before wiring it in.
    stop_sequence : str | list[str] | None
        Optional stop sequence(s). Forwarded as `stop` only when not None, so
        default calls send exactly the same request as before.

    Returns
    -------
    str
        The content of the first returned choice, or "" if the request raised
        (the exception is printed, not re-raised).
    """
    context = create_context(
        question,
        max_len=max_len
    )
    # If debug, print the retrieved context that is about to be sent to the model
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    # Optional request arguments: only include what the caller explicitly asked for.
    optional_args = {}
    if stop_sequence is not None:
        optional_args["stop"] = stop_sequence

    try:
        # Create a chat completion using the question and context
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\n"},
                {"role": "user", "content": f"Question: {question}\n\n---\n\nContext: {context}"}
            ],
            **optional_args
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best-effort by design: report the failure and hand back an empty answer.
        print(e)
        return ""

In [70]:
# Ask the default question with every default setting (model, max_len, etc.).
answer = answer_question()

1


In [71]:
# Display the model's answer.
print(answer)

The main findings of the paper "CRoSS: Diffusion Model Makes Controllable, Robust and Secure Image Steganography" are as follows:

1. **Introduction of CRoSS Framework**: The authors propose a novel image steganography framework called CRoSS, which utilizes diffusion models to enhance controllability, robustness, and security without requiring additional training.

2. **Limitations of Existing Methods**: Current image steganography techniques often face issues such as information leakage, poor robustness against degradation, and limited controllability. These issues motivate the need for new approaches.

3. **Advantages of Diffusion Models**: The framework leverages two key properties of diffusion models: their ability to translate images without training and their robustness against noise. This makes diffusion models suitable for steganography tasks.

4. **Unification of Goals**: CRoSS aims to achieve security (ensuring the hidden image cannot be leaked), controllability (allowing the