In [58]:
import json
import sqlite3

import chromadb
import numpy as np
from chromadb.utils import embedding_functions
from keras_tuner import RandomSearch
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

In [59]:
TRAIN_CUTOFF = 2711  # 2424 2155 1880 1572 1016 817 502 260

In [60]:
def find_last_interested_entry(database_path):
    # Create a database connection
    conn = sqlite3.connect(database_path)
    cursor = conn.cursor()

    # SQL query to find the last occurrence of "interested" = 1
    query = """
    SELECT paper_id FROM papers 
    WHERE interested = 1 
    ORDER BY paper_id ASC 
    LIMIT 1;
    """

    try:
        cursor.execute(query)
        last_interested = cursor.fetchone()
        if last_interested:
            print("Last interested entry:", last_interested)
            return {"paper_id": last_interested[0]}
        else:
            print("No interested entries found.")
            return nil
    except sqlite3.Error as e:
        print("Database error:", e)
    finally:
        conn.close()

In [61]:
db = "../data/arxiv_papers.db"
last_interested = find_last_interested_entry(db)
print(json.dumps(last_interested, indent=4))

Last interested entry: ('http://arxiv.org/abs/2403.17287v1',)
{
    "paper_id": "http://arxiv.org/abs/2403.17287v1"
}


In [62]:
def get_data(database_path):
    # Create a database connection
    conn = sqlite3.connect(database_path)
    cursor = conn.cursor()
    cursor.row_factory = sqlite3.Row

    # SQL query to find the last occurrence of "interested" = 1
    query = """
    SELECT paper_id, concise_summary, interested FROM papers 
    ORDER BY paper_id ASC 
    LIMIT 2711;
    """

    try:
        cursor.execute(query)
        articles = cursor.fetchall()
        if articles:
            print(f"Got {len(articles)}.")
            return articles
        else:
            print("No interested entries found.")
            return nil
    except sqlite3.Error as e:
        print("Database error:", e)
    finally:
        conn.close()

In [63]:
def get_embedding(paper_id, vdb_path="../data/arxiv_embeddings.chroma"):
    vdb = chromadb.PersistentClient(vdb_path)
    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )
    embedding_func = sentence_transformer_ef
    vectors = vdb.get_or_create_collection(
        name="arxiver", embedding_function=embedding_func
    )

    res = vectors.get(ids=[paper_id], limit=1, include=["embeddings"])
    # print(res)
    # print(f'{res["ids"][0]} {res["embeddings"][0]}')
    return res["embeddings"][0]

In [64]:
# get_embedding(articles[0]["paper_id"])

In [65]:
articles = get_data(db)
X_article = []
y_article = []
for article in articles:
    # print(
    #     f'{article["paper_id"]}, {article["interested"]}\n{article["concise_summary"]}'
    # )
    X_article.append(get_embedding(article["paper_id"]))
    y_article.append(article["interested"])

# print(X_article[:3])
# print(y_article[:3])

X = np.array(X_article)
y = np.array(y_article)

# print(X[:1])
# print(y[:1])
print(X.shape, y.shape)

Got 2711.
(2711, 384) (2711,)


In [66]:
print(len(X[0]))

384


In [67]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [68]:
# Model definition
model = Sequential(
    [
        Dense(384, activation="relu", input_shape=(X_train.shape[1],)),
        Dropout(0.2),
        Dense(64, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [69]:
# Model training
model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7f40848370d0>

In [70]:
# Save
import datetime

formatted_time = datetime.datetime.now().strftime(f"%Y%m%d_%H%M")
model.save(f"model-{formatted_time}-{TRAIN_CUTOFF}.keras")

In [71]:
# Evaluation
predictions = model.predict(X_test) > 0.5
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.95      0.98      0.96       514
           1       0.17      0.07      0.10        29

    accuracy                           0.93       543
   macro avg       0.56      0.52      0.53       543
weighted avg       0.91      0.93      0.92       543



In [72]:
def get_new_data(database_path):
    # Create a database connection
    conn = sqlite3.connect(database_path)
    cursor = conn.cursor()
    cursor.row_factory = sqlite3.Row

    # SQL query to find the last occurrence of "interested" = 1
    query = """
    SELECT paper_id, concise_summary FROM papers
    ORDER BY paper_id ASC
    LIMIT 2000 OFFSET 2711;
    """

    try:
        cursor.execute(query)
        articles = cursor.fetchall()
        if articles:
            print(f"Got {len(articles)}.")
            return articles
        else:
            print("No interested entries found.")
            return nil
    except sqlite3.Error as e:
        print("Database error:", e)
    finally:
        conn.close()

In [73]:
new_ids = get_new_data(db)

new_ids[0]["paper_id"]

Got 2000.


'http://arxiv.org/abs/2404.03478v1'

In [93]:
# Print the predicted articles

new_x = []
formatted = []
for id in new_ids:
    new_x.append(get_embedding(id["paper_id"]))

new_preds = model.predict(new_x) > 0.5



In [94]:
for i in range(len(new_preds)):
    if new_preds[i] == True:
        paper_id = new_ids[i]["paper_id"]
        summary = new_ids[i]["concise_summary"]
        print(f"{paper_id}: {new_preds[i]}\n{summary}")
        formatted.append({"id": paper_id, "summary": summary})

http://arxiv.org/abs/2404.03502v1: [ True]
Artificial intelligence can boost productivity and generate insights, but its widespread use may have unintended consequences, such as harming public understanding. The reliance on recursive AI systems could lead to "knowledge collapse," impacting innovation and human culture, as demonstrated by a model where a discount on AI-generated content leads to public beliefs further from the truth. Further research is needed to address and mitigate these outcomes.
http://arxiv.org/abs/2404.03558v1: [ True]
Large language models have demonstrated the ability to perform new tasks with limited examples through in-context learning, with multi-task learning offering promising transfer learning potential. This research explores combining multi-task learning with in-context learning to create models that efficiently learn tasks and are robust to out-of-distribution examples, proposing curriculum learning strategies for higher data efficiency and stable conve

In [95]:
# for i in range(len(new_preds)):
#     if new_preds[i] == True:
#         print(f'{new_ids[i]["paper_id"]}')

print(formatted)

[{'id': 'http://arxiv.org/abs/2404.03502v1', 'summary': 'Artificial intelligence can boost productivity and generate insights, but its widespread use may have unintended consequences, such as harming public understanding. The reliance on recursive AI systems could lead to "knowledge collapse," impacting innovation and human culture, as demonstrated by a model where a discount on AI-generated content leads to public beliefs further from the truth. Further research is needed to address and mitigate these outcomes.'}, {'id': 'http://arxiv.org/abs/2404.03558v1', 'summary': 'Large language models have demonstrated the ability to perform new tasks with limited examples through in-context learning, with multi-task learning offering promising transfer learning potential. This research explores combining multi-task learning with in-context learning to create models that efficiently learn tasks and are robust to out-of-distribution examples, proposing curriculum learning strategies for higher da

In [78]:
# Retrieve article titles

import sys

# Add the parent directory to the Python path
sys.path.insert(0, "/home/woojay/P/ML/arxiver")

from arxiver.database import create_connection

conn = create_connection("../data/arxiv_papers.db")

if conn is not None:
    cursor = conn.cursor()

    for i in range(len(new_preds)):
        if new_preds[i] == True:
            # Fetch the specific entry
            cursor.execute(
                "SELECT paper_id, title, summary, concise_summary FROM papers WHERE paper_id = ?",
                (new_ids[i]["paper_id"],),
            )
            entry = cursor.fetchone()

            if not entry:
                conn.close()
                raise HTTPException(status_code=404, detail="Paper not found")

            paper_id, title, summary, concise_summary = entry

            print(f"{paper_id}: {title}")

http://arxiv.org/abs/2404.03502v1: AI and the Problem of Knowledge Collapse
http://arxiv.org/abs/2404.03558v1: How does Multi-Task Training Affect Transformer In-Context Capabilities?
  Investigations with Function Classes
http://arxiv.org/abs/2404.03631v1: Robust Concept Erasure Using Task Vectors
http://arxiv.org/abs/2404.03880v1: Semantic SQL -- Combining and optimizing semantic predicates in SQL
http://arxiv.org/abs/2404.03995v1: Balancing Progress and Responsibility: A Synthesis of Sustainability
  Trade-Offs of AI-Based Systems
http://arxiv.org/abs/2404.04204v1: Social Skill Training with Large Language Models
http://arxiv.org/abs/2404.04237v1: Cleared for Takeoff? Compositional & Conditional Reasoning may be the
  Achilles Heel to (Flight-Booking) Language Agents
http://arxiv.org/abs/2404.04286v1: Language Model Evolution: An Iterated Learning Perspective
http://arxiv.org/abs/2404.04289v1: Designing for Human-Agent Alignment: Understanding what humans want from
  their agents
ht

In [101]:
# Ask openAI to pick the best articles:

import json

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
client = OpenAI()


def choose_summaries(summaries, k):
    try:
        response = client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert summarizer capable of distilling complex information into its essence and a skilled evaluator of cutting edge ideas. Your choices should be based on the most interesting, novel, and cutting edge ideas.",
                },
                {
                    "role": "user",
                    "content": f"From the following article summaries, pick the {k} most interesting, novel, and cutting edge ideas and return a json list with 'id' and 'summary' for each. The id should contain the article id. You may also include a 'reason' for each choice.: {summaries}",
                },
            ],
            max_tokens=4096,
            temperature=0.0,
        )
        print(response.choices[0].message.content)
        response_content = (
            response.choices[0]
            .message.content.strip("`")
            .strip()
            .removeprefix("json\n")
        )

        # Debugging
        # print("Raw response content:", response_content)

        if response_content:
            parsed_response = json.loads(response_content)
            return parsed_response
        else:
            print("Response content is empty.")
            return []

    except json.JSONDecodeError as e:
        print("Failed to decode JSON:", e)
        return []
    except Exception as e:
        print("An error occurred:", e)
        return []

In [102]:
picks = choose_summaries(formatted, 5)

```json
[
  {
    "id": "http://arxiv.org/abs/2404.03502v1",
    "summary": "Artificial intelligence can boost productivity and generate insights, but its widespread use may have unintended consequences, such as harming public understanding. The reliance on recursive AI systems could lead to 'knowledge collapse,' impacting innovation and human culture, as demonstrated by a model where a discount on AI-generated content leads to public beliefs further from the truth. Further research is needed to address and mitigate these outcomes.",
    "reason": "The concept of 'knowledge collapse' due to AI is a novel and critical issue that could have profound implications on society and culture. It's a cutting-edge idea that challenges the current trajectory of AI development and calls for a deeper investigation into the long-term effects of AI on human knowledge and understanding."
  },
  {
    "id": "http://arxiv.org/abs/2404.04286v1",
    "summary": "Advancements in Large Language Models (LLMs)

In [89]:
picks

[]

In [107]:
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2

new_model = Sequential(
    [
        Dense(
            320,
            activation="relu",
            input_shape=(X_train.shape[1],),
            kernel_regularizer=l2(0.001),
        ),
        Dropout(0.0),
        BatchNormalization(),
        Dense(224, activation="relu", kernel_regularizer=l2(0.001)),
        Dropout(0.4),
        Dense(1, activation="sigmoid"),
    ]
)

new_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [108]:
# Model training
model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7f4061106d50>

In [109]:
# Print the predicted articles

new_x = []
formatted = []
for id in new_ids:
    new_x.append(get_embedding(id["paper_id"]))

new_preds = model.predict(new_x) > 0.5



In [110]:
for i in range(len(new_preds)):
    if new_preds[i] == True:
        paper_id = new_ids[i]["paper_id"]
        summary = new_ids[i]["concise_summary"]
        print(f"{paper_id}: {new_preds[i]}\n{summary}")
        formatted.append({"id": paper_id, "summary": summary})

http://arxiv.org/abs/2404.03502v1: [ True]
Artificial intelligence can boost productivity and generate insights, but its widespread use may have unintended consequences, such as harming public understanding. The reliance on recursive AI systems could lead to "knowledge collapse," impacting innovation and human culture, as demonstrated by a model where a discount on AI-generated content leads to public beliefs further from the truth. Further research is needed to address and mitigate these outcomes.
http://arxiv.org/abs/2404.03631v1: [ True]
The text discusses the use of Task Vectors (TV) for unconditionally erasing concepts from a text-to-image model, showing that TV-based erasure is more robust to unexpected user inputs but can impact the model's core performance. The proposed Diverse Inversion method helps estimate the required edit strength of the TV edit by finding diverse word embeddings that induce the generation of the target concept, allowing for selective erasure of model weig