In [2]:
import json
import sqlite3

import chromadb
import numpy as np
from chromadb.utils import embedding_functions
from keras_tuner import RandomSearch
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

2024-05-15 20:11:08.831791: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-15 20:11:08.862027: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-15 20:11:08.862051: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-15 20:11:08.862729: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-15 20:11:08.867692: I tensorflow/core/platform/cpu_feature_guar

In [3]:
TRAIN_CUTOFF = 3003  # 2711 2424 2155 1880 1572 1016 817 502 260

In [4]:
def find_last_interested_entry(database_path):
    # Create a database connection
    conn = sqlite3.connect(database_path)
    cursor = conn.cursor()

    # SQL query to find the last occurrence of "interested" = 1
    query = """
    SELECT paper_id FROM papers 
    WHERE interested = 1 
    ORDER BY paper_id ASC 
    LIMIT 1;
    """

    try:
        cursor.execute(query)
        last_interested = cursor.fetchone()
        if last_interested:
            print("Last interested entry:", last_interested)
            return {"paper_id": last_interested[0]}
        else:
            print("No interested entries found.")
            return nil
    except sqlite3.Error as e:
        print("Database error:", e)
    finally:
        conn.close()

In [5]:
db = "../data/arxiv_papers.db"
last_interested = find_last_interested_entry(db)
print(json.dumps(last_interested, indent=4))

Last interested entry: ('http://arxiv.org/abs/1706.03762v7',)
{
    "paper_id": "http://arxiv.org/abs/1706.03762v7"
}


In [6]:
def get_data(database_path):
    # Create a database connection
    conn = sqlite3.connect(database_path)
    cursor = conn.cursor()
    cursor.row_factory = sqlite3.Row

    query = f"""
    SELECT paper_id, concise_summary, interested FROM papers 
    ORDER BY paper_id ASC 
    LIMIT {TRAIN_CUTOFF};
    """

    try:
        cursor.execute(query)
        articles = cursor.fetchall()
        if articles:
            print(f"Got {len(articles)}.")
            return articles
        else:
            print("No interested entries found.")
            return nil
    except sqlite3.Error as e:
        print("Database error:", e)
    finally:
        conn.close()

In [7]:
def get_embedding(paper_id, vdb_path="../data/arxiv_embeddings.chroma"):
    vdb = chromadb.PersistentClient(vdb_path)
    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )
    embedding_func = sentence_transformer_ef
    vectors = vdb.get_or_create_collection(
        name="arxiver", embedding_function=embedding_func
    )

    res = vectors.get(ids=[paper_id], limit=1, include=["embeddings"])
    # print(res)
    # print(res["embeddings"][0])
    # print(f'{res["ids"][0]} {res["embeddings"][0]}')
    return res["embeddings"][0]

In [9]:
articles = get_data(db)
X_article = []
y_article = []
for article in articles:
    # print(
    #     f'{article["paper_id"]}, {article["interested"]}\n{article["concise_summary"]}'
    # )
    embedding = get_embedding(article["paper_id"])
    interested = article["interested"]

    if np.any(np.isnan(embedding)):
        print(f'{article["paper_id"]} embedding is NaN: {embedding}')
    if len(embedding) == 0:
        print(f'{article["paper_id"]} embedding is empty')
    # if interested.dtype == "object":
    #     print(f'{article["paper_id"]} embedding is object: {interested}')
    X_article.append(embedding)
    y_article.append(interested)

# print(X_article[:3])
# print(y_article[:3])

X = np.array(X_article)
y = np.array(y_article)

# print(X[:1])
# print(y[:1])
print(X.shape, y.shape)

Got 3003.




(3003, 384) (3003,)


In [10]:
get_embedding(articles[0]["paper_id"])

[-0.0028905950020998716,
 -0.05982412025332451,
 -0.008419168181717396,
 -0.01921667717397213,
 0.014117264188826084,
 0.031683020293712616,
 -0.04353299364447594,
 0.07392609864473343,
 0.07040511816740036,
 -0.05495050922036171,
 -0.011163090355694294,
 -0.010397370904684067,
 0.04608140140771866,
 0.009848348796367645,
 -0.010315321385860443,
 0.013171806000173092,
 0.0007975147454999387,
 0.023529132828116417,
 -0.08075197786092758,
 -0.06419838964939117,
 0.040608182549476624,
 0.04551651328802109,
 0.03515498712658882,
 -0.011132987216114998,
 0.03609246760606766,
 0.014492835849523544,
 -0.04349182918667793,
 -0.07201968133449554,
 -0.03566231578588486,
 -0.0035466880071908236,
 -0.0017921729013323784,
 -1.6525307728443295e-05,
 -0.003994284197688103,
 0.11520082503557205,
 -0.042225077748298645,
 0.021955693140625954,
 -0.0885772779583931,
 0.0006367009482346475,
 -0.0018035508692264557,
 -0.04127293825149536,
 -0.03715188056230545,
 -0.04181300103664398,
 0.004717599134892225,

In [11]:
print(len(X[0]), y[0])

384 1


In [12]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# # Convert data types

if y_train.dtype == object:
    y_train = y_train.astype(float)

# X_train = X_train.astype('float32')
# y_train = y_train.astype('int32')
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(2402, 384) (2402,) (601, 384) (601,)


In [13]:
print("Data type of X_train:", X_train.dtype)
print("Data type of y_train:", y_train.dtype)

Data type of X_train: float64
Data type of y_train: int64


In [14]:
print("NaN in X_train:", np.any(np.isnan(X_train)))
print("NaN in y_train:", np.any(np.isnan(y_train)))
print("Infinite in X_train:", np.all(np.isfinite(X_train)))
print("Infinite in y_train:", np.all(np.isfinite(y_train)))

NaN in X_train: False
NaN in y_train: False
Infinite in X_train: True
Infinite in y_train: True


In [15]:
# Model definition
model = Sequential(
    # [
    #     Dense(384, activation="relu", input_shape=(X_train.shape[1],)),
    #     Dropout(0.2),
    #     Dense(64, activation="relu"),
    #     Dense(1, activation="sigmoid"),
    # ]
    [
        Dense(384, activation="relu", input_shape=(X_train.shape[1],)),
        Dense(224, activation="relu"),
        Dropout(0.4),
        Dense(1, activation="sigmoid"),
    ]
)

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [16]:
# Model training
model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7fc6b5d95210>

In [17]:
# Save
import datetime

formatted_time = datetime.datetime.now().strftime(f"%Y%m%d_%H%M")
model.save(f"model-{formatted_time}-{TRAIN_CUTOFF}.keras")

In [18]:
# Evaluation
predictions = model.predict(X_test) > 0.5
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97       567
           1       0.29      0.12      0.17        34

    accuracy                           0.93       601
   macro avg       0.62      0.55      0.57       601
weighted avg       0.91      0.93      0.92       601



In [19]:
def get_new_data(database_path):
    # Create a database connection
    conn = sqlite3.connect(database_path)
    cursor = conn.cursor()
    cursor.row_factory = sqlite3.Row

    # SQL query to find the last occurrence of "interested" = 1
    query = f"""
    SELECT paper_id, concise_summary FROM papers
    ORDER BY paper_id ASC
    LIMIT 2000 OFFSET {TRAIN_CUTOFF};
    """

    try:
        cursor.execute(query)
        articles = cursor.fetchall()
        if articles:
            print(f"Got {len(articles)}.")
            return articles
        else:
            print("No interested entries found.")
            return nil
    except sqlite3.Error as e:
        print("Database error:", e)
    finally:
        conn.close()

In [20]:
new_ids = get_new_data(db)

new_ids[0]["paper_id"]

Got 2000.


'http://arxiv.org/abs/2404.04234v2'

In [21]:
# Print the predicted articles

new_x = []
formatted = []
for id in new_ids:
    new_x.append(get_embedding(id["paper_id"]))

new_preds = model.predict(new_x) > 0.5



In [22]:
for i in range(len(new_preds)):
    if new_preds[i] == True:
        paper_id = new_ids[i]["paper_id"]
        summary = new_ids[i]["concise_summary"]
        print(f"{paper_id}: {new_preds[i]}\n{summary}")
        formatted.append({"id": paper_id, "summary": summary})

http://arxiv.org/abs/2404.04286v1: [ True]
Advancements in Large Language Models (LLMs) are leading to increased iterative interactions between models, with multi-round self-improving methods allowing for new example generation. By drawing parallels between LLM behavior and human cultural evolution using a Bayesian framework like Iterated Learning (IL), researchers aim to predict and guide the evolution of LLMs towards desired outcomes based on experimental verification.
http://arxiv.org/abs/2404.04298v1: [ True]
The study investigates whether Language Models (LLMs) can enhance their performance by refining previous outputs. Despite introducing a framework to evaluate generative and discriminative abilities, the experimental analysis of various LLMs suggests that they do not consistently excel in discrimination over initial generation, which could impact the advancement of self-improving AI systems.
http://arxiv.org/abs/2404.04361v1: [ True]
The study examines Large Language Models' (L

In [23]:
# for i in range(len(new_preds)):
#     if new_preds[i] == True:
#         print(f'{new_ids[i]["paper_id"]}')

print(formatted)

[{'id': 'http://arxiv.org/abs/2404.04286v1', 'summary': 'Advancements in Large Language Models (LLMs) are leading to increased iterative interactions between models, with multi-round self-improving methods allowing for new example generation. By drawing parallels between LLM behavior and human cultural evolution using a Bayesian framework like Iterated Learning (IL), researchers aim to predict and guide the evolution of LLMs towards desired outcomes based on experimental verification.'}, {'id': 'http://arxiv.org/abs/2404.04298v1', 'summary': 'The study investigates whether Language Models (LLMs) can enhance their performance by refining previous outputs. Despite introducing a framework to evaluate generative and discriminative abilities, the experimental analysis of various LLMs suggests that they do not consistently excel in discrimination over initial generation, which could impact the advancement of self-improving AI systems.'}, {'id': 'http://arxiv.org/abs/2404.04361v1', 'summary':

In [147]:
# Retrieve article titles

import sys

# Add the parent directory to the Python path
sys.path.insert(0, "/home/woojay/P/ML/arxiver")

from arxiver.database import create_connection

conn = create_connection("../data/arxiv_papers.db")

if conn is not None:
    cursor = conn.cursor()

    for i in range(len(new_preds)):
        if new_preds[i] == True:
            # Fetch the specific entry
            cursor.execute(
                "SELECT paper_id, title, summary, concise_summary FROM papers WHERE paper_id = ?",
                (new_ids[i]["paper_id"],),
            )
            entry = cursor.fetchone()

            if not entry:
                conn.close()
                raise HTTPException(status_code=404, detail="Paper not found")

            paper_id, title, summary, concise_summary = entry

            print(f"{paper_id}: {title}")

http://arxiv.org/abs/2404.04298v1: SELF-[IN]CORRECT: LLMs Struggle with Refining Self-Generated Responses
http://arxiv.org/abs/2404.04361v1: Deciphering Political Entity Sentiment in News with Large Language
  Models: Zero-Shot and Few-Shot Strategies
http://arxiv.org/abs/2404.04540v1: The Case for Developing a Foundation Model for Planning-like Tasks from
  Scratch
http://arxiv.org/abs/2404.04570v1: A Map of Exploring Human Interaction patterns with LLM: Insights into
  Collaboration and Creativity
http://arxiv.org/abs/2404.04750v2: Now, Later, and Lasting: Ten Priorities for AI Research, Policy, and
  Practice
http://arxiv.org/abs/2404.04821v1: A Data-to-Product Multimodal Conceptual Framework to Achieve Automated
  Software Evolution for Context-rich Intelligent Applications
http://arxiv.org/abs/2404.04834v1: LLM-Based Multi-Agent Systems for Software Engineering: Vision and the
  Road Ahead
http://arxiv.org/abs/2404.04854v1: Contextual Chart Generation for Cyber Deception
http://ar

In [None]:
# Ask openAI to pick the best articles:

import json

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
client = OpenAI()


def choose_summaries(summaries, k):
    try:
        response = client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert summarizer capable of distilling complex information into its essence and a skilled evaluator of cutting edge ideas. Your choices should be based on the most interesting, novel, and cutting edge ideas.",
                },
                {
                    "role": "user",
                    "content": f"From the following article summaries, pick the {k} most interesting, novel, and cutting edge ideas and return a json list with 'id' and 'summary' for each. The id should contain the article id. You may also include a 'reason' for each choice.: {summaries}",
                },
            ],
            max_tokens=4096,
            temperature=0.0,
        )
        print(response.choices[0].message.content)
        response_content = (
            response.choices[0]
            .message.content.strip("`")
            .strip()
            .removeprefix("json\n")
        )

        # Debugging
        # print("Raw response content:", response_content)

        if response_content:
            parsed_response = json.loads(response_content)
            return parsed_response
        else:
            print("Response content is empty.")
            return []

    except json.JSONDecodeError as e:
        print("Failed to decode JSON:", e)
        return []
    except Exception as e:
        print("An error occurred:", e)
        return []

In [None]:
picks = choose_summaries(formatted, 5)

In [None]:
picks

In [None]:
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2

new_model = Sequential(
    [
        Dense(
            320,
            activation="relu",
            input_shape=(X_train.shape[1],),
            kernel_regularizer=l2(0.001),
        ),
        Dropout(0.0),
        BatchNormalization(),
        Dense(224, activation="relu", kernel_regularizer=l2(0.001)),
        Dropout(0.4),
        Dense(1, activation="sigmoid"),
    ]
)

new_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Model training
model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2)

In [None]:
# Print the predicted articles

new_x = []
formatted = []
for id in new_ids:
    new_x.append(get_embedding(id["paper_id"]))

new_preds = model.predict(new_x) > 0.5

In [None]:
for i in range(len(new_preds)):
    if new_preds[i] == True:
        paper_id = new_ids[i]["paper_id"]
        summary = new_ids[i]["concise_summary"]
        print(f"{paper_id}: {new_preds[i]}\n{summary}")
        formatted.append({"id": paper_id, "summary": summary})