In [19]:
import json
import sqlite3

import chromadb
import numpy as np
from chromadb.utils import embedding_functions
from keras_tuner import RandomSearch
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [2]:
TRAIN_CUTOFF = 3003  # 2711 2424 2155 1880 1572 1016 817 502 260

In [3]:
def find_last_interested_entry(database_path):
    # Create a database connection
    conn = sqlite3.connect(database_path)
    cursor = conn.cursor()

    # SQL query to find the last occurrence of "interested" = 1
    query = """
    SELECT paper_id FROM papers 
    WHERE interested = 1 
    ORDER BY paper_id ASC 
    LIMIT 1;
    """

    try:
        cursor.execute(query)
        last_interested = cursor.fetchone()
        if last_interested:
            print("Last interested entry:", last_interested)
            return {"paper_id": last_interested[0]}
        else:
            print("No interested entries found.")
            return nil
    except sqlite3.Error as e:
        print("Database error:", e)
    finally:
        conn.close()

In [4]:
# db = "../data/arxiv_papers.db"
# last_interested = find_last_interested_entry(db)
# print(json.dumps(last_interested, indent=4))

Last interested entry: ('http://arxiv.org/abs/1706.03762v7',)
{
    "paper_id": "http://arxiv.org/abs/1706.03762v7"
}


In [7]:
def get_data(database_path):
    # Create a database connection
    conn = sqlite3.connect(database_path)
    cursor = conn.cursor()
    cursor.row_factory = sqlite3.Row

    query = """
SELECT paper_id, concise_summary, interested FROM papers 
WHERE updated < "2024-06-11T16:01:07Z"
ORDER BY paper_id ASC;
    """

    try:
        cursor.execute(query)
        articles = cursor.fetchall()
        if articles:
            print(f"Got {len(articles)}.")
            return articles
        else:
            print("No interested entries found.")
            return nil
    except sqlite3.Error as e:
        print("Database error:", e)
    finally:
        conn.close()

In [5]:
# def get_data(database_path):
#     # Create a database connection
#     conn = sqlite3.connect(database_path)
#     cursor = conn.cursor()
#     cursor.row_factory = sqlite3.Row

#     query = f"""
#     SELECT paper_id, concise_summary, interested FROM papers
#     ORDER BY paper_id ASC
#     LIMIT {TRAIN_CUTOFF};
#     """

#     try:
#         cursor.execute(query)
#         articles = cursor.fetchall()
#         if articles:
#             print(f"Got {len(articles)}.")
#             return articles
#         else:
#             print("No interested entries found.")
#             return nil
#     except sqlite3.Error as e:
#         print("Database error:", e)
#     finally:
#         conn.close()

In [41]:
def get_embedding(paper_id, vdb_path="../data/arxiv_embeddings.chroma"):
    vdb = chromadb.PersistentClient(vdb_path)
    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )
    embedding_func = sentence_transformer_ef
    vectors = vdb.get_or_create_collection(
        name="arxiver", embedding_function=embedding_func
    )

    res = vectors.get(ids=[paper_id], limit=1, include=["embeddings"])
    # print(res)
    # print(res["embeddings"][0])
    # print(f'{res["ids"][0]} {res["embeddings"][0]}')
    return res["embeddings"][0]

In [42]:
db = "../data/arxiv_papers.db"
articles = get_data(db)

Got 31783.


In [10]:
len(articles)

31783

In [11]:
articles = get_data(db)
X_article = []
y_article = []
for article in articles:
    # print(
    #     f'{article["paper_id"]}, {article["interested"]}\n{article["concise_summary"]}'
    # )
    embedding = get_embedding(article["paper_id"])
    interested = article["interested"]

    if np.any(np.isnan(embedding)):
        print(f'{article["paper_id"]} embedding is NaN: {embedding}')
    if len(embedding) == 0:
        print(f'{article["paper_id"]} embedding is empty')
    # if interested.dtype == "object":
    #     print(f'{article["paper_id"]} embedding is object: {interested}')
    X_article.append(embedding)
    y_article.append(interested)

# print(X_article[:3])
# print(y_article[:3])

X = np.array(X_article)
y = np.array(y_article)

# print(X[:1])
# print(y[:1])
print(X.shape, y.shape)

⚠️ It looks like you upgraded from a version below 0.6 and could benefit from vacuuming your database. Run chromadb utils vacuum --help for more information.


Got 31783.
(31783, 384) (31783,)


In [12]:
get_embedding(articles[0]["paper_id"])

array([-2.89059500e-03, -5.98241203e-02, -8.41916818e-03, -1.92166772e-02,
        1.41172642e-02,  3.16830203e-02, -4.35329936e-02,  7.39260986e-02,
        7.04051182e-02, -5.49505092e-02, -1.11630904e-02, -1.03973709e-02,
        4.60814014e-02,  9.84834880e-03, -1.03153214e-02,  1.31718060e-02,
        7.97514745e-04,  2.35291328e-02, -8.07519779e-02, -6.41983896e-02,
        4.06081825e-02,  4.55165133e-02,  3.51549871e-02, -1.11329872e-02,
        3.60924676e-02,  1.44928358e-02, -4.34918292e-02, -7.20196813e-02,
       -3.56623158e-02, -3.54668801e-03, -1.79217290e-03, -1.65253077e-05,
       -3.99428420e-03,  1.15200825e-01, -4.22250777e-02,  2.19556931e-02,
       -8.85772780e-02,  6.36700948e-04, -1.80355087e-03, -4.12729383e-02,
       -3.71518806e-02, -4.18130010e-02,  4.71759913e-03,  2.00797487e-02,
        1.26386300e-01,  1.17579000e-02,  1.30072087e-02,  1.12903332e-02,
        4.14143801e-02,  1.41969565e-02, -9.67511609e-02, -4.20608148e-02,
        1.26446513e-02,  

In [13]:
print(len(X[0]), y[0])

384 1


In [14]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# # Convert data types

if y_train.dtype == object:
    y_train = y_train.astype(float)

# X_train = X_train.astype('float32')
# y_train = y_train.astype('int32')
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(25426, 384) (25426,) (6357, 384) (6357,)


In [15]:
print("Data type of X_train:", X_train.dtype)
print("Data type of y_train:", y_train.dtype)

Data type of X_train: float64
Data type of y_train: int64


In [16]:
print("NaN in X_train:", np.any(np.isnan(X_train)))
print("NaN in y_train:", np.any(np.isnan(y_train)))
print("Infinite in X_train:", np.all(np.isfinite(X_train)))
print("Infinite in y_train:", np.all(np.isfinite(y_train)))

NaN in X_train: False
NaN in y_train: False
Infinite in X_train: True
Infinite in y_train: True


In [21]:
# Model definition
model = Sequential(
    # [
    #     Dense(384, activation="relu", input_shape=(X_train.shape[1],)),
    #     Dropout(0.2),
    #     Dense(64, activation="relu"),
    #     Dense(1, activation="sigmoid"),
    # ]
    # [
    #     Dense(384, activation="relu", input_shape=(X_train.shape[1],)),
    #     Dense(224, activation="relu"),
    #     Dropout(0.4),
    #     Dense(1, activation="sigmoid"),
    # ]
    [
        Input(shape=(X_train.shape[1],)),  # Explicitly define input shape
        Dense(384, activation="relu"),
        Dense(224, activation="relu"),
        Dropout(0.4),
        Dense(1, activation="sigmoid"),
    ]
)

# model.compile(
#     optimizer=keras.optimizers.Adam(learning_rate=0.001),
#     loss="binary_crossentropy",
#     metrics=["accuracy"],
# )

model.compile(
    optimizer=Adam(learning_rate=0.001),  # Ensure using tf.keras.optimizers.Adam
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [23]:
# Model training
model.fit(X_train, y_train, epochs=14, batch_size=32, validation_split=0.2)

Epoch 1/14
[1m636/636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 1.0000 - loss: 2.8144e-07 - val_accuracy: 0.9919 - val_loss: 0.2151
Epoch 2/14
[1m636/636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 1.0000 - loss: 1.7439e-07 - val_accuracy: 0.9919 - val_loss: 0.2188
Epoch 3/14
[1m636/636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 1.0000 - loss: 1.0264e-07 - val_accuracy: 0.9919 - val_loss: 0.2222
Epoch 4/14
[1m636/636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 1.0000 - loss: 6.6862e-08 - val_accuracy: 0.9919 - val_loss: 0.2268
Epoch 5/14
[1m636/636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 1.0000 - loss: 6.4625e-08 - val_accuracy: 0.9917 - val_loss: 0.2295
Epoch 6/14
[1m636/636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 1.0000 - loss: 6.4075e-08 - val_accuracy: 0.9904 - val_loss: 0.2354
Epoc

<keras.src.callbacks.history.History at 0x759e343bab10>

In [25]:
# Save
import datetime

formatted_time = datetime.datetime.now().strftime(f"%Y%m%d_%H%M")
# model.save(f"model-{formatted_time}-{TRAIN_CUTOFF}.keras")
model.save(f"model-{formatted_time}-{len(articles)}.keras")

In [26]:
# Evaluation
predictions = model.predict(X_test) > 0.5
print(classification_report(y_test, predictions))

[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6332
           1       0.00      0.00      0.00        25

    accuracy                           0.99      6357
   macro avg       0.50      0.50      0.50      6357
weighted avg       0.99      0.99      0.99      6357



In [17]:
def get_new_data(database_path):
    # Create a database connection
    conn = sqlite3.connect(database_path)
    cursor = conn.cursor()
    cursor.row_factory = sqlite3.Row

    # SQL query to find the last occurrence of "interested" = 1
    query = f"""
    SELECT paper_id, concise_summary FROM papers
    ORDER BY paper_id ASC
    LIMIT 2000 OFFSET {TRAIN_CUTOFF};
    """

    try:
        cursor.execute(query)
        articles = cursor.fetchall()
        if articles:
            print(f"Got {len(articles)}.")
            return articles
        else:
            print("No interested entries found.")
            return nil
    except sqlite3.Error as e:
        print("Database error:", e)
    finally:
        conn.close()

In [37]:
def get_new_data_since(database_path, since):
    # Create a database connection
    conn = sqlite3.connect(database_path)
    cursor = conn.cursor()
    cursor.row_factory = sqlite3.Row

    # SQL query to find the last occurrence of "interested" = 1
    query = f"""
    SELECT *
    FROM papers
    WHERE updated > "{since}"
    ORDER BY updated ASC
    LIMIT 2000;
    """

    try:
        cursor.execute(query)
        articles = cursor.fetchall()
        if articles:
            print(f"Got {len(articles)}.")
            return articles
        else:
            print("No interested entries found.")
            return nil
    except sqlite3.Error as e:
        print("Database error:", e)
    finally:
        conn.close()

In [38]:
new_ids = get_new_data_since(db, "2024-06-11T16:01:07Z")

new_ids[0]["paper_id"]

Got 2000.


'http://arxiv.org/abs/2406.07395v1'

In [45]:
new_ids[0].keys()

['paper_id', 'title', 'summary', 'updated', 'concise_summary', 'interested']

In [46]:
embedding = get_embedding(id["paper_id"])
print(embedding.shape)

(384,)


In [39]:
# new_ids = get_new_data(db)

# new_ids[0]["paper_id"]

In [47]:
# Print the predicted articles
# new_x = []
# formatted = []
# for id in new_ids:
#     new_x.append(get_embedding(id["paper_id"]))

# new_preds = model.predict(new_x) > 0.5


# Explanation
# - get_embedding returns a single vector with a shape of (384,) for each paper.
# - Appending these vectors to new_x creates a list of (384,) vectors, which is not the format expected by the model.
# - Converting new_x to a numpy array ensures it has the correct shape (num_samples, 384).

import numpy as np

# Collect embeddings into a list
new_x = [get_embedding(id["paper_id"]) for id in new_ids]

# Convert the list of embeddings to a 2D numpy array
new_x = np.array(new_x)

# Ensure the shape is (num_samples, 384)
print(new_x.shape)  # Should print something like (num_samples, 384)

# Pass to the model for prediction
new_preds = model.predict(new_x) > 0.5

(2000, 384)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [48]:
for i in range(len(new_preds)):
    if new_preds[i] == True:
        paper_id = new_ids[i]["paper_id"]
        summary = new_ids[i]["concise_summary"]
        print(f"{paper_id}: {new_preds[i]}\n{summary}")
        formatted.append({"id": paper_id, "summary": summary})

http://arxiv.org/abs/2406.08477v1: [ True]
Vector representations of users and items are essential for recommender systems, with recent advancements exploring the use of Large Language Models (LLMs) to represent them in a question-and-answer format. By incorporating out-of-vocabulary (OOV) tokens in addition to in-vocabulary tokens, user-item relationships can be better captured and distinguished, leading to improved performance in downstream recommendation tasks compared to existing methods.
http://arxiv.org/abs/2406.08854v1: [ True]
Digital Twins and reinforcement learning are increasingly recognized for their potential in agriculture, with a focus on optimizing decision-making and resource management. Current research explores applications like robotics, greenhouse management, and irrigation systems, highlighting opportunities to enhance farming efficiency and sustainability through integrated technologies.
http://arxiv.org/abs/2406.09519v1: [ True]
The study explores how transforme

In [49]:
# for i in range(len(new_preds)):
#     if new_preds[i] == True:
#         print(f'{new_ids[i]["paper_id"]}')

print(formatted)

[{'id': 'http://arxiv.org/abs/2406.08477v1', 'summary': 'Vector representations of users and items are essential for recommender systems, with recent advancements exploring the use of Large Language Models (LLMs) to represent them in a question-and-answer format. By incorporating out-of-vocabulary (OOV) tokens in addition to in-vocabulary tokens, user-item relationships can be better captured and distinguished, leading to improved performance in downstream recommendation tasks compared to existing methods.'}, {'id': 'http://arxiv.org/abs/2406.08854v1', 'summary': 'Digital Twins and reinforcement learning are increasingly recognized for their potential in agriculture, with a focus on optimizing decision-making and resource management. Current research explores applications like robotics, greenhouse management, and irrigation systems, highlighting opportunities to enhance farming efficiency and sustainability through integrated technologies.'}, {'id': 'http://arxiv.org/abs/2406.09519v1',

In [50]:
# Retrieve article titles

import sys

# Add the parent directory to the Python path
sys.path.insert(0, "/home/woojay/P/ML/arxiver")

from arxiver.database import create_connection

conn = create_connection("../data/arxiv_papers.db")

if conn is not None:
    cursor = conn.cursor()

    for i in range(len(new_preds)):
        if new_preds[i] == True:
            # Fetch the specific entry
            cursor.execute(
                "SELECT paper_id, title, summary, concise_summary FROM papers WHERE paper_id = ?",
                (new_ids[i]["paper_id"],),
            )
            entry = cursor.fetchone()

            if not entry:
                conn.close()
                raise HTTPException(status_code=404, detail="Paper not found")

            paper_id, title, summary, concise_summary = entry

            print(f"{paper_id}: {title}")

http://arxiv.org/abs/2406.08477v1: Improving LLMs for Recommendation with Out-Of-Vocabulary Tokens
http://arxiv.org/abs/2406.08854v1: Current applications and potential future directions of reinforcement
  learning-based Digital Twins in agriculture
http://arxiv.org/abs/2406.09519v1: Talking Heads: Understanding Inter-layer Communication in Transformer
  Language Models
http://arxiv.org/abs/2406.09997v1: Towards Scalable and Versatile Weight Space Learning


In [31]:
# Ask openAI to pick the best articles:

import json

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
client = OpenAI()


def choose_summaries(summaries, k):
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",  # "gpt-4-1106-preview",
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert summarizer capable of distilling complex information into its essence and a skilled evaluator of cutting edge ideas. Your choices should be based on the most interesting, novel, and cutting edge ideas.",
                },
                {
                    "role": "user",
                    "content": f"From the following article summaries, pick the {k} most interesting, novel, and cutting edge ideas and return a json list with 'id' and 'summary' for each. The id should contain the article id. You may also include a 'reason' for each choice.: {summaries}",
                },
            ],
            max_tokens=4096,
            temperature=0.0,
        )
        print(response.choices[0].message.content)
        response_content = (
            response.choices[0]
            .message.content.strip("`")
            .strip()
            .removeprefix("json\n")
        )

        # Debugging
        # print("Raw response content:", response_content)

        if response_content:
            parsed_response = json.loads(response_content)
            return parsed_response
        else:
            print("Response content is empty.")
            return []

    except json.JSONDecodeError as e:
        print("Failed to decode JSON:", e)
        return []
    except Exception as e:
        print("An error occurred:", e)
        return []

In [32]:
picks = choose_summaries(formatted, 5)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


```json
[
    {
        "id": "http://arxiv.org/abs/2404.04286v1",
        "summary": "Advancements in Large Language Models (LLMs) are leading to increased iterative interactions between models, with multi-round self-improving methods allowing for new example generation. By drawing parallels between LLM behavior and human cultural evolution using a Bayesian framework like Iterated Learning (IL), researchers aim to predict and guide the evolution of LLMs towards desired outcomes based on experimental verification.",
        "reason": "Innovative approach of applying human cultural evolution concepts to guide the evolution of Large Language Models."
    },
    {
        "id": "http://arxiv.org/abs/2404.04442v1",
        "summary": "Large Language Models (LLMs) are advancing artificial intelligence by enabling autonomous agents with human-like text skills, poised to revolutionize industries like healthcare and customer service. Despite challenges like multimodality and value alignment, t

In [33]:
picks

[{'id': 'http://arxiv.org/abs/2404.04286v1',
  'summary': 'Advancements in Large Language Models (LLMs) are leading to increased iterative interactions between models, with multi-round self-improving methods allowing for new example generation. By drawing parallels between LLM behavior and human cultural evolution using a Bayesian framework like Iterated Learning (IL), researchers aim to predict and guide the evolution of LLMs towards desired outcomes based on experimental verification.',
  'reason': 'Innovative approach of applying human cultural evolution concepts to guide the evolution of Large Language Models.'},
 {'id': 'http://arxiv.org/abs/2404.04442v1',
  'summary': 'Large Language Models (LLMs) are advancing artificial intelligence by enabling autonomous agents with human-like text skills, poised to revolutionize industries like healthcare and customer service. Despite challenges like multimodality and value alignment, techniques such as prompting and reasoning are being explo

In [26]:
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2

new_model = Sequential(
    [
        Dense(
            320,
            activation="relu",
            input_shape=(X_train.shape[1],),
            kernel_regularizer=l2(0.001),
        ),
        Dropout(0.0),
        BatchNormalization(),
        Dense(224, activation="relu", kernel_regularizer=l2(0.001)),
        Dropout(0.4),
        Dense(1, activation="sigmoid"),
    ]
)

new_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [27]:
# Model training
model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7f0db6155990>

In [28]:
# Print the predicted articles

new_x = []
formatted = []
for id in new_ids:
    new_x.append(get_embedding(id["paper_id"]))

new_preds = model.predict(new_x) > 0.5

INFO:chromadb.api.segment:Collection arxiver is not created.
INFO:chromadb.api.segment:Collection arxiver is not created.
INFO:chromadb.api.segment:Collection arxiver is not created.
INFO:chromadb.api.segment:Collection arxiver is not created.
INFO:chromadb.api.segment:Collection arxiver is not created.
INFO:chromadb.api.segment:Collection arxiver is not created.
INFO:chromadb.api.segment:Collection arxiver is not created.
INFO:chromadb.api.segment:Collection arxiver is not created.
INFO:chromadb.api.segment:Collection arxiver is not created.
INFO:chromadb.api.segment:Collection arxiver is not created.
INFO:chromadb.api.segment:Collection arxiver is not created.
INFO:chromadb.api.segment:Collection arxiver is not created.
INFO:chromadb.api.segment:Collection arxiver is not created.
INFO:chromadb.api.segment:Collection arxiver is not created.
INFO:chromadb.api.segment:Collection arxiver is not created.
INFO:chromadb.api.segment:Collection arxiver is not created.
INFO:chromadb.api.segmen



In [29]:
for i in range(len(new_preds)):
    if new_preds[i] == True:
        paper_id = new_ids[i]["paper_id"]
        summary = new_ids[i]["concise_summary"]
        print(f"{paper_id}: {new_preds[i]}\n{summary}")
        formatted.append({"id": paper_id, "summary": summary})

http://arxiv.org/abs/2404.04286v1: [ True]
Advancements in Large Language Models (LLMs) are leading to increased iterative interactions between models, with multi-round self-improving methods allowing for new example generation. By drawing parallels between LLM behavior and human cultural evolution using a Bayesian framework like Iterated Learning (IL), researchers aim to predict and guide the evolution of LLMs towards desired outcomes based on experimental verification.
http://arxiv.org/abs/2404.04298v1: [ True]
The study investigates whether Language Models (LLMs) can enhance their performance by refining previous outputs. Despite introducing a framework to evaluate generative and discriminative abilities, the experimental analysis of various LLMs suggests that they do not consistently excel in discrimination over initial generation, which could impact the advancement of self-improving AI systems.
http://arxiv.org/abs/2404.04361v1: [ True]
The study examines Large Language Models' (L