In [36]:
import json
import sqlite3

import chromadb
import keras
import numpy as np
from chromadb.utils import embedding_functions
from keras_tuner import RandomSearch
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

In [37]:
# Maximum number of papers (lowest paper_id first) that get_data pulls from the database.
# NOTE(review): the trailing numbers look like cutoffs used in earlier runs — confirm.
TRAIN_CUTOFF = 3003  # 2711 2424 2155 1880 1572 1016 817 502 260

In [38]:
# Path to the SQLite database that get_data reads the `papers` table from.
db = "../data/arxiv_papers.db"
# NOTE(review): find_last_interested_entry is not defined in the visible part of this
# notebook; the two lines below look like leftover debugging and are kept disabled.
# last_interested = find_last_interested_entry(db)
# print(json.dumps(last_interested, indent=4))

In [39]:
def get_data(database_path, limit=None):
    """Load the first papers, ordered by ascending ``paper_id``, from SQLite.

    Args:
        database_path: Path to the SQLite file containing the ``papers`` table.
        limit: Maximum number of rows to fetch. When omitted, the notebook-level
            ``TRAIN_CUTOFF`` constant is used.

    Returns:
        A list of ``sqlite3.Row`` objects exposing ``paper_id``,
        ``concise_summary`` and ``interested`` by name, or ``None`` when the
        query yields no rows or a database error occurs (the error is printed).
    """
    if limit is None:
        limit = TRAIN_CUTOFF

    conn = sqlite3.connect(database_path)
    # Row objects let callers index columns by name, e.g. row["paper_id"].
    conn.row_factory = sqlite3.Row

    # The row limit is bound as a parameter instead of being formatted into the SQL text.
    query = """
    SELECT paper_id, concise_summary, interested FROM papers
    ORDER BY paper_id ASC
    LIMIT ?;
    """

    try:
        articles = conn.execute(query, (limit,)).fetchall()
    except sqlite3.Error as e:
        print("Database error:", e)
        return None
    finally:
        # Fetched rows stay usable after the connection is closed.
        conn.close()

    if articles:
        print(f"Got {len(articles)}.")
        return articles

    print("No interested entries found.")
    return None

In [40]:
# Chroma collections opened so far, keyed by store path, so that repeated lookups
# reuse one client and embedding function instead of rebuilding them on every call.
_EMBEDDING_COLLECTIONS = {}


def get_embedding(paper_id, vdb_path="../data/arxiv_embeddings.chroma"):
    """Fetch the stored embedding vector for one paper from the Chroma store.

    Args:
        paper_id: Id of the record to look up in the ``arxiver`` collection.
        vdb_path: Path of the persistent Chroma store.

    Returns:
        The first embedding returned by the collection for ``paper_id``.

    Raises:
        IndexError: If the collection holds no embedding for ``paper_id``.
    """
    vectors = _EMBEDDING_COLLECTIONS.get(vdb_path)
    if vectors is None:
        vdb = chromadb.PersistentClient(vdb_path)
        embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )
        vectors = vdb.get_or_create_collection(
            name="arxiver", embedding_function=embedding_func
        )
        _EMBEDDING_COLLECTIONS[vdb_path] = vectors

    res = vectors.get(ids=[paper_id], limit=1, include=["embeddings"])
    embeddings = res["embeddings"]
    # len() rather than truthiness: the result may be an array, whose truth value is ambiguous.
    if embeddings is None or len(embeddings) == 0:
        raise IndexError(
            f"No embedding stored for paper_id {paper_id!r} in {vdb_path!r}"
        )
    return embeddings[0]

In [41]:
# Pull the paper rows, then pair every stored embedding with its "interested" label.
articles = get_data(db)
X_article = [get_embedding(row["paper_id"]) for row in articles]
y_article = [row["interested"] for row in articles]

# Stack into arrays: one row of X per paper, one entry of y per label.
X, y = np.array(X_article), np.array(y_article)

print(X.shape, y.shape)

Got 3003.
(3003, 384) (3003,)


In [42]:
# Width of a single embedding vector (second axis of the feature matrix).
print(X.shape[1])

384


In [43]:
# Hold out 20% of the papers for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [44]:
def build_model(hp):
    """Assemble and compile the binary classifier searched by the tuner.

    The network is two ReLU hidden layers, each followed by dropout, ending in a
    single sigmoid unit. Layer widths, dropout rates and the Adam learning rate
    are drawn from ``hp``; the input width is taken from the notebook-level
    ``X_train``.

    Args:
        hp: Hyperparameter container providing ``Int``, ``Float`` and ``Choice``.

    Returns:
        The compiled model (binary cross-entropy loss, accuracy metric).
    """
    # Sample the search space up front, in the same order the layers consume it.
    first_units = hp.Int("units1", min_value=32, max_value=512, step=32)
    first_drop = hp.Float("dropout1", min_value=0.0, max_value=0.5, step=0.1)
    second_units = hp.Int("units2", min_value=32, max_value=256, step=32)
    second_drop = hp.Float("dropout2", min_value=0.0, max_value=0.5, step=0.1)
    step_size = hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])

    n_features = X_train.shape[1]
    network = Sequential(
        [
            Dense(units=first_units, activation="relu", input_shape=(n_features,)),
            Dropout(rate=first_drop),
            Dense(units=second_units, activation="relu"),
            Dropout(rate=second_drop),
            Dense(1, activation="sigmoid"),
        ]
    )

    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate=step_size),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return network

In [45]:
# Random search over build_model's hyperparameters, ranked by validation accuracy.
# Results are stored under my_dir/arxiv_article_embeddings; the recorded output of this
# cell shows an existing tuner state in that directory being reloaded.
tuner = RandomSearch(
    hypermodel=build_model,
    objective="val_accuracy",
    max_trials=80,
    executions_per_trial=2,
    project_name="arxiv_article_embeddings",
    directory="my_dir",
)

Reloading Tuner from my_dir/arxiv_article_embeddings/tuner0.json


In [46]:
# EarlyStopping is imported in the notebook's top import cell.
# Stop a trial once validation loss has gone 5 epochs without improving.
stop_early = EarlyStopping(monitor="val_loss", patience=5)

# Each trial holds out 20% of the training split for validation.
tuner.search(X_train, y_train, epochs=50, validation_split=0.2, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

In [47]:
# Build the model with the optimal hyperparameters and train it on the data
new_model = tuner.hypermodel.build(best_hps)
new_model.fit(X_train, y_train, epochs=50, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7f9e803b5b10>

In [48]:
# Show the winning hyperparameter values chosen by the tuner.
best_hps.values

{'units1': 320,
 'dropout1': 0.0,
 'units2': 224,
 'dropout2': 0.4,
 'learning_rate': 0.001}

In [49]:
# Previously recorded best_hps.values @ 20 iter:

# {'units1': 320,
#  'dropout1': 0.0,
#  'units2': 224,
#  'dropout2': 0.4,
#  'learning_rate': 0.001}

In [50]:
# Build the model with the optimal hyperparameters but with matching input units
better_hps = {
    "units1": 384,
    "dropout1": 0.0,
    "units2": 224,
    "dropout2": 0.4,
    "learning_rate": 0.001,
}
# build() needs a HyperParameters object rather than a plain dict, so copy the
# tuner's best set and overwrite its values with the hand-picked ones above.
# (Previously this cell built from best_hps, leaving better_hps unused.)
better_hp_space = best_hps.copy()
better_hp_space.values.update(better_hps)

better_model = tuner.hypermodel.build(better_hp_space)
better_model.fit(X_train, y_train, epochs=50, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7f9e58062050>