### Data Ingestion with LangChain4j
<img src="https://docs.langchain4j.dev/img/logo.svg" alt="LangChain4J" width="200" height="200">

In [1]:
%use dataframe
%useLatestDescriptors

import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
import dev.langchain4j.data.segment.TextSegment
import dev.langchain4j.store.embedding.EmbeddingStore
import dev.langchain4j.store.embedding.pgvector.PgVectorEmbeddingStore
import java.util.Arrays

In [3]:

// Connection settings for the PostgreSQL instance on localhost. Later cells use
// them for the pgvector embedding store and for the JDBC URL.
val host: String = "localhost"
val port: Int = 5431
val user: String = "user"
val password: String = "password"

// Jackson mapper with the Kotlin module registered; used to parse the
// ingredient lists from the recipe CSV.
val mapper = jacksonObjectMapper()


### Create or Split Documents

In [1]:
import org.jetbrains.kotlinx.dataframe.codeGen.generateCode

// Absolute path to the recipe CSV on the author's machine.
// NOTE(review): adjust this path when running the notebook elsewhere.
val recipesCsvPath = "/Users/urs/development/github/ai/kotlin-ai-talk/langchain4j/src/main/resources/food/italian_delaight_dishes.csv"

// Load the CSV into a DataFrame; column types are inferred by the reader.
val df = DataFrame.read(recipesCsvPath)

// NOTE(review): the return value is not bound to anything here — presumably kept
// only to inspect the generated schema code interactively; confirm it is still needed.
df.generateCode()

// Last expression of the cell: renders the first rows as the cell output.
df.head()


Name,Category,Ingredients
Tiramisu,Desserts,"[['Mascarpone', '750g'], ['Eggs', '26..."
Cookies,Desserts,"[['Flour', '195g'], ['Butter', '100g'..."
Pancakes with Maple Syrup,Desserts,"[['Butter', '25g'], ['Flour', '125g']..."
Mascarpone Cream,Desserts,"[['Mascarpone', '500g'], ['Sugar', '1..."
Sweet and Savory Crepes (Basic Recipe),Desserts,"[['Eggs', '3'], ['Flour', '250g'], ['..."


In [5]:
import com.fasterxml.jackson.module.kotlin.readValue
import dev.langchain4j.data.document.*

// Build one LangChain4j Document per recipe row.
//
// The document text is "<Name> <Category> [<ingredient names>]"; the category and
// the raw ingredients string are kept as metadata.
// Rows whose ingredients cannot be parsed are skipped, but each skip is reported
// on stderr instead of being dropped silently, so a short `documents` list can be
// traced back to the offending rows.
val documents = df.map {
    runCatching {
        // The CSV stores ingredients as a list literal with single quotes
        // ([['Flour', '195g'], ...]); swapping the quotes turns it into JSON.
        // An ingredient containing an apostrophe breaks this and ends up in onFailure.
        val ingredientsJson = Ingredients.replace("'", "\"")
        // Each inner list is [name, quantity]; only the name goes into the text.
        val ingredientNames = mapper.readValue<List<List<String>>>(ingredientsJson).map { it[0] }
        val content = """${Name} ${Category} ${ingredientNames}"""
        Document.document(content, Metadata(mapOf("Category" to Category, "Ingredients" to Ingredients)))
    }.onFailure { error ->
        System.err.println("Skipping recipe '${Name}': ${error.message}")
    }.getOrNull()
}.filterNotNull()
documents

[Document { text = "Tiramisu Desserts [Mascarpone, Eggs, Ladyfingers, Sugar, Coffee, Unsweetened cocoa powder]" metadata = {Category=Desserts, Ingredients=[['Mascarpone', '750g'], ['Eggs', '260g'], ['Ladyfingers', '250g'], ['Sugar', '120g'], ['Coffee', '300g'], ['Unsweetened cocoa powder', 'to taste']]} }, Document { text = "Cookies Desserts [Flour, Butter, Baking soda, Eggs, Brown sugar, Sugar, Dark chocolate chips, Fine salt]" metadata = {Category=Desserts, Ingredients=[['Flour', '195g'], ['Butter', '100g'], ['Baking soda', '1 pinch'], ['Eggs', '55g'], ['Brown sugar', '100g'], ['Sugar', '100g'], ['Dark chocolate chips', '200g'], ['Fine salt', '1 pinch']]} }, Document { text = "Pancakes with Maple Syrup Desserts [Butter, Flour, Eggs, Whole milk, Baking powder, Sugar, Maple syrup]" metadata = {Category=Desserts, Ingredients=[['Butter', '25g'], ['Flour', '125g'], ['Eggs', '2'], ['Whole milk', '200g'], ['Baking powder', '6g'], ['Sugar', '15g'], ['Maple syrup', 'to taste']]} }, Document {

### Ingest Documents into EmbeddingStore

In [6]:
import dev.langchain4j.model.embedding.onnx.allminilml6v2.AllMiniLmL6V2EmbeddingModel
import dev.langchain4j.store.embedding.EmbeddingStoreIngestor

// In-process embedding model. It is created first so that the store below can
// take its vector dimension from the model instead of a hand-maintained constant.
val embeddingModel = AllMiniLmL6V2EmbeddingModel()

// pgvector-backed store for the recipe embeddings.
// dropTableFirst(true) drops the "italianfood" table when the store is built,
// so re-running this cell starts from an empty table.
val embeddingStore: EmbeddingStore<TextSegment> =
    PgVectorEmbeddingStore.builder()
        .host(host)
        .port(port)
        .user(user)
        .password(password)
        .database("langchain")
        .table("italianfood")
        // Must equal the length of the vectors the model produces.
        .dimension(embeddingModel.dimension())
        .dropTableFirst(true)
        .build()

// Ingestor that embeds documents with the model and writes them to the store.
val ingestor = EmbeddingStoreIngestor.builder()
    .embeddingModel(embeddingModel)
    .embeddingStore(embeddingStore)
    .build()

In [7]:
// Ingest every document, printing progress every 20 documents.
//
// Progress is reported against the number of documents actually built, not the
// number of CSV rows: rows that failed to parse never became documents, so using
// the row count would overstate what was ingested.
val rows = df.size().nrow
val documentCount = documents.size

documents.forEachIndexed { idx, doc ->
    ingestor.ingest(doc)
    if (idx % 20 == 0) {
        println("Ingested: $idx / $documentCount")
    }
}
println("ingested: $documentCount / $documentCount")

// Make dropped rows visible instead of hiding them behind a matching total.
if (documentCount < rows) {
    println("skipped: ${rows - documentCount} of $rows rows could not be converted to documents")
}

Ingested: 0 / 179
Ingested: 20 / 179
Ingested: 40 / 179
Ingested: 60 / 179
Ingested: 80 / 179
Ingested: 100 / 179
Ingested: 120 / 179
Ingested: 140 / 179
Ingested: 160 / 179
ingested: 179 / 179


### ...and now we can do: Similarity Search!

In [8]:
import org.jetbrains.kotlinx.dataframe.codeGen.generateInterfaces

// Table the embeddings were written to (same name the embedding store was built with).
val tableName = "italianfood"

// JDBC connection to the "langchain" database, reusing the notebook's host/port/credentials.
val url = "jdbc:postgresql://$host:$port/langchain"
val dbConfig = DatabaseConfiguration(url, user, password)

// Read at most this many rows of the table for the preview.
val previewRowLimit = 100
val dbDf = DataFrame.readSqlTable(dbConfig, tableName, previewRowLimit)

// Show only the stored text and its embedding vector.
dbDf.select("text", "embedding")

text,embedding
"Tiramisu Desserts [Mascarpone, Eggs, ...","[-0.0028785171,-0.07107066,-0.0042027..."
"Cookies Desserts [Flour, Butter, Baki...","[-0.029691242,-0.005058944,0.04953179..."
Pancakes with Maple Syrup Desserts [B...,"[-0.027092937,-0.050008707,0.02202043..."
Mascarpone Cream Desserts [Mascarpone...,"[0.05660126,-0.03724398,0.05349223,0...."
Sweet and Savory Crepes (Basic Recipe...,"[-0.009003175,-0.03619684,0.05490666,..."
Lasagna Bolognese Main courses [Remil...,"[0.0011588035,-0.032416902,-0.0175443..."
Bechamel Sauce Sauces and Gravies [Wh...,"[-0.038078986,-0.032063548,0.05718103..."
"Pizza Dough Leavened [Manitoba flour,...","[-0.0479692,0.055703554,0.033998877,0..."
Gingerbread Cookies Desserts [Ground ...,"[-0.04603928,-0.037240695,0.050910093..."
"Red Velvet Cake Desserts [Butter, Sug...","[-0.019261664,-0.050559748,0.06313152..."


In [10]:
// Search parameters: return at most maxResults rows whose score is at least minScore.
val maxResults = 5
val minScore = 0.8

// Free-text query; it is embedded with the same model that embedded the documents.
val prompt = "a dish with mozarella cheese, pasta, sea food and tomato saus"
val promptEmbedding = embeddingModel.embed(prompt).content().vector()
// Rendered as "[v1, v2, ...]" so it can be placed into the SQL text as a vector literal.
val promptAsVector = java.util.Arrays.toString(promptEmbedding)

// `<=>` is pgvector's cosine *distance* operator (not similarity): smaller means closer.
// (2 - distance) / 2 rescales it into a score where higher means more similar.
// pgvector also provides `<->` (L2 distance), `<#>` (negative inner product) and `<+>` (L1 distance).
// The placeholders are, in order: query vector, table name, minimum score, row limit.
val queryTemplate = """SELECT * from (WITH temp AS (SELECT (2 - (embedding <=> '%s')) / 2 AS score, embedding_id, embedding, text, metadata FROM %s) SELECT * FROM temp WHERE score >= %s ORDER BY score desc LIMIT %s) as result"""
val query = queryTemplate.format(promptAsVector, tableName, minScore, maxResults)

// Run the query and show the matched text with its score.
DataFrame.readSqlQuery(dbConfig, query).select("text", "score")



text,score
Mozzarella in carrozza Antipasti [Whi...,806143
Seafood Paella Unique Dishes [Bomba R...,802768
