### Data Ingestion with LangChain4j
<img src="https://docs.langchain4j.dev/img/logo.svg" alt="LangChain4j" width="200" height="200">

In [None]:
%use dataframe
%useLatestDescriptors

import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
import com.fasterxml.jackson.module.kotlin.readValue
import dev.langchain4j.data.segment.TextSegment
import dev.langchain4j.store.embedding.EmbeddingStore
import dev.langchain4j.store.embedding.pgvector.PgVectorEmbeddingStore
import dev.langchain4j.model.embedding.*
import dev.langchain4j.model.embedding.onnx.allminilml6v2.AllMiniLmL6V2EmbeddingModel
import dev.langchain4j.data.embedding.*
import dev.langchain4j.store.embedding.*
import java.util.Arrays


In [None]:

// Connection settings for the PostgreSQL instance that backs the embedding store.
// NOTE(review): PostgreSQL's default port is 5432 — confirm 5431 matches the local setup.
val host: String = "localhost"
val port: Int = 5431
val user: String = "user"
val password: String = "password"

// Jackson mapper with Kotlin support; used below to parse the Ingredients column.
val mapper = jacksonObjectMapper()


### Embedding Models

In [None]:
import org.jetbrains.kotlinx.dataframe.codeGen.generateCode
// Load the dishes CSV into a DataFrame (the path is relative to the working directory).
val df = DataFrame.read("src/main/resources/food/italian_delaight_dishes.csv")
// NOTE(review): the value returned by generateCode() is not used here — confirm this call is still needed.
df.generateCode()
// Preview the first rows of the data.
df.head()


In [None]:
// Text representation of the first dish: name, category and the first element of every ingredient entry.
val firstDish = df.first().run {
    // Swap single quotes for double quotes so Jackson can parse the column as JSON
    // (presumably the CSV stores a Python-style list literal — confirm against the data).
    val ingredientsJson = Ingredients.replace("'", "\"")
    val ingredientNames = mapper.readValue<List<List<String>>>(ingredientsJson).map { entry -> entry[0] }
    "$Name $Category $ingredientNames"
}
firstDish


In [None]:
// Embedding model shared by all following cells (ONNX-based AllMiniLmL6V2, per its import path).
val embeddingModel = AllMiniLmL6V2EmbeddingModel()

In [None]:

// Embed the sample dish text, unwrap the embedding and show its dimension.
val response = embeddingModel.embed(firstDish)
val embedding: Embedding = response.content()
embedding.dimension()


In [None]:
// Raw vector values of the embedding computed for the sample dish.
embedding.vector()

In [None]:
// Token usage attached to the embed response (may be empty if the model does not report it — TODO confirm).
response.tokenUsage()

### Similarity between Vectors

In [None]:
/**
 * Cosine similarity between the embedding of this string and the embedding of [text2].
 *
 * Both texts are embedded with the notebook-level `embeddingModel`.
 *
 * @param text2 the text to compare this string with
 * @return the value computed by [CosineSimilarity.between] for the two embeddings
 */
infix fun String.similarityWith(text2: String): Double =
    CosineSimilarity.between(
        embeddingModel.embed(this).content(),
        embeddingModel.embed(text2).content(),
    )


In [None]:
// Compare two single-word inputs.
"Seafood" similarityWith "Lobster"

In [None]:
// Compare two short phrases that extend the previous inputs.
"Seafood with Spaghetti" similarityWith "Lobster with Pasta"

### Create Documents to be Vectorized

In [None]:
import com.fasterxml.jackson.module.kotlin.readValue
import dev.langchain4j.data.document.*

// Turn every DataFrame row into a LangChain4j Document.
// The document text is "<Name> <Category> <ingredient names>"; Category and the raw
// Ingredients value are kept as metadata.
// Rows that cannot be converted (e.g. an Ingredients value Jackson cannot parse) are
// skipped, but reported instead of being dropped silently.
val documents = df.map {
    runCatching {
        // Swap single quotes for double quotes so the column can be parsed as JSON.
        val ingredients = Ingredients.replace("'", "\"")
        val ingredientNames = mapper.readValue<List<List<String>>>(ingredients).map { it[0] }
        val content = "$Name $Category $ingredientNames"
        Document.document(content, Metadata(mapOf("Category" to Category, "Ingredients" to Ingredients)))
    }.onFailure { error ->
        println("Skipping dish '$Name': ${error.message}")
    }.getOrNull()
}.filterNotNull()
documents

### Ingest Documents into EmbeddingStore

In [None]:
import dev.langchain4j.model.embedding.onnx.allminilml6v2.AllMiniLmL6V2EmbeddingModel
import dev.langchain4j.store.embedding.EmbeddingStoreIngestor

// PostgreSQL-backed embedding store, configured from the connection settings defined earlier.
val embeddingStore: EmbeddingStore<TextSegment> =
    PgVectorEmbeddingStore.builder()
        .host(host)
        .port(port)
        .user(user)
        .password(password)
        .database("langchain")
        .table("italianfood")
        // Take the vector size from the model instead of hard-coding 384, so the table
        // definition stays correct if the embedding model is swapped.
        .dimension(embeddingModel.dimension())
        // The table is dropped first, so each run of this cell starts from an empty table.
        .dropTableFirst(true)
        .build()

// Ingestor wired with embeddingModel and embeddingStore; used by the ingestion cell below.
val ingestor = EmbeddingStoreIngestor.builder()
    .embeddingModel(embeddingModel)
    .embeddingStore(embeddingStore)
    .build()

In [None]:
// Ingest every prepared document and print progress after each batch of 20.
// Progress is measured against the number of documents rather than the number of
// DataFrame rows, because rows that failed conversion never became documents.
val rows = df.size().nrow
val total = documents.size
documents.forEachIndexed { idx, doc ->
    ingestor.ingest(doc)
    // idx is zero-based, so idx + 1 documents have been ingested at this point.
    val done = idx + 1
    if (done % 20 == 0) {
        println("Ingested: $done / $total")
    }
}
println("ingested: $total / $total documents (from $rows rows)")

### ...and now we can do: Similarity Search!

In [None]:
import org.jetbrains.kotlinx.dataframe.codeGen.generateInterfaces

// Read the ingested rows back through JDBC to inspect what was stored.
val tableName: String = "italianfood"
val url: String = "jdbc:postgresql://$host:$port/langchain"
val dbConfig = DatabaseConfiguration(url, user, password)

// Load the table with a limit of 100 and keep only the text and its embedding.
val dbDf = DataFrame.readSqlTable(dbConfig, tableName, 100)
dbDf.select("text", "embedding")

In [None]:
import org.intellij.lang.annotations.Language

// Search parameters: return at most maxResults rows whose score is at least minScore.
val maxResults = 5
val minScore = 0.7

// Embed the free-text prompt and render the vector as "[v1, v2, ...]" for use in the SQL literal.
val prompt = "a dish with noodles, seafood, mozzarella cheese and tomato sauce"
val promptEmbedding = embeddingModel.embed(prompt).content()
val promptAsVector = promptEmbedding.vector().contentToString()

// `<=>` is pgvector's cosine *distance* operator; (2 - distance) / 2 turns it into a score
// where higher means more similar. pgvector also offers `<->` (L2 distance),
// `<#>` (negative inner product) and `<+>` (L1 distance).
val query = """SELECT * from (WITH temp AS (SELECT (2 - (embedding <=> '%s')) / 2 AS score, embedding_id, embedding, text, metadata FROM %s) SELECT * FROM temp WHERE score >= %s ORDER BY score desc LIMIT %s) as result"""
    .format(promptAsVector, tableName, minScore, maxResults)

// Run the query and show each matching text with its score.
DataFrame.readSqlQuery(dbConfig, query).select("text", "score")

