In [52]:
import pandas as pd

In [53]:
df = pd.read_pickle('../artifacts/model.pkl')

In [54]:
df = df.head(1000)

In [55]:
from haystack.dataclasses  import Document
documents = []
for index, row in df.iterrows():
    descriptions = row["clean_text3"]

    doc = Document(
        content = row["clean_text3"],
        meta = {
            'category': row['category'],
        }
    )
    documents.append(doc)

In [56]:
documents

[Document(id=036254742d19f051617be9cbe191506630ea27ecc190725928932138a80e0895, content: 'modi promised minimum government maximum governance expected begin difficult job reforming state tak...', meta: {'category': 'Negative'}),
 Document(id=95553ca6dc39e7f5e4586f17f6f21e107fb4ebd0ad4a59bd5ba9c989d8be9f46, content: 'talk nonsense continue drama vote modi', meta: {'category': 'Netral'}),
 Document(id=528ee5cb2f9479030d6b0d462b7b3f3a2f71d8834ca05b7725df80741109db2b, content: 'say vote modi welcome bjp told rahul main campaigner modi think modi relax', meta: {'category': 'Positive'}),
 Document(id=707d9c276fa85cd1aab6673ade6175884329312dfd0dcfc18f75317d94bd46f9, content: 'asking supporters prefix chowkidar names modi great service confusion read crustal clear crass filth...', meta: {'category': 'Positive'}),
 Document(id=9bf67b43da86447f0c1da794a497fa832164593896508d0286f58636993803ef, content: 'answer among powerful world leader today trump putin modi may', meta: {'category': 'Positive'})

In [57]:
import os
import json
from dotenv import load_dotenv
from getpass import getpass
os.environ["MONGO_CONNECTION_STRING"] = getpass("Masukkan MongoDB Connection String Anda: ")


In [58]:
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
pipeline_storing = Pipeline()
from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore
document_store = MongoDBAtlasDocumentStore(
    database_name="final_project",
    collection_name="text",
    vector_search_index="vector_index",
    full_text_search_index="search_index",
)

In [59]:
from haystack import component
from pymongo import InsertOne

@component
class FastMongoBulkWriter:
    def __init__(self, document_store, batch_size: int = 2000):
        self.document_store = document_store
        self.batch_size = batch_size

    @component.output_types(success=bool)
    def run(self, documents: list[Document]):
        # documents: `list` of Haystack Document objects
        if not documents:
            return {"success": False}

        mongo_docs = [doc.to_dict() for doc in documents]

        for i in range(0, len(mongo_docs), self.batch_size):
            batch = mongo_docs[i : i + self.batch_size]
            ops = [InsertOne(d) for d in batch]
            self.document_store.collection.bulk_write(ops, ordered=False)

        return {"success": True}


In [60]:
pipeline = Pipeline()
pipeline.add_component("embedder",SentenceTransformersDocumentEmbedder())
pipeline.add_component("writer",DocumentWriter(document_store=document_store,policy=DuplicatePolicy.OVERWRITE))

pipeline.connect("embedder.documents","writer.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x00000281DDF0DA90>
🚅 Components
  - embedder: SentenceTransformersDocumentEmbedder
  - writer: DocumentWriter
🛤️ Connections
  - embedder.documents -> writer.documents (List[Document])

In [61]:
pipeline.run({
    "embedder":{
        "documents":documents
    }
})

Batches: 100%|██████████| 32/32 [00:34<00:00,  1.07s/it]


{'writer': {'documents_written': 1000}}

## Store Category

In [62]:
from pymongo import MongoClient
import os
client = MongoClient(os.getenv('MONGO_CONNECTION_STRING'))
db = client.final_project
category_collection = db.categories 

In [63]:
category = df['category'].unique().tolist()

In [64]:
document_category = [ {'category' : c } for c in category]

In [65]:
category_collection.insert_many(document_category)

InsertManyResult([ObjectId('692512a68cde5d08fa2778e7'), ObjectId('692512a68cde5d08fa2778e8'), ObjectId('692512a68cde5d08fa2778e9')], acknowledged=True)