In [1]:
import os
import weaviate
from transformers import AutoTokenizer, AutoModel
import torch
from docx import Document

# Set up connection to Weaviate
client = weaviate.Client("http://203.113.132.109:8080")

# Schema of the "Document" class: each stored object carries a text segment
# ("content_txt") and the name of the file it was read from ("filename").
class_schema = {
    "class": "Document",
    "properties": [
        {"name": "content_txt", "dataType": ["text"]},
        {"name": "filename", "dataType": ["string"]}
    ]
}

# client.schema.get() returns {"classes": [<class definition dict>, ...]}.
# Testing `"Document" in ...["classes"]` compares a string against dicts and is
# never true, so the class would be re-created (and rejected) on every re-run.
# Compare against the class *names* instead.
existing_classes = {
    cls.get("class") for cls in (client.schema.get().get("classes") or [])
}
if class_schema["class"] not in existing_classes:
    client.schema.create_class(class_schema)

# Load the transformer model
model_name = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def vectorize(text):
    """Embed ``text`` as a single vector using the module-level model.

    The text is tokenized with truncation at ``max_length=512``, passed
    through ``model`` with gradient tracking disabled, and the model's
    ``last_hidden_state`` is averaged over dimension 1.

    Returns the first row of the averaged output as a NumPy array.
    """
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    # Inference only: no gradients are needed, and .numpy() requires a tensor
    # that does not track them.
    with torch.no_grad():
        outputs = model(**encoded)
    pooled = outputs.last_hidden_state.mean(dim=1)
    return pooled.numpy()[0]

def read_docx(file_path):
    """Return the text of a .docx file, with paragraphs joined by newlines."""
    paragraphs = Document(file_path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

def process_file(file_path, filename, chunk_size=512):
    """Split a .txt or .docx file into segments and store them in Weaviate.

    The file content is cut into consecutive segments of at most
    ``chunk_size`` characters. Each segment is embedded with ``vectorize``
    and stored as a "Document" object together with ``filename``.

    Args:
        file_path: Path of the file to read.
        filename: Name used to detect the format (by extension, matched
            case-insensitively) and stored alongside each segment.
        chunk_size: Maximum number of characters per segment (default 512).

    Returns:
        None. Files with an unsupported extension are reported on stdout
        and skipped.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    # Compare extensions case-insensitively so names such as "NOTES.TXT" or
    # "Report.DOCX" are not rejected as unsupported.
    lowered_name = filename.lower()
    if lowered_name.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    elif lowered_name.endswith('.docx'):
        content = read_docx(file_path)
    else:
        print(f"File format not supported: {filename}")
        return

    # Fixed-size character windows; the last segment may be shorter, and an
    # empty file yields no segments at all.
    for start in range(0, len(content), chunk_size):
        segment = content[start:start + chunk_size]
        document_data = {
            "content_txt": segment,
            "filename": filename
        }
        client.data_object.create(
            document_data, "Document", vector=vectorize(segment)
        )

# Read and process files
data_dir = "data"
# os.listdir() returns entries in arbitrary order; sort them so the ingestion
# order is reproducible between runs.
for filename in sorted(os.listdir(data_dir)):
    file_path = os.path.join(data_dir, filename)
    # Skip sub-directories and other non-regular entries: a directory whose
    # name ends in .txt/.docx would otherwise be opened as a file and crash.
    if not os.path.isfile(file_path):
        continue
    process_file(file_path, filename)

print("Completed storing text segments into Weaviate.")


File format not supported: .DS_Store
Completed storing text segments into Weaviate.
