In [4]:
!pip install weaviate-client==4.6.3

Collecting weaviate-client==4.6.3
  Using cached weaviate_client-4.6.3-py3-none-any.whl.metadata (3.3 kB)
Collecting httpx<=0.27.0,>=0.25.0 (from weaviate-client==4.6.3)
  Using cached httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting validators==0.28.1 (from weaviate-client==4.6.3)
  Using cached validators-0.28.1-py3-none-any.whl.metadata (3.6 kB)
Collecting authlib<2.0.0,>=1.2.1 (from weaviate-client==4.6.3)
  Downloading Authlib-1.3.2-py2.py3-none-any.whl.metadata (3.9 kB)
Collecting grpcio-tools<2.0.0,>=1.57.0 (from weaviate-client==4.6.3)
  Downloading grpcio_tools-1.67.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.3 kB)
Collecting grpcio-health-checking<2.0.0,>=1.57.0 (from weaviate-client==4.6.3)
  Downloading grpcio_health_checking-1.67.1-py3-none-any.whl.metadata (1.1 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-health-checking<2.0.0,>=1.57.0->weaviate-client==4.6.3)
  Downloading protobuf-5.28.3-cp38-abi3-manylinux2014_x86_64.whl.m

In [5]:
import os
import weaviate
import weaviate.classes as wvc
import weaviate.classes.config as wc

# Read credentials from environment variables instead of hardcoding them in the
# notebook. Each value falls back to an empty string when its variable is unset,
# so set OPENAI_API_KEY, WEAVIATE_API_KEY and WEAVIATE_CLUSTER_URL before
# starting the kernel.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY", "")
WEAVIATE_CLUSTER_URL = os.environ.get("WEAVIATE_CLUSTER_URL", "")

In [9]:
# API-key credentials for the Weaviate cluster.
weaviate_auth = weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY)

# The OpenAI key is forwarded as a request header; presumably the
# text2vec-openai module uses it to vectorize data and queries (confirm).
openai_headers = {"X-OpenAI-Api-Key": OPENAI_API_KEY}

# Connect to the cluster; the resulting client is reused by the cells below.
client = weaviate.Client(
    url=WEAVIATE_CLUSTER_URL,
    auth_client_secret=weaviate_auth,
    additional_headers=openai_headers,
)

In [11]:
# Remove the "Movies" class from the schema so it can be recreated below.
# NOTE(review): behavior when the class does not exist yet depends on the
# client/server version — confirm this is safe on a fresh cluster.
client.schema.delete_class("Movies")

# Schema definition for the "Movies" class. The vectorizer name is used twice:
# once to select the module and once as the key of its settings.
vectorizer_name = "text2vec-openai"

class_obj = {
    "class": "Movies",
    "vectorizer": vectorizer_name,
    "moduleConfig": {
        vectorizer_name: {"model": "ada", "modelVersion": "002", "type": "text"},
    },
}

In [12]:
# Register the "Movies" class described by class_obj in the cluster schema.
client.schema.create_class(class_obj)

In [15]:
import csv

# Tunables for the import: how many data rows to load, and how many objects
# the client groups into one batch.
MAX_ROWS = 1000
BATCH_SIZE = 100

# Property names in CSV column order. They map to columns 1..25 of each row;
# column 0 is not imported.
PROPERTY_COLUMNS = (
    "tmdb_id",
    "imdb_id",
    "title",
    "original_title",
    "adult",
    "budget",
    "genres",
    "homepage",
    "original_language",
    "overview",
    "popularity",
    "poster_path",
    "production_companies",
    "production_countries",
    "release_date",
    "revenue",
    "runtime",
    "spoken_languages",
    "status",
    "vote_average",
    "vote_count",
    "collection_id",
    "collection_name",
    "collection_poster_path",
    "collection_backdrop_path",
)

current_movie = None  # Last row read; reported if the import fails
row_count = 0  # Number of rows handed to the batch so far

# newline="" is what the csv module expects for file objects, and the explicit
# encoding keeps non-ASCII text intact regardless of the platform default.
# The context manager closes the file even if the import is interrupted.
with open("/content/movies.csv", "r", encoding="utf-8", newline="") as f:
    try:
        with client.batch as batch:
            batch.batch_size = BATCH_SIZE
            reader = csv.reader(f)
            next(reader, None)  # Skip the header row; tolerate an empty file

            for movie in reader:
                if row_count >= MAX_ROWS:  # Stop after MAX_ROWS data rows
                    break

                current_movie = movie
                # Index each column explicitly (rather than zip) so a row with
                # too few columns still raises IndexError instead of being
                # stored with missing properties.
                properties = {
                    name: movie[column]
                    for column, name in enumerate(PROPERTY_COLUMNS, start=1)
                }

                # Queue the movie for insertion into the "Movies" class.
                batch.add_data_object(data_object=properties, class_name="Movies")
                row_count += 1
    except Exception as e:
        # Best-effort import: report the failure and where it happened rather
        # than aborting the notebook.
        print(f"Exception: {e}.")
        print(f"Stopped after {row_count} rows; last row read: {current_movie}")

# The client is intentionally left open: the query cell below reuses it.


In [23]:
# NOTE(review): AdditionalConfig and Timeout are not used in this cell; the
# import is kept in case later cells rely on it.
from weaviate.classes.init import AdditionalConfig, Timeout
import json

# Semantic search: concepts passed to the near-text operator.
nearText = {
    "concepts": ["murder", "crime", "mystery", "detective", "spy"]
}

# Properties to return for every matching "Movies" object.
movie_fields = [
    "tmdb_id",
    "imdb_id",
    "title",
    "original_title",
    "adult",
    "budget",
    "genres",
    "homepage",
    "original_language",
    "overview",
    "popularity",
    "poster_path",
    "production_companies",
    "production_countries",
    "release_date",
    "revenue",
    "runtime",
    "spoken_languages",
    "status",
    "vote_average",
    "vote_count",
    "collection_id",
    "collection_name",
    "collection_poster_path",
    "collection_backdrop_path",
]

# Run the near-text query against "Movies", limited to 10 results.
response = (
    client.query.get("Movies", movie_fields)
    .with_near_text(nearText)
    .with_limit(10)
    .do()
)

# ensure_ascii=False prints non-ASCII text (e.g. original titles) as readable
# characters instead of \uXXXX escape sequences.
print(json.dumps(response, indent=4, ensure_ascii=False))

{
    "data": {
        "Get": {
            "Movies": [
                {
                    "adult": "False",
                    "budget": "0",
                    "collection_backdrop_path": "",
                    "collection_id": "",
                    "collection_name": "",
                    "collection_poster_path": "",
                    "genres": "Crime|Drama",
                    "homepage": "null",
                    "imdb_id": "tt0119250",
                    "original_language": "ja",
                    "original_title": "\u306f\u306a\u3073",
                    "overview": "A police officer leaves the force in the face of harrowing personal and professional difficulties. Spiraling into a depression, he makes questionable decisions.",
                    "popularity": "5.044722",
                    "poster_path": "/n0aDY2SIASdgh4g1FJYSH3IUYzz.jpg",
                    "production_companies": "Bandai Visual Company|Office Kitano|TV Tokyo|Tokyo FM Broadcasting Co.",