## Import JSON data 


### JSON Schema
```json
[
  {
    "prompt_token_len": 50,
    "prompt_sent_len": 3,
    "prompt": "Answer as if you were the medical director of an anesthesia service. Develop a preoperatively testing algorithm reported in a chart format. Select specific tests that are required for each test based on both particular medical diagnoses as well as by the type of operation is planned.",
    "completion_token_len": 48,
    "completion_sent_len": 2,
    "completion": "Preoperative Testing Algorithm Patient's Medical Diagnosis: Type of Operation: Recommended Testing: Test Medical Diagnosis Type of Operation --- --- --- Complete Blood Count (CBC) All All Coagulation Panel Bleeding disorders, liver disease All major surgeries, neurosurgery, orthopedic surgeries Electrol",
    "chat_id": "efdfb254-c950-495b-afe7-d6bbae3ba172"
}
]
```

### Weaviate Schema for cleaned data

```python
# Property definitions for the cleaned and formatted data
# (the import script below holds the complete class definition):

{
    "classes": [
      {
        "dataType": [
          "int"
        ],
        "description": "The number of tokens in the prompt",
        "indexFilterable": True,
        "indexSearchable": True,
        "moduleConfig": {
          "text2vec-transformers": {
            "skip": True,
            "vectorizePropertyName": False
          }
        },
        "name": "prompt_token_len"
      },
      {
        "dataType": [
          "int"
        ],
        "description": "The number of sentences in the prompt",
        "indexFilterable": True,
        "indexSearchable": True,
        "moduleConfig": {
          "text2vec-transformers": {
            "skip": True,
            "vectorizePropertyName": False
          }
        },
        "name": "prompt_sent_len"
      },
      {
        "dataType": [
          "text"
        ],
        "description": "The text sent to the API.",
        "indexFilterable": True,
        "indexSearchable": True,
        "moduleConfig": {
          "text2vec-transformers": {
            "skip": True,
            "vectorizePropertyName": False
          }
        },
        "name": "prompt",
        "tokenization": "word"
      },
      {
        "dataType": [
          "int"
        ],
        "description": "The number of tokens in the completion",
        "indexFilterable": True,
        "indexSearchable": True,
        "moduleConfig": {
          "text2vec-transformers": {
            "skip": True,
            "vectorizePropertyName": False
          }
        },
        "name": "completion_token_len"
      },
      {
        "dataType": [
          "int"
        ],
        "description": "The number of sentences in the completion",
        "indexFilterable": True,
        "indexSearchable": True,
        "moduleConfig": {
          "text2vec-transformers": {
            "skip": True,
            "vectorizePropertyName": False
          }
        },
        "name": "completion_sent_len"
      },
      {
        "dataType": [
          "text"
        ],
        "description": "This is the value returned by the API and used to test similarity",
        "indexFilterable": True,
        "indexSearchable": True,
        "moduleConfig": {
          "text2vec-transformers": {
            "skip": False,
            "vectorizePropertyName": False
          }
        },
        "name": "completion",
        "tokenization": "word"
      },
      {
        "dataType": [
          "date"
        ],
        "description": "Just the date_time that the record was made.",
        "indexFilterable": True,
        "indexSearchable": False,
        "moduleConfig": {
          "text2vec-transformers": {
            "skip": True,
            "vectorizePropertyName": False
          }
        },
        "name": "date_time"
      },
      {
        "dataType": [
          "text"
        ],
        "description": "A v4 UUID",
        "indexFilterable": True,
        "indexSearchable": True,
        "moduleConfig": {
          "text2vec-transformers": {
            "skip": True,
            "vectorizePropertyName": False
          }
        },
        "name": "chat_id",
        "tokenization": "word"
      }
    ]
}
```

A single script performs the import: it deletes any existing schema, creates the `DriftBenchmark` class, and batch-loads the cleaned JSON file.

In [None]:
# This script imports the cleaned data into Weaviate using the schema described above.

import weaviate
import json
import uuid

# Start-up banner.
print("Let's a go!")

# Open a client against the local Weaviate instance. `client` is reused by
# every later step of this script.
weaviate_url = "http://localhost:8080"
print("Connecting to weaviate instance on localhost:8080...")
client = weaviate.Client(weaviate_url)
print("Client created")

# Start from a clean slate before the class is (re)created.
# NOTE(review): delete_all() is not limited to DriftBenchmark -- per the
# Weaviate client docs it drops the whole schema; confirm that is acceptable
# if this instance is shared with other data.
print("Deleting all previous schemas")
client.schema.delete_all()

print("Creating new schema")

# ===== add schema =====
def _property(name, data_type, description, *, skip=True,
              searchable=None, tokenization=None):
    """Build one property definition for the DriftBenchmark class.

    Every property is filterable and never vectorizes its own name.

    Args:
        name: Property name.
        data_type: Weaviate data type string (wrapped in a list).
        description: Human-readable description of the property.
        skip: Value of the text2vec-transformers ``skip`` flag.
        searchable: Value for ``indexSearchable``; the key is omitted
            entirely when this is None.
        tokenization: Value for ``tokenization``; the key is omitted
            entirely when this is None.

    Returns:
        A dict describing the property.
    """
    spec = {
        "dataType": [data_type],
        "description": description,
        "indexFilterable": True,
    }
    if searchable is not None:
        spec["indexSearchable"] = searchable
    spec["moduleConfig"] = {
        "text2vec-transformers": {
            "skip": skip,
            "vectorizePropertyName": False,
        }
    }
    spec["name"] = name
    if tokenization is not None:
        spec["tokenization"] = tokenization
    return spec


# Full definition of the DriftBenchmark class. Only `completion` has
# skip=False for the text2vec-transformers module; every other property is
# marked skip=True.
class_obj = {
    "class": "DriftBenchmark",
    "invertedIndexConfig": {
        "bm25": {"b": 0.75, "k1": 1.2},
        "cleanupIntervalSeconds": 60,
        "stopwords": {"additions": None, "preset": "en", "removals": None},
    },
    "moduleConfig": {
        "text2vec-transformers": {
            "model": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
            "options": {"waitForModel": False},
            "poolingStrategy": "masked_mean",
            "vectorizeClassName": False,
        }
    },
    "multiTenancyConfig": {"enabled": False},
    "properties": [
        _property("prompt_token_len", "int",
                  "The number of tokens in the prompt"),
        _property("prompt_sent_len", "int",
                  "The number of sentences in the prompt"),
        _property("prompt", "text", "The text sent to the API.",
                  searchable=True, tokenization="word"),
        _property("completion_token_len", "int",
                  "The number of tokens in the completion"),
        _property("completion_sent_len", "int",
                  "The number of sentences in the completion"),
        _property("completion", "text",
                  "This is the value returned by the API and used to test similarity",
                  skip=False, searchable=True, tokenization="word"),
        _property("date_time", "date",
                  "Just the date_time that the record was made.",
                  searchable=False),
        _property("chat_id", "text", "A v4 UUID",
                  searchable=True, tokenization="word"),
    ],
    "vectorizer": "text2vec-transformers",
}

from datetime import datetime, timezone

# Take a single timezone-aware timestamp (current UTC time converted to the
# local zone). It is echoed here and reused later as the `date_time` value of
# the imported records.
local_time = datetime.now(timezone.utc).astimezone()
print(f"\n{local_time.isoformat()}")

# Register the DriftBenchmark class definition with Weaviate.
client.schema.create_class(class_obj)

# ===== import data =====
from pathlib import Path

# Directory holding the cleaned JSON exports, relative to the current working
# directory. Built with pathlib instead of hard-coded backslashes so the
# script also runs on non-Windows systems.
cleanDataDir = Path("..") / "data" / "clean"

# start small
cleanJsonFilePath = cleanDataDir / "ShareGptChatPairs_dev_32_cleaned_formatted.json"

# then do the big file
# cleanJsonFilePath = cleanDataDir / "ShareGptChatPairs_3330_cleaned_formatted.json"

# Parse the whole file into `testData` (expected to be a list of record
# dicts, as shown in the JSON schema section). The context manager closes the
# file even if parsing fails, and the explicit UTF-8 encoding avoids
# depending on the platform's default codec.
with open(cleanJsonFilePath, encoding="utf-8") as testJson:
    testData = json.load(testJson)

# Configure a batch process

# Numeric fields declared in the schema; copied through when a record has them
# so those properties are not left empty in Weaviate.
lengthFields = (
    "prompt_token_len",
    "prompt_sent_len",
    "completion_token_len",
    "completion_sent_len",
)

# Every record of this run shares the same import timestamp, so format it once.
importTimestamp = local_time.isoformat()

with client.batch(
    batch_size=100
) as batch:
    # Batch import all records
    for d in testData:
        properties = {
            "prompt": d["prompt"],
            "completion": d["completion"],
            "date_time": importTimestamp,
            # Keep the record's own chat_id so it stays traceable to the source
            # data; only mint a new v4 UUID when the record has none. str() gives
            # the bare UUID text -- json.dumps(..., default=str) would wrap it in
            # literal double quotes.
            "chat_id": d.get("chat_id") or str(uuid.uuid4()),
        }
        for field in lengthFields:
            if field in d:
                properties[field] = d[field]

        # Queue the object on the batch handle bound by the `with` statement.
        batch.add_data_object(
            properties,
            "DriftBenchmark",
        )

print("Completed import of cleaned data to weaviate.")
