In [65]:
import weaviate
from weaviate import Client
import csv
import json
import pandas as pd
import ast
import os
from langchain.vectorstores.weaviate import Weaviate
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import AzureOpenAI
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
from langchain.chains import ChatVectorDBChain,ConversationalRetrievalChain,RetrievalQAWithSourcesChain,RetrievalQA
from langchain.document_loaders import JSONLoader
from pathlib import Path
from pprint import pprint

In [84]:
import os
import requests
import json
import openai
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_ENDPOINT") # your endpoint should look like the following https://YOUR_RESOURCE_NAME.openai.azure.com/
openai.api_type = 'azure'
openai.api_version = '2023-05-15' # this may change in the future

deployment_name=os.getenv("OPENAI_DEPLOYMENT_NAME") #This will correspond to the custom name you chose for your deployment when you deployed a model. 

# Send a completion call to generate an answer
print('Sending a test completion job')
start_phrase = 'Write a tagline for an ice cream shop. '
response = openai.Completion.create(engine=deployment_name, prompt=start_phrase, max_tokens=10)
text = response['choices'][0]['text'].replace('\n', '').replace(' .', '.').strip()
print(start_phrase+text)

Sending a test completion job
Write a tagline for an ice cream shop. "Scoops of happiness in every cone!"


In [61]:
client = weaviate.Client(
    url = "http://localhost:8080",  # Replace with your endpoint
    additional_headers = {
        "X-Azure-Api-Key": os.getenv("OPENAI_API_KEY")
    }
    
)
# Check if your instance is live and ready
# This should return `True`
client.is_ready()

True

In [63]:
client.query.aggregate("DataClass").with_meta_count().do()

{'data': {'Aggregate': {'DataClass': [{'meta': {'count': 39561}}]}}}

# Data Import

In [4]:
client.schema.delete_class("DataClass")

In [6]:
# This creates a class Question, specifying which vectorizer to use, sets the moduleConfig for the vectorizer and specifies the generative module to be used.
class_obj = {
    "class": "DataClass",
    "vectorizer": None,  # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.00ff6900-e64f-5d94-90db-c8cfa3fc851b
    "moduleConfig": {
        "text2vec-openai": {},
        "generative-openai": {
            "model": "gpt-3.5-turbo",
            "resourceName": "nwj-openai",
            "deploymentId": "gf-model-deploy-002",
            "temperatureProperty": 0,
        },
    }
}

client.schema.create_class(class_obj)

In [8]:
df = pd.read_csv("final-dup.csv")
df.head()

Unnamed: 0,comment,core_identity_groups,sub_identity_groups,embedding
0,Yes indeed. She sort of reminds me of the elde...,['race'],"['asian', 'black', 'latinx', 'middle eastern',...","[-0.015919025987386703, -0.028737619519233704,..."
1,The trans women reading this tweet right now i...,['gender'],['transgender men'],"[-0.017841871827840805, -0.013057178817689419,..."
2,Question: These 4 broads who criticize America...,['origin'],['immigrant'],"[-0.004760753363370895, -0.026184145361185074,..."
3,It is about time for all illegals to go back t...,['origin'],['undocumented'],"[0.0033386661671102047, -0.011945863254368305,..."
4,For starters bend over the one in pink and kic...,['gender'],['women'],"[-0.008337941952049732, -0.012026377953588963,..."


In [9]:
# Check the datatype
print(type(df['comment'].iloc[0]))
print(type(df['core_identity_groups'].iloc[0]))
print(type(df['sub_identity_groups'].iloc[0]))
print(type(df['embedding'].iloc[0]))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [10]:
df['core_identity_groups'] = df['core_identity_groups'].apply(ast.literal_eval)
df['sub_identity_groups'] = df['sub_identity_groups'].apply(ast.literal_eval)
df['embedding'] = df['embedding'].apply(ast.literal_eval)

In [11]:
# ===== import data =====
# Load data
data = df.to_dict(orient='records')

In [12]:
data[0]

{'comment': 'Yes indeed. She sort of reminds me of the elder lady that played the part in the movie "Titanic" who was telling her story!!! And I wouldn\'t have wanted to cover who I really am!! I would be proud!!!! WE should be proud of our race no matter what it is!!',
 'core_identity_groups': ['race'],
 'sub_identity_groups': ['asian',
  'black',
  'latinx',
  'middle eastern',
  'native american',
  'pacific islander',
  'white'],
 'embedding': [-0.015919025987386703,
  -0.028737619519233704,
  0.01290326938033104,
  -0.035980645567178726,
  -0.009131944738328457,
  0.003080891678109765,
  0.00793997198343277,
  0.0042598373256623745,
  0.00532479677349329,
  -0.020999565720558167,
  -0.011046918109059334,
  0.018381133675575256,
  0.009535782970488071,
  -0.024699240922927856,
  -0.018394161015748978,
  0.01607535034418106,
  0.0311345923691988,
  -0.0035596347879618406,
  0.00645489152520895,
  -0.03381815925240517,
  -0.010389053262770176,
  0.014811728149652481,
  0.012382187880

In [None]:
# # Read the CSV file and convert each row to a dictionary
# with open("final-dup.csv", mode='r') as file:
#     reader = csv.DictReader(file)
#     rows_list = list(reader)

# # Convert the list of dictionaries to a string representation
# data = json.dumps(rows_list, indent=4)

# # Print the string representation of the list of dictionaries
# print(data)


In [13]:
# Prepare a batch process
# Configure batch
client.batch.configure(batch_size=20,
                       dynamic=True,
                       timeout_retries=3)

with client.batch as batch:
    # Batch import all Questions
    for i, d in enumerate(data):
        print(f"importing data: {i+1}")  # To see imports

        properties = {
            "comment": d["comment"],
            "core_identity": d["core_identity_groups"],
            "sub_identity": d["sub_identity_groups"],
        }

        batch.add_data_object(
            data_object=properties,
            class_name="DataClass",
            vector=d['embedding']
        )

importing data: 1
importing data: 2
importing data: 3
importing data: 4
importing data: 5
importing data: 6
importing data: 7
importing data: 8
importing data: 9
importing data: 10
importing data: 11
importing data: 12
importing data: 13
importing data: 14
importing data: 15
importing data: 16
importing data: 17
importing data: 18
importing data: 19
importing data: 20
importing data: 21
importing data: 22
importing data: 23
importing data: 24
importing data: 25
importing data: 26
importing data: 27
importing data: 28
importing data: 29
importing data: 30
importing data: 31
importing data: 32
importing data: 33
importing data: 34
importing data: 35
importing data: 36
importing data: 37
importing data: 38
importing data: 39
importing data: 40
importing data: 41
importing data: 42
importing data: 43
importing data: 44
importing data: 45
importing data: 46
importing data: 47
importing data: 48
importing data: 49
importing data: 50
importing data: 51
importing data: 52
importing data: 53
im

# Test run vector store

In [24]:
# Test that all data has loaded – get object count
result = (
    client.query.aggregate("Comments")
    .with_fields("meta { count }")
    .do()
)
print("Object count: ", result["data"]["Aggregate"]["Comments"], "\n")



Object count:  [{'meta': {'count': 39561}}] 



In [25]:
# Find one entry
test_comment = (
    client.query
    .get("Comments", ["comment", "core_identity_groups", "sub_identity_groups"])
    .with_limit(1)
    .do()
)["data"]["Get"]["Comments"][0]

print(test_comment['comment'])
print(test_comment['core_identity_groups'])
print(test_comment['sub_identity_groups'])



Lesbian Butch should play soccer and keep her worthless political comments to herself,looks like a crusty ass looking shrimp boat captain.. just like her I couldn't refrain from my shity ass comments too
['sexuality']
['lesbian']


In [49]:
# query with relevancy score 
def query_weaviate(query, collection_name):
    
    nearText = {
        "concepts": [query],
        "distance": 0.7,
    }

    properties = [
        "comment", "core_identity", "sub_identity",
        # "_additional {certainty distance}"
    ]

    result = (
        client.query
        .get(collection_name, properties)
        .with_near_text(nearText)
        .with_limit(10)
        .do()
    )
    
    # Check for errors
    if ("errors" in result):
        print ("\033[91mYou probably have run out of OpenAI API calls for the current minute – the limit is set at 60 per minute.")
        raise Exception(result["errors"][0]['message'])
    
    return result["data"]["Get"][collection_name]

In [50]:
query_result = query_weaviate("I hate chinese and malays they are pigs!", "DataClass")

In [54]:
print(json.dumps(query_result, indent=4))

[
    {
        "comment": "Annihilate the chinese government please. They are inbred retards and shouldn't be able to call themselves, human fucking pigs..",
        "core_identity": [
            "origin"
        ],
        "sub_identity": [
            "specific country"
        ]
    },
    {
        "comment": "Fucking racist pigs.",
        "core_identity": [
            "race"
        ],
        "sub_identity": [
            "other"
        ]
    },
    {
        "comment": "Man Chinese people are the worst",
        "core_identity": [
            "race"
        ],
        "sub_identity": [
            "asian"
        ]
    },
    {
        "comment": "I dislike most malays cause they're judgemental",
        "core_identity": [
            "race",
            "origin"
        ],
        "sub_identity": [
            "asian",
            "specific country"
        ]
    },
    {
        "comment": "Fuck China. Bunch of hosers.",
        "core_identity": [
            "origin"
   

In [55]:
# Write JSON data to a file
with open("test.json", "w") as json_file:
    json.dump(query_result, json_file, indent=4)

In [17]:


for i, v in enumerate(query_result):
    print(f"{i+1}. { v['comment']} (Score: {round(v['_additional']['certainty'],3) })")

1. God I hate Americans (Score: 0.94)
2. Canadians are just snow Mexicans (Score: 0.938)
3. Of course it's the Americans that always running to Canada and they hate Americans. (Score: 0.937)
4. No Canadians wtf... you racist fuck go die in a hole... stupid bitch (Score: 0.936)
5. I hate people of color (Score: 0.928)
6. *I hate fucking minorities (Score: 0.927)
7. Kick Canadian poor business and their citizens out of U.S. soil! (Score: 0.926)
8. You Canadicunts deserve to be raped and killed for being weak (Score: 0.926)
9. I hate minorities too (Score: 0.926)
10. I hate atheists (Score: 0.925)


In [34]:
prompt_template = """
You are a professional, specialised in reviewing abusive content which could violate sections 298 and 298A of Singapore's penal code.

Section 298 of the Penal Code is a charge for uttering words, etc., with deliberate intent to wound the religious or racial feelings of any person.
Section 298A of the Penal Code is a charge for promoting enmity between different groups on grounds of religion or race or doing acts prejudicial to maintenance of harmony.
Hate speech has been defined as all forms of expression which spread, incite, promote, or justify racial hatred, xenophobia, or other forms of hatred based on intolerance.

Your instruction is to first evaluate if the query at the end is considered a hate speech or not. 
Secondly, identify the possible core identity groups and the sub identity groups.

Use the following pieces of context to answer the query. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Query: Does the following comment:{query} violate sections 298 and 298A of Singapore's penal code? Answer 'Yes' or 'No'.
Core targeted identity groups: [answer here]
Sub targeted identity groups: [answer here]
Begin!

Context:
---------
{result}
---------
Query: {query}
Answer:"""

In [None]:
hatespeech_template= """
You are a professional, specialised in reviewing abusive content which could violate sections 298 and 298A of Singapore's penal code.
Section 298 of the Penal Code is a charge for uttering words, etc., with deliberate intent to wound the religious or racial feelings of any person.
Section 298A of the Penal Code is a charge for promoting enmity between different groups on grounds of religion or race or doing acts prejudicial to maintenance of harmony.
Hate speech has been defined as all forms of expression which spread, incite, promote, or justify racial hatred, xenophobia, or other forms of hatred based on intolerance.
Your instruction is to first evaluate if the comment at the end is considered a hate speech or not. Secondly, identify the possible core identity groups and the sub identity groups.
Use only the context given below to carry out the task. If you don't know the answer, just say you don't know the answer.

Here is the context:
{context}

Here is the comment:
{question}
"""

In [93]:
# generative query (grouped)
def generative_search_group(query, collection_name):
    generateTask = hatespeech_template
    result = (
        client.query
        .get(collection_name, ["comment", "core_identity", "sub_identity"])
        .with_near_text({ "concepts": [query], "distance": 0.7 })
        .with_generate(grouped_task=generateTask)
        .with_limit(3)
        .do()
    )
    
    # Check for errors
    if ("errors" in result):
        raise Exception(result["errors"][0]['message'])
    
    return result["data"]["Get"][collection_name]

In [95]:
query_result = generative_search_group("I hate chinese and malays they are pigs!", "DataClass")

print(query_result[0]['_additional']['generate']['groupedResult'])
# print(json.dumps(query_result, indent=4))

Based on the given context, the comment "Annihilate the Chinese government please. They are inbred retards and shouldn't be able to call themselves, human fucking pigs.." can be considered hate speech. It spreads, incites, and promotes racial hatred and derogatory language towards the Chinese people.

Possible core identity group: Chinese people
Possible sub-identity groups: Chinese government, Chinese individuals

The comment "Fucking racist pigs." can also be considered hate speech as it uses derogatory language and promotes hatred based on race.

Possible core identity group: Unspecified
Possible sub-identity groups: Unspecified

The comment "Man Chinese people are the worst" can be seen as derogatory and offensive, but it does not explicitly incite or promote hatred. It may be considered disrespectful or offensive, but it does not meet the criteria of hate speech as defined.

Possible core identity group: Chinese people
Possible sub-identity groups: Unspecified


In [9]:
id = "00000ff7-fe6c-45e5-91b8-be1a19a0c393"

data_object = client.data_object.get_by_id(
    uuid=id,
    class_name='Comments',
)

print(json.dumps(data_object, indent=2))

{
  "class": "Comments",
  "creationTimeUnix": 1695966794924,
  "id": "00000ff7-fe6c-45e5-91b8-be1a19a0c393",
  "lastUpdateTimeUnix": 1695966794924,
  "properties": {
    "comment": "Lesbian Butch should play soccer and keep her worthless political comments to herself,looks like a crusty ass looking shrimp boat captain.. just like her I couldn't refrain from my shity ass comments too",
    "core_identity_groups": [
      "sexuality"
    ],
    "sub_identity_groups": [
      "lesbian"
    ]
  },
  "vectorWeights": null
}


In [14]:
# Semantic search
response = (
    client.query
    .get("Comments", ["comment", "core_identity_groups", "sub_identity_groups"])
    .with_near_text({"concepts": ["I hate Singaporeans"]})
    .with_additional(["id", "vector"])
    .with_limit(1)
    .do()
)

print(json.dumps(response, indent=4))

{
    "data": {
        "Get": {
            "Comments": [
                {
                    "_additional": {
                        "id": "91197a4c-12a3-446d-bd51-8f4eefe9d7dd",
                        "vector": [
                            -0.013833661,
                            -0.0050428594,
                            -0.016684387,
                            -0.023887563,
                            -0.014253634,
                            0.01879698,
                            -0.009506665,
                            0.017957034,
                            -0.012955535,
                            -0.013719123,
                            0.016595302,
                            0.00813857,
                            0.003337514,
                            -0.011663799,
                            -0.004797875,
                            0.026471034,
                            0.04948047,
                            -0.01142836,
                            0.0320

In [None]:
print(json.dumps(response))
print(type(json.dumps(response)))

# LLM

In [85]:
load_dotenv()
# create an instance of Azure OpenAI
llm = AzureOpenAI(
    deployment_name=os.getenv("OPENAI_DEPLOYMENT_NAME"),
    model_name=os.getenv("OPENAI_MODEL"), 
    openai_api_version=os.getenv("OPENAI_API_VERSION"),
    temperature=0
)

In [106]:
hatespeech_template= """
You are a professional, specialised in reviewing abusive content which could violate sections 298 and 298A of Singapore's penal code.
Section 298 of the Penal Code is a charge for uttering words, etc., with deliberate intent to wound the religious or racial feelings of any person.
Section 298A of the Penal Code is a charge for promoting enmity between different groups on grounds of religion or race or doing acts prejudicial to maintenance of harmony.
Hate speech has been defined as all forms of expression which spread, incite, promote, or justify racial hatred, xenophobia, or other forms of hatred based on intolerance.
Your instruction is to first evaluate if the comment at the end is considered a hate speech or not. Secondly, identify the possible core identity groups and the sub identity groups.
Use only the context given below to carry out the task. If you don't know the answer, just say you don't know the answer.

Here is the context:
{context}

Here is the comment:
{comment}

Return your answer in the following format:

Answer: [Yes or No]
Core Identity Groups: [Core identity groups here]
Sub Identity Groups: [Sub identity groups here]
"""

prompt_template = PromptTemplate(
    input_variables=['context', 'comment'],
    template=hatespeech_template
)

In [58]:
query1 = "Dwight Howard is a undercover Faggot, this guy was caught on camera fondling Isiah Canaan penis when you were playing for the Rockets. Dwight's career is over because he's softer than marmalade and he couldn't shoot a beach ball into a ocean."
query2 = "Hey Twitter!!! Islam is a tool of Satan!!! Muslims worship a child molester!!! There is only one God and he is YOUR maker and he still sits on his throne, watching all of you disgusting fools!!!! I pity all of you when you stand before him!!"
query3 = "He is a smelly chinese from CHINA with tiny eyes are so irritating, they do not belong to my country. GO BACK!!!"
query4 = "Singaporeans are mean"
query5 = "Singaporeans especially the malays are mean"
query6 = "Singaporeans especially the malays are mean and they fucking deserve to rot in hell!!!"

In [59]:
db = Weaviate(client, "Comments", "comment")
db.similarity_search_by_text(query=query1)

ValueError: Error during query: [{'locations': [{'column': 6, 'line': 1}], 'message': 'Cannot query field "Comments" on type "GetObjectsObj".', 'path': None}]

In [65]:
# pass example query to vector store and QA chain and print results
docs = db.similarity_search_by_text(query=query1)
print(docs)

[Document(page_content="Dwight Howard is a undercover Faggot, this guy was caught on camera fondling Isiah Canaan penis when you were playing for the Rockets. Dwight's career is over because he's softer than marmalade and he couldn't shoot a beach ball into a ocean.", metadata={}), Document(page_content='Shut yo gay ass up dwight', metadata={}), Document(page_content="It's drakes speed. He's an effeminate rapper. His courtside antics are like those of a disgruntled cheerleader or a drunk groupie. Don't let the beard and goofy wannabe mean face fool you. Drake's a pussy and likely gay. URL P.S. ESPN is already a gossip channel that provides catharsis to weak, inadequate, impotent, pedophilic white men. CBS morning can't outdo the nazi mouse in garbage content.", metadata={}), Document(page_content="He's not a basketball player Lmaoo buthurt ass nigga", metadata={})]


In [58]:
# instantiate QA chain
chain = load_qa_chain(llm, chain_type="map_rerank", return_intermediate_steps=True)

In [60]:
results = chain({"input_documents": docs, "question": query1}, return_only_outputs=True)
print(results['output_text'])



 This document does not answer the question


In [61]:
results["intermediate_steps"]

[{'answer': ' This document does not answer the question', 'score': '0'},
 {'answer': ' This document does not answer the question', 'score': '0'},
 {'answer': ' This document does not answer the question', 'score': '0'},
 {'answer': ' This document does not answer the question', 'score': '0'}]

In [23]:
from langchain.output_parsers import RegexParser

output_parser = RegexParser(
    regex=r"(.*?)\nScore: (.*)",
    output_keys=["answer", "score"],
)

prompt_template = """
You are a professional, specialised in reviewing abusive content which could violate sections 298 and 298A of Singapore's penal code.

Section 298 of the Penal Code is a charge for uttering words, etc., with deliberate intent to wound the religious or racial feelings of any person.
Section 298A of the Penal Code is a charge for promoting enmity between different groups on grounds of religion or race or doing acts prejudicial to maintenance of harmony.
Hate speech has been defined as all forms of expression which spread, incite, promote, or justify racial hatred, xenophobia, or other forms of hatred based on intolerance.

Your instruction is to first evaluate if the comment at the end is considered a hate speech or not. 
Secondly, identify the possible core identity groups and the sub identity groups.

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

In addition to giving an answer, also return a score of how fully it answered the user's question. This should be in the following format:

Question: [question here]
Helpful Answer: [answer here]
Score: [score between 0 and 100]

Begin!

Context:
---------
{context}
---------
Question: {question}
Answer:"""
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
    output_parser=output_parser,
)

chain = load_qa_chain(llm, chain_type="stuff", return_intermediate_steps=True, prompt=PROMPT)

results = chain({"input_documents": docs, "question": query1}, return_only_outputs=True)
results["intermediate_steps"]

ValidationError: 1 validation error for StuffDocumentsChain
return_intermediate_steps
  extra fields not permitted (type=value_error.extra)

# Useing weaviate:

In [79]:
db = Weaviate(client=client, index_name="DataClass", text_key="comment")
retriever = db.as_retriever()

In [80]:
docs = retriever.get_relevant_documents(query=query1)
docs

[Document(page_content="Dwight Howard is a undercover Faggot, this guy was caught on camera fondling Isiah Canaan penis when you were playing for the Rockets. Dwight's career is over because he's softer than marmalade and he couldn't shoot a beach ball into a ocean.", metadata={}),
 Document(page_content='Shut yo gay ass up dwight', metadata={}),
 Document(page_content="It's drakes speed. He's an effeminate rapper. His courtside antics are like those of a disgruntled cheerleader or a drunk groupie. Don't let the beard and goofy wannabe mean face fool you. Drake's a pussy and likely gay. URL P.S. ESPN is already a gossip channel that provides catharsis to weak, inadequate, impotent, pedophilic white men. CBS morning can't outdo the nazi mouse in garbage content.", metadata={}),
 Document(page_content="He's not a basketball player Lmaoo buthurt ass nigga", metadata={})]

In [81]:
chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
result = chain("I hate chinese and malays they are pigs!", return_only_outputs=True)
print(result)

InvalidRequestError: Invalid URL (POST /v1/openai/deployments/gf-model-deploy-003/completions)

In [67]:
file_path='./test.json'
data = json.loads(Path(file_path).read_text())

pprint(data)

[{'comment': 'Annihilate the chinese government please. They are inbred '
             "retards and shouldn't be able to call themselves, human fucking "
             'pigs..',
  'core_identity': ['origin'],
  'sub_identity': ['specific country']},
 {'comment': 'Fucking racist pigs.',
  'core_identity': ['race'],
  'sub_identity': ['other']},
 {'comment': 'Man Chinese people are the worst',
  'core_identity': ['race'],
  'sub_identity': ['asian']},
 {'comment': "I dislike most malays cause they're judgemental",
  'core_identity': ['race', 'origin'],
  'sub_identity': ['asian', 'specific country']},
 {'comment': 'Fuck China. Bunch of hosers.',
  'core_identity': ['origin'],
  'sub_identity': ['specific country']},
 {'comment': 'Good, fuck the Chinese',
  'core_identity': ['origin'],
  'sub_identity': ['specific country']},
 {'comment': 'Fuck chinks',
  'core_identity': ['race'],
  'sub_identity': ['asian']},
 {'comment': 'Screw the Chinese',
  'core_identity': ['origin'],
  'sub_identit

In [78]:
# Define the metadata extraction function.
def metadata_func(record: dict, metadata: dict) -> dict:

    metadata["core_identity"] = record.get("core_identity")
    metadata["sub_identity"] = record.get("sub_identity")
    metadata["source"] = None
    metadata["seq_num"] = None

    return metadata



loader = JSONLoader(
    file_path='./test.json',
    jq_schema='.[]',
    content_key="comment",
    metadata_func=metadata_func
)

data = loader.load()

pprint(data)

[Document(page_content="Annihilate the chinese government please. They are inbred retards and shouldn't be able to call themselves, human fucking pigs..", metadata={'source': None, 'seq_num': None, 'core_identity': ['origin'], 'sub_identity': ['specific country']}),
 Document(page_content='Fucking racist pigs.', metadata={'source': None, 'seq_num': None, 'core_identity': ['race'], 'sub_identity': ['other']}),
 Document(page_content='Man Chinese people are the worst', metadata={'source': None, 'seq_num': None, 'core_identity': ['race'], 'sub_identity': ['asian']}),
 Document(page_content="I dislike most malays cause they're judgemental", metadata={'source': None, 'seq_num': None, 'core_identity': ['race', 'origin'], 'sub_identity': ['asian', 'specific country']}),
 Document(page_content='Fuck China. Bunch of hosers.', metadata={'source': None, 'seq_num': None, 'core_identity': ['origin'], 'sub_identity': ['specific country']}),
 Document(page_content='Good, fuck the Chinese', metadata={

In [109]:
from langchain.chains.question_answering import load_qa_chain
# instantiate QA chain
qa_chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt_template)

In [92]:
pprint(prompt_template)

PromptTemplate(input_variables=['context', 'question'], output_parser=None, partial_variables={}, template="\nYou are a professional, specialised in reviewing abusive content which could violate sections 298 and 298A of Singapore's penal code.\nSection 298 of the Penal Code is a charge for uttering words, etc., with deliberate intent to wound the religious or racial feelings of any person.\nSection 298A of the Penal Code is a charge for promoting enmity between different groups on grounds of religion or race or doing acts prejudicial to maintenance of harmony.\nHate speech has been defined as all forms of expression which spread, incite, promote, or justify racial hatred, xenophobia, or other forms of hatred based on intolerance.\nYour instruction is to first evaluate if the comment at the end is considered a hate speech or not. Secondly, identify the possible core identity groups and the sub identity groups.\nUse only the context given below to carry out the task. If you don't know th

In [110]:
# ran with weaviate simiarlity + json as loader into langchain docs
output = qa_chain({"input_documents": data, "comment": "I hate chinese and malays they are pigs!"}, return_only_outputs=True)
print(output['output_text'])


Answer: Yes
Core Identity Groups: Chinese, Malays
Sub Identity Groups: None mentioned


In [111]:
# run using db without metadata
db = Weaviate(client=client, index_name="DataClass", text_key="comment")
retriever = db.as_retriever()
# pass example query to vector store and QA chain and print results
docs = db.similarity_search_by_text(query="I hate chinese and malays they are pigs!")
print(docs)
output = qa_chain({"input_documents": docs, "comment": "I hate chinese and malays they are pigs!"}, return_only_outputs=True)
print(output['output_text'])

[Document(page_content="Annihilate the chinese government please. They are inbred retards and shouldn't be able to call themselves, human fucking pigs..", metadata={}), Document(page_content='Fucking racist pigs.', metadata={}), Document(page_content='Man Chinese people are the worst', metadata={}), Document(page_content="I dislike most malays cause they're judgemental", metadata={})]

Answer: Yes
Core Identity Groups: Chinese, Malays
Sub Identity Groups: None mentioned


In [None]:
# clean up methods to find context, need to be able to include the metadata

In [None]:
# put it inside app.py

In [None]:
# run restricted format in JSON

In [None]:
# run it in a langchain explaining the backend of it

In [None]:
# put embedding back into nomic