In [1]:
import json
import textwrap
import time
import uuid
from typing import List
import numpy as np
import vertexai
from google.cloud import aiplatform
print(f"Vertex AI SDK version: {aiplatform.__version__}")

import sys
sys.path.append('./.local/lib/python3.10/site-packages')
import langchain
import os
import glob
print(f"LangChain version: {langchain.__version__}")
from langchain.docstore.document import Document
from langchain_google_vertexai import (
    VertexAI,
    VertexAIEmbeddings,
    VectorSearchVectorStore,
)
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

Vertex AI SDK version: 1.75.0
LangChain version: 0.3.15


In [2]:
# Project and region come from the environment; a missing variable raises KeyError here.
PROJECT_ID = os.environ["PROJECT_ID"]  
REGION = os.environ["REGION"]
# Initialise the Vertex AI SDK for this project and location.
vertexai.init(project=PROJECT_ID, location=REGION)
# NOTE(review): ME_REGION and ME_DIMENSIONS are not referenced by any other cell visible in this notebook.
ME_REGION = REGION
ME_DIMENSIONS = 768  # when using Vertex PaLM Embedding

### define llm without guardrails (otherwise, everything is ruled out)

In [3]:
from langchain_google_vertexai import ChatVertexAI
from langchain_google_vertexai import HarmBlockThreshold, HarmCategory

# Chat model with every listed harm category set to BLOCK_NONE, so the model's
# safety filters do not reject the texts that are being classified.
llm = ChatVertexAI(
    model="gemini-1.5-flash",
    safety_settings={
        harm_category: HarmBlockThreshold.BLOCK_NONE
        for harm_category in (
            HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            HarmCategory.HARM_CATEGORY_HARASSMENT,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        )
    },
)

# Embedding client bound to the same project and region as the chat model.
embeddings = VertexAIEmbeddings(
    model_name="textembedding-gecko-multilingual@001",
    location=REGION,
    project=PROJECT_ID,
)

### read the data

In [4]:
import json
import pandas as pd
import pickle
from features import list_of_features

def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # pickle can execute arbitrary code while loading; only read files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# The three input frames used by the rest of the notebook.
df_A_filtered = _read_pickle("df_A_filtered.pickle")
df_B_filtered = _read_pickle("df_B_filtered.pickle")
df_posts = _read_pickle("df_posts.pickle")

#### read the questions

In [5]:
list_of_features

['Does the text promote or endorse harmful stereotypes about Jewish people (e.g., controlling the media, financial systems, or governments)?(Yes/No)',
 'Does the text blame Jewish people collectively for societal problems, political events, or economic issues?(Yes/No)',
 'Does the text deny, minimize, or distort the Holocaust or other historical antisemitic events?(Yes/No)',
 'Does the text promote conspiracy theories that falsely portray Jewish people as a secretive, manipulative, or powerful group controlling world affairs?(Yes/No)',
 'Does the text use derogatory language, slurs, or dehumanizing descriptions when referring to Jewish people?(Yes/No)',
 'Does the text accuse Jewish people of exaggerated or false crimes, such as blood libel?(Yes/No)',
 'Does the text compare Jewish people or Israel to Nazis in a way intended to demonize or delegitimize?(Yes/No)',
 'Does the text deny Jewish people the right to self-determination or their identity as a people?(Yes/No)',
 'Does the text 

#### utility function for saving intermediate results

In [None]:
def save(object_to_save, file_name):
    """
    Pickle ``object_to_save`` to ``<file_name>_features.pickle``.

    The data is written to a temporary sibling file first and then moved over the
    target with ``os.replace``. The loops in this notebook overwrite the same
    checkpoint repeatedly, so an interruption in the middle of a dump must not
    destroy the previous checkpoint.

    args:
        object_to_save: any picklable object
        file_name: path prefix of the target file (``_features.pickle`` is appended)
    raises:
        whatever ``open`` / ``pickle.dump`` raise; the previous checkpoint is left intact
    """
    target_path = f"{file_name}_features.pickle"
    temp_path = f"{target_path}.tmp"
    try:
        with open(temp_path, "wb") as f:
            pickle.dump(object_to_save, f)
    except BaseException:
        # Clean up the partial temp file (also on KeyboardInterrupt) and re-raise.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise
    os.replace(temp_path, target_path)

#### define class for structured output format

In [5]:
from typing import Optional
from pydantic import BaseModel, Field

# Schema passed to `llm.with_structured_output(...)` by the feature-extraction functions:
# one string answer per question (answer_1 .. answer_10). The consumers pair the field
# values positionally with `list_of_features`, so field order matters.
# NOTE(review): the docstring and Field descriptions are presumably forwarded to the model
# as part of the schema, so their wording (typos included) is left untouched - confirm
# before editing them.
class Structred_Answers(BaseModel):
    """
    structred answers on a series of question, on an input text
    """
    answer_1: str = Field(description = "the answer to qestion 1")
    answer_2: str = Field(description = "the answer to qestion 2")
    answer_3: str = Field(description = "the answer to qestion 3")
    answer_4: str = Field(description = "the answer to qestion 4")
    answer_5: str = Field(description = "the answer to qestion 5")
    answer_6: str = Field(description = "the answer to qestion 6")
    answer_7: str = Field(description = "the answer to qestion 7")
    answer_8: str = Field(description = "the answer to qestion 8")
    answer_9: str = Field(description = "the answer to qestion 9")
    answer_10: str = Field(description = "the answer to qestion 10")


### features from first degree comments - A
#### define function for A comments (prompt, and feature extraction based on submission and comments text)

In [62]:
from langchain.prompts import ChatPromptTemplate
def feature_vector_from_comment_A(submission_title, submission_text, comment, llm, list_of_features):
    """
    Ask the LLM the feature questions about a first-degree comment and encode the answers as 0/1.

    args:
        submission_title: title of the original post the comment belongs to
        submission_text: body text of the original post
        comment: text of the comment being evaluated
        llm: chat model exposing ``with_structured_output``
        list_of_features: the questions, each expected to look like "Does the text ...?(Yes/No)"
    returns:
        dict mapping each shortened question to 0 (answered "No") or 1 (any other answer),
        or None when the chain returns no structured answer
    """

    template = """ you will be given a original post's title and text, and a comment regarding the "original post" and questions regarding the comment.
    answer the questions, on an input comment with respect to the title and subject in the original post. choose one of the optional answers followed by each question.
    <title>
    {title}
    </title>
    <subject>
    {subject}
    </subject>
    <questions>
    {list_of_features}
    </questions>
    <input_text>
    {input}
    </input_text>
    """
    prompt_template = ChatPromptTemplate.from_template(template)

    llm_structred = llm.with_structured_output(Structred_Answers)

    chain = prompt_template | llm_structred

    answers = chain.invoke({"title":submission_title,
                        "subject":submission_text,
                        "list_of_features":list_of_features,
                        "input": comment})
    if not answers:
        return None

    # pydantic v2 deprecates ``.dict()`` in favour of ``.model_dump()``; support both.
    raw_answers = answers.model_dump() if hasattr(answers, "model_dump") else answers.dict()

    question_prefix = "Does the text "
    question_suffix = "(Yes/No)"
    dict_rephrase = {}
    # Answers are paired with the questions by position (answer_1 <-> first question, ...).
    for question, value in zip(list_of_features, raw_answers.values()):
        # Normalise case, whitespace, quotes and trailing punctuation so that
        # "no", " No " and "No." are not counted as positive answers.
        normalized = str(value).strip().strip("\"'.!").strip().lower()
        # Shorten the question to its core; strip the boilerplate only when it is
        # actually present instead of slicing a fixed number of characters.
        key = question
        if key.startswith(question_prefix):
            key = key[len(question_prefix):]
        if key.endswith(question_suffix):
            key = key[:-len(question_suffix)]
        dict_rephrase[key] = 0 if normalized == "no" else 1

    return dict_rephrase
   
    

#### check extraction for one record

In [65]:
# Use the row at position 0 of the first-degree comments frame as a single smoke-test input.
index = 0
submission_title = df_A_filtered.submission_title.iloc[index]
submission_text = df_A_filtered.submission_text.iloc[index]
comment = df_A_filtered.comment_text.iloc[index]

# Turn LangChain's global debug flag off.
langchain.debug = False

In [66]:
# Run the extraction once on the sample row and display the resulting question -> 0/1 dict.
feature_vector = feature_vector_from_comment_A(submission_title, submission_text, comment, llm, list_of_features)
feature_vector

{'promote or endorse harmful stereotypes about Jewish people (e.g., controlling the media, financial systems, or governments)?': 0,
 'blame Jewish people collectively for societal problems, political events, or economic issues?': 0,
 'deny, minimize, or distort the Holocaust or other historical antisemitic events?': 0,
 'promote conspiracy theories that falsely portray Jewish people as a secretive, manipulative, or powerful group controlling world affairs?': 0,
 'use derogatory language, slurs, or dehumanizing descriptions when referring to Jewish people?': 0,
 'accuse Jewish people of exaggerated or false crimes, such as blood libel?': 0,
 'compare Jewish people or Israel to Nazis in a way intended to demonize or delegitimize?': 0,
 'deny Jewish people the right to self-determination or their identity as a people?': 0,
 'advocate for discrimination, violence, or exclusion against Jewish people?': 0,
 'selectively hold Jewish people or Israel to a higher standard than other groups or n

#### run over all dataframe A

In [None]:
# Extract the feature dict for every first-degree comment, checkpointing along the way.
# save() requires a file-name prefix; it writes "<prefix>_features.pickle".
# NOTE(review): a later cell reloads "features.pickle" - confirm which file that cell should read.
A_CHECKPOINT_NAME = "A_comments"
features = []

for index, row in df_A_filtered.iterrows():
    features.append(feature_vector_from_comment_A(row.submission_title, row.submission_text, row.comment_text, llm, list_of_features))
    # `index` is the frame's label, not a running counter, so checkpoints are only
    # written when a label happens to be a multiple of 100.
    if index % 100 == 0:
        print(index)
        save(features, A_CHECKPOINT_NAME)

# Persist the rows processed after the last in-loop checkpoint.
save(features, A_CHECKPOINT_NAME)

100
200
500
600
700
800
900
1300
1400
1500
1600
1700
1800
2000
2100
2200
2300
2400
3000
3200
3400
3600
3700
3800
4400
4500
4600
4700
5100
5200
5300
5700
5800
6000
6100
6200
6500
6600
6700
7000
7100
7300
7500
7600
7700
8000
8200
8300
8400
8500
8600
8700
8900
9100
9400
9500
9600
9700
9800
10100
10500
10600
10700
10800
11000
11200
11300
11400
11600
12200
12300
12500
12700
13000
13100
13400
13500
13600
13800
14000
14200
14300
14400
14500
14600
14700
14900
15000
15100
15200
15300
15400
15600
15700
15800
15900
16200
16400
16500
16700
16800
16900
17000
17100
17200
17400
17500
17600
17700
17800
17900
18000
18200
18300
18600
18700
18800
18900
19000
19100
19200
19300
19400
19600
19900
20100
20300
20400
20500
20700
21100
21200
21300
21400
21500
21600
21700
21800
21900
22200
22400
22500
22600
23000
23200
23400
23500
23600
23700
23800
24000
24200
24300
24400
24500
24800
25000
25100
25200
25300
25400
25600
25700
25800
25900
26000
26100
26200
26400
26500
26600
26700
26800
26900
27000
27200
27300
2740

In [3]:
import pickle
# Reload a previously pickled `features` list (presumably the A-comment features; the next cell
# attaches it to df_A_filtered).
# NOTE(review): `save()` in this notebook writes "<file_name>_features.pickle", so confirm that
# "features.pickle" is the file the extraction loop actually produced.
# pickle.load can execute arbitrary code; only load files created by this project.
with open("features.pickle", 'rb') as f:
    features = pickle.load(f)

In [32]:
# Attach the per-comment feature dicts as a new column; mutates df_A_filtered in place.
# NOTE(review): assumes `features` is in the same row order and has the same length as df_A_filtered - confirm.
df_A_filtered['features'] = features

### features from only posts 
#### define function for submissions (prompt, and feature extraction based on submission and the title)

In [33]:
from langchain.prompts import ChatPromptTemplate
def feature_vector_from_post(submission_title, submission_text, llm, list_of_features):
    """
    Ask the LLM the feature questions about a post (title + text) and encode the answers as 0/1.

    args:
        submission_title: title of the post
        submission_text: body text of the post
        llm: chat model exposing ``with_structured_output``
        list_of_features: the questions, each expected to look like "Does the text ...?(Yes/No)"
    returns:
        dict mapping each shortened question to 0 (answered "No") or 1 (any other answer),
        or None when the chain returns no structured answer
    """

    template = """ you will be given a post title and post text, and questions regarding the post text.
    answer the questions, on the input post text. choose one of the optional answers followed by each question.
    <title>
    {title}
    </title>
    <subject>
    {subject}
    </subject>
    <questions>
    {list_of_features}
    </questions>
    """
    prompt_template = ChatPromptTemplate.from_template(template)

    llm_structred = llm.with_structured_output(Structred_Answers)

    chain = prompt_template | llm_structred

    answers = chain.invoke({"title":submission_title,
                        "subject":submission_text,
                        "list_of_features":list_of_features})
    if not answers:
        return None

    # pydantic v2 deprecates ``.dict()`` in favour of ``.model_dump()``; support both.
    raw_answers = answers.model_dump() if hasattr(answers, "model_dump") else answers.dict()

    question_prefix = "Does the text "
    question_suffix = "(Yes/No)"
    dict_rephrase = {}
    # Answers are paired with the questions by position (answer_1 <-> first question, ...).
    for question, value in zip(list_of_features, raw_answers.values()):
        # Normalise case, whitespace, quotes and trailing punctuation so that
        # "no", " No " and "No." are not counted as positive answers.
        normalized = str(value).strip().strip("\"'.!").strip().lower()
        # Shorten the question to its core; strip the boilerplate only when it is
        # actually present instead of slicing a fixed number of characters.
        key = question
        if key.startswith(question_prefix):
            key = key[len(question_prefix):]
        if key.endswith(question_suffix):
            key = key[:-len(question_suffix)]
        dict_rephrase[key] = 0 if normalized == "no" else 1

    return dict_rephrase
   
    

In [32]:
# import pickle 
# with open("df_posts.pickle", "rb") as f:
#     df_posts = pickle.load(f)
# Check that every submission id in `df_only_posts` also appears in df_posts.
posts_list = df_posts.submission_id.values
# NOTE(review): `df_only_posts` is not defined in any cell visible in this notebook, so this line
# fails on a fresh kernel unless it is created elsewhere - confirm its origin.
posts_from_A = df_only_posts.submission_id.values
# NOTE(review): this expression is neither the cell's last one nor printed/assigned, so its result is not shown.
set(posts_from_A).issubset(set(posts_list))

df_posts.head(1)

Unnamed: 0,submission_id,submission_title,submission_text,submission_author,submission_timestamp,submission_score,upvote_ration,subreddit_id,num_comments
0,t3_16wuocg,Need help gathering info regarding the conflict.,My school is conducting an MUN (Model United N...,Moon_Knight_69,1696145282,1,1.0,t5_2sgz7,53


#### run over all submissions, and extract features

In [37]:
# Extract the feature dict for every submission as (submission_id, features) tuples,
# checkpointing to "posts_features_from_posts_features.pickle" along the way.
POSTS_CHECKPOINT_NAME = "posts_features_from_posts"
posts_features = []

for index, row in df_posts.iterrows():
    posts_features.append((row.submission_id, feature_vector_from_post(row.submission_title, row.submission_text, llm, list_of_features)))
    if index % 100 == 0:
        print(index)
        save(posts_features, POSTS_CHECKPOINT_NAME)

# Persist the rows processed after the last in-loop checkpoint; without this the
# tail of the frame never reaches the pickle file.
save(posts_features, POSTS_CHECKPOINT_NAME)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800


Retrying langchain_google_vertexai.chat_models._completion_with_retry.<locals>._completion_with_retry_inner in 4.0 seconds as it raised ResourceExhausted: 429 Resource exhausted. Please try again later. Please refer to https://cloud.google.com/vertex-ai/generative-ai/docs/error-code-429 for more details..
Retrying langchain_google_vertexai.chat_models._completion_with_retry.<locals>._completion_with_retry_inner in 4.0 seconds as it raised ResourceExhausted: 429 Resource exhausted. Please try again later. Please refer to https://cloud.google.com/vertex-ai/generative-ai/docs/error-code-429 for more details..


6900
7000
7100
7200
7300
7400
7500
7600
7700
7800


### features from second (or more) degree comments - B
#### define function for B comments (prompt, and feature extraction based on submission and previous comment and current comment text)

In [6]:
from langchain.prompts import ChatPromptTemplate
def feature_vector_from_comment_B(submission_title, submission_text, comment_A, comment_B, llm, list_of_features):
    """
    Ask the LLM the feature questions about a reply (comment_B) in the context of the
    original post and the comment it replies to (comment_A), and encode the answers as 0/1.

    args:
        submission_title: title of the original post
        submission_text: body text of the original post
        comment_A: text of the comment that comment_B replies to
        comment_B: text of the comment being evaluated
        llm: chat model exposing ``with_structured_output``
        list_of_features: the questions, each expected to look like "Does the text ...?(Yes/No)"
    returns:
        dict mapping each shortened question to 0 (answered "No") or 1 (any other answer),
        or None when the chain returns no structured answer
    """

    template = """ you will be given a original post's title and text, comment_A to the "original post", a comment_B to comment_A and questions regarding the comment_B.
    answer the questions, on an comment_B with respect to the context which is the title and subject in the original post and comment_A. choose one of the optional answers followed by each question.
    <title>
    {title}
    </title>
    <subject>
    {subject}
    </subject>
    <comment_A>
    {comment_A}
    </comment_A>
    <comment_B>
    {comment_B}
    </comment_B>
    <questions>
    {list_of_features}
    </questions>
    
    """
    prompt_template = ChatPromptTemplate.from_template(template)

    llm_structred = llm.with_structured_output(Structred_Answers)

    chain = prompt_template | llm_structred

    answers = chain.invoke({"title":submission_title,
                        "subject":submission_text,
                        "comment_A": comment_A,
                        "comment_B": comment_B,
                        "list_of_features":list_of_features
                           })
    if not answers:
        return None

    # pydantic v2 deprecates ``.dict()`` in favour of ``.model_dump()``; support both.
    raw_answers = answers.model_dump() if hasattr(answers, "model_dump") else answers.dict()

    question_prefix = "Does the text "
    question_suffix = "(Yes/No)"
    dict_rephrase = {}
    # Answers are paired with the questions by position (answer_1 <-> first question, ...).
    for question, value in zip(list_of_features, raw_answers.values()):
        # Normalise case, whitespace, quotes and trailing punctuation so that
        # "no", " No " and "No." are not counted as positive answers.
        normalized = str(value).strip().strip("\"'.!").strip().lower()
        # Shorten the question to its core; strip the boilerplate only when it is
        # actually present instead of slicing a fixed number of characters.
        key = question
        if key.startswith(question_prefix):
            key = key[len(question_prefix):]
        if key.endswith(question_suffix):
            key = key[:-len(question_suffix)]
        dict_rephrase[key] = 0 if normalized == "no" else 1

    return dict_rephrase
   
    

In [21]:
df_B_filtered.columns

Index(['submission_id', 'submission_title', 'submission_text', 'comment_id_x',
       'comment_text_x', 'comment_author_x', 'to_comment', 'comment_text_y',
       'comment_author_y', 'comment_timestamp', 'len_submission_text',
       'len_comment_text_x', 'len_comment_text_y'],
      dtype='object')

#### run over all dataframe B
#### save iterations in check points

In [None]:
# Extract the feature dict for every reply as (frame label, comment_id_x, features) tuples,
# checkpointing to "B_comment_to_commentv2_features.pickle" along the way.
# comment_text_y is passed as comment_A (context) and comment_text_x as comment_B (the comment
# being evaluated) - presumably because comment_x is the reply to `to_comment`; verify against the data.
B_CHECKPOINT_NAME = "B_comment_to_commentv2"
features = []

for index, row in df_B_filtered.iterrows():
    features.append((index,row.comment_id_x,feature_vector_from_comment_B(row.submission_title, row.submission_text, row.comment_text_y, row.comment_text_x, llm, list_of_features)))
    if index % 100 == 0:
        print(index, row.comment_id_x)
        save(features, B_CHECKPOINT_NAME)

# Persist the rows processed after the last in-loop checkpoint; without this the
# tail of the frame never reaches the pickle file.
save(features, B_CHECKPOINT_NAME)

0 t1_k2z76q7
100 t1_k3203fp
200 t1_k360r6a
300 t1_k394dp0
400 t1_k3ae241
500 t1_k3csjl8
600 t1_k3fal5u
700 t1_k3hwv74
800 t1_k3l93g8
900 t1_k3mti7h
1000 t1_k3nat9e
1300 t1_k3qmk6e
1600 t1_k3tliyn
1800 t1_k3twnhr
1900 t1_k3u1hbu
2000 t1_k3u3xv3
2100 t1_k3u6e8d
2500 t1_k3ui0pn
2700 t1_k3unui0
2800 t1_k3upuhz
3100 t1_k3ux97a
3200 t1_k3uznyo
3300 t1_k3v27ti
3400 t1_k3v4k2x
3500 t1_k3v76iq
3600 t1_k3vaiie
3700 t1_k3veo6v
3800 t1_k3vj2z2
3900 t1_k3vmlx0
4000 t1_k3vp04k
4200 t1_k3vtoy5
4300 t1_k3vxz6t
4400 t1_k3w20rj
4500 t1_k3w5ej3
4700 t1_k3wayzj
5000 t1_k3wh98b
5100 t1_k3wj08o
5200 t1_k3wl9hm
5300 t1_k3wndoa
5400 t1_k3woxxy
5600 t1_k3ws445
5700 t1_k3wtll7
5800 t1_k3wv767
5900 t1_k3wwt2t
6200 t1_k3x1hdu
6300 t1_k3x33f7
6500 t1_k3x6b0y
6600 t1_k3x8097
6700 t1_k3x9c34
6800 t1_k3xb3yi
7000 t1_k3xf26l
7100 t1_k3xgvzr
7200 t1_k3ximgp
7400 t1_k3xmeij
7600 t1_k3xpte3
7700 t1_k3xr84p
7800 t1_k3xtjn2
7900 t1_k3xw1j0
8300 t1_k3y4kej
8400 t1_k3y6vqa
8500 t1_k3y91jw
8600 t1_k3yapq2
8700 t1_k3yc5z0
8800

In [1]:
# len(B_comment_to_comment)

### translate to real features or numpy array

In [48]:
# Take the first two post texts as small sample inputs for the embedding experiments below.
text1 = df_posts.iloc[0].submission_text
text2 = df_posts.iloc[1].submission_text
raw_texts = [text1, text2]
len(raw_texts)

2

In [22]:
# Embed one sample text to check the embedding call and the vector length.
# The original referenced `text`, which no cell in this notebook defines; `text1`
# (the first post text from the sample cell) is used instead.
vector = embeddings.embed(texts=[text1], embeddings_task_type='SEMANTIC_SIMILARITY')

In [25]:
len(vector[0])

768

#### run embedding over all texts, tuned for semantic similarity

At first, I thought about using FAISS for similarity search for each comment; this was not used in the article.
The division into batches was due to the Vertex AI restriction on embeddings per minute.

In [129]:
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

# Experimental cell: embed post texts one request at a time and build a FAISS index from them.
# NOTE(review): `FAISS` is not imported anywhere in the cells visible here, so this cell raises
# NameError on a fresh kernel unless it is imported elsewhere.
embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko-multilingual@001")
task = "SEMANTIC_SIMILARITY"
dimensionality = 768  # NOTE(review): not used anywhere in this cell

docs = []
vectors = []
texts = []
metadatas = []
# First batch: the first 500 rows of df_posts.
for index, row in df_posts[['submission_id','submission_text']][:500].iterrows():
    docs.append(Document(page_content = row.submission_text, metadata = {'submission_id':row.submission_id}))
    texts.append(row.submission_text)
    metadatas.append({'submission_id':row.submission_id})
    pre_text = [TextEmbeddingInput(row.submission_text, task)]
    vectors.append(embedding_model.get_embeddings(pre_text))       
    
# Each get_embeddings() call returns a list; element 0's `.values` is the vector.
embedded_vectors_as_list_of_list = np.array([embed[0].values for embed in vectors])

db = FAISS.from_embeddings(text_embeddings = zip(texts, embedded_vectors_as_list_of_list),
                           embedding = embeddings,
                           metadatas = metadatas)

                
# NOTE(review): this second loop looks unfinished:
#   - the inner loop always re-reads rows [:3] instead of the batch starting at `index`;
#   - the inner loop variable `index` shadows the outer batch offset, so the slices below use
#     the last row label rather than the batch offset;
#   - texts are sliced with [index:] while vectors and metadatas are sliced with [:index];
#   - `db` is rebuilt (overwritten) on every batch instead of being extended.
for index in range(500,df_posts.shape[0],500):
    print(index)
    for index, row in df_posts[['submission_id','submission_text']][:3].iterrows():
        docs.append(Document(page_content = row.submission_text, metadata = {'submission_id':row.submission_id}))
        texts.append(row.submission_text)
        metadatas.append({'submission_id':row.submission_id})
        pre_text = [TextEmbeddingInput(row.submission_text, task)]
        vectors.append(embedding_model.get_embeddings(pre_text))       

    embedded_vectors_as_list_of_list = np.array([embed[0].values for embed in vectors])

    db = FAISS.from_embeddings(text_embeddings = zip(texts[index:], embedded_vectors_as_list_of_list[:index]),
                               embedding = embeddings,
                               metadatas = metadatas[:index])
    #     time.sleep(600)

In [154]:
# Query the index with the first stored vector and ask for the 4 nearest entries.
# NOTE(review): 'score_threshold' is forwarded as a keyword argument; how `db` interprets it
# depends on the vector-store implementation - confirm.
kwargs = {'score_threshold':0.7}
result = db.similarity_search_by_vector(embedding = list(embedded_vectors_as_list_of_list[0]),
                               k = 4,
                            # filter = {'submission_id': 't3_16wuocg'},
                                       **kwargs)

In [174]:
# add save list every iteration

# Dry run of the batching scheme: print the [start, end) row range of every batch.
# The end of a batch is derived from `step` (the original hard-coded 500 in the
# printout, which would be wrong for any other step) and clipped to the frame length.
step = 500
for index in range(0,df_posts.shape[0],step):
    print(index, min(index + step, df_posts.shape[0]))


0 500
500 1000
1000 1500
1500 2000
2000 2500
2500 3000
3000 3500
3500 4000
4000 4500
4500 5000
5000 5500
5500 6000
6000 6500
6500 7000
7000 7500
7500 7853


# embedding - posts

In [183]:
# Embed every post text, one request per post, pausing 60 seconds after each batch of `step` posts.
embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko-multilingual@001")
task = "SEMANTIC_SIMILARITY"
step = 100
docs, vectors, texts, metadatas = [], [], [], []

for index in range(0, df_posts.shape[0], step):
    print(index)
    # Batch boundaries; the last batch is clipped to the end of the frame.
    start = index
    end = min(index + step, df_posts.shape[0])
    batch_rows = df_posts[['submission_id', 'submission_text']][start:end]
    for inner_index, row in batch_rows.iterrows():
        print('inner_index ', inner_index)
        docs.append(
            Document(
                page_content=row.submission_text,
                metadata={'submission_id': row.submission_id},
            )
        )
        texts.append(row.submission_text)
        metadatas.append({'submission_id': row.submission_id})
        pre_text = [TextEmbeddingInput(row.submission_text, task)]
        vectors.append(embedding_model.get_embeddings(pre_text))
    time.sleep(60)

0
inner_index  0
inner_index  1
inner_index  2
inner_index  3
inner_index  4
inner_index  5
inner_index  6
inner_index  7
inner_index  8
inner_index  9
inner_index  10
inner_index  11
inner_index  12
inner_index  13
inner_index  14
inner_index  15
inner_index  16
inner_index  17
inner_index  18
inner_index  19
inner_index  20
inner_index  21
inner_index  22
inner_index  23
inner_index  24
inner_index  25
inner_index  26
inner_index  27
inner_index  28
inner_index  29
inner_index  30
inner_index  31
inner_index  32
inner_index  33
inner_index  34
inner_index  35
inner_index  36
inner_index  37
inner_index  38
inner_index  39
inner_index  40
inner_index  41
inner_index  42
inner_index  43
inner_index  44
inner_index  45
inner_index  46
inner_index  47
inner_index  48
inner_index  49
inner_index  50
inner_index  51
inner_index  52
inner_index  53
inner_index  54
inner_index  55
inner_index  56
inner_index  57
inner_index  58
inner_index  59
inner_index  60
inner_index  61
inner_index  62


## embedding for df_A
the extraction of the vector was due to the inability to pickle the output of TextEmbeddingInput()

In [2]:
# Embed the first-degree comments (df_A_filtered), one request per comment, pausing 60 seconds
# after each batch of `step` comments and pickling the vectors collected so far.
#
# The original cell was a copy of the df_B cell and pickled `vectors_B` into
# 'vectors_Bv4.pickle'; it never wrote `vectors_A` and would overwrite the B vectors file
# (or fail when `vectors_B` is undefined). It now pickles `vectors_A` to its own file.
embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko-multilingual@001")
task = "SEMANTIC_SIMILARITY"
step = 100
vectors_A = []
# NOTE(review): 17381 looks like a resume offset carried over from an earlier run; with an empty
# `vectors_A` the rows before it are skipped. Set to 0 for a full run - confirm.
start_point = 17381

for index in range(start_point,df_A_filtered.shape[0],step):
    print(index)
    # Batch boundaries; the last batch is clipped to the end of the frame.
    start = index
    end = min(index + step, df_A_filtered.shape[0])
    for inner_index, row in df_A_filtered[['comment_id','comment_text']][start:end].iterrows():
        print('inner_index ',inner_index)
        pre_text = [TextEmbeddingInput(row.comment_text, task)]
        vectors_A.append((row.comment_id,embedding_model.get_embeddings(pre_text)))
    time.sleep(60)
    # Only the plain vector values are pickled (element 0 of each get_embeddings() result).
    for_pickle = [(vec[0],vec[1][0].values) for vec in vectors_A]
    with open('vectors_A.pickle', 'wb') as f:
        pickle.dump(for_pickle, f)
    

## embedding for df_B

In [7]:
# Reload the checkpointed B features (a list of (index, comment_id_x, features) tuples) and
# join them back onto the B comments by comment id.
with open('B_comment_to_commentv2_features.pickle', 'rb') as f:
    B_features = pickle.load(f)
B_features_df = pd.DataFrame(B_features, columns = ['index','comment_id_x', 'features'])
# NOTE(review): this expression is neither the cell's last one nor printed, so the shapes are not shown.
df_B_filtered.shape, B_features_df.shape
# Default (inner) merge: comments without a checkpointed feature row are dropped.
# NOTE(review): duplicate comment_id_x values on either side would multiply rows - confirm uniqueness.
df_B = df_B_filtered.merge(B_features_df, on='comment_id_x')
df_B.head(1)

Unnamed: 0,submission_id,submission_title,submission_text,comment_id_x,comment_text_x,comment_author_x,to_comment,comment_text_y,comment_author_y,comment_timestamp,len_submission_text,len_comment_text_x,len_comment_text_y,index,features
0,t3_16wvusf,Neturei Karta did all the thinking for me,I was extremely brainwashed about 10 years ago...,t1_k2z76q7,I guess I'd use those two words somewhat inter...,Old_Calligrapher1563,t1_k2z6w0g,>I know most jews absolutely despise them but ...,pinchasthegris,1696150982,162,24,37,0,{'promote or endorse harmful stereotypes about...


### explode questions to columns

In [8]:
# Drop rows containing any missing value (in place), then give every question its own column
# holding that question's value from the row's `features` dict.
df_B.dropna(inplace=True)
question_keys = list(df_B['features'].iloc[0].keys())
for key in question_keys:
    def _value_for_key(feature_dict, wanted=key):
        # Look up this question's 0/1 value in one row's feature dict.
        return feature_dict[wanted]

    df_B[key] = df_B['features'].apply(_value_for_key)

### calc sum of antisemitism
Because the B dataframe contains about 180k comments, and because I am only interested in antisemitic comments, I filtered out all non-antisemitic comments.

In [9]:
# Number of questions answered positively for each comment.
# The question columns are selected by name (the keys of the feature dict) rather than as
# "the last 10 columns": positional selection silently includes `sum_of_antisemitism` itself
# when this cell is run a second time, and breaks if any column is added after the questions.
feature_columns = list(df_B['features'].iloc[0].keys())
df_B['sum_of_antisemitism'] = df_B[feature_columns].sum(axis=1)

In [34]:
df_B[df_B['sum_of_antisemitism']>0].head(1)

Unnamed: 0,submission_id,submission_title,submission_text,comment_id_x,comment_text_x,comment_author_x,to_comment,comment_text_y,comment_author_y,comment_timestamp,...,"blame Jewish people collectively for societal problems, political events, or economic issues?","deny, minimize, or distort the Holocaust or other historical antisemitic events?","promote conspiracy theories that falsely portray Jewish people as a secretive, manipulative, or powerful group controlling world affairs?","use derogatory language, slurs, or dehumanizing descriptions when referring to Jewish people?","accuse Jewish people of exaggerated or false crimes, such as blood libel?",compare Jewish people or Israel to Nazis in a way intended to demonize or delegitimize?,deny Jewish people the right to self-determination or their identity as a people?,"advocate for discrimination, violence, or exclusion against Jewish people?","selectively hold Jewish people or Israel to a higher standard than other groups or nations, implying inherent wrongdoing?",sum_of_antisemitism
3,t3_16wvusf,Neturei Karta did all the thinking for me,I was extremely brainwashed about 10 years ago...,t1_k2zcci5,">We dont really despise them.\n\nNah, we do, a...",Fast-Promotion-2805,t1_k2z6w0g,>I know most jews absolutely despise them but ...,pinchasthegris,1696155065,...,0,0,0,0,0,0,1,0,0,1


In [5]:
# Reload the B embedding vectors pickled by an earlier run; the resume cell below appends to this list.
# pickle.load can execute arbitrary code; only load files created by this project.
with open('vectors_Bv3.pickle', 'rb') as f:
    vectors_B= pickle.load(f)

In [3]:
len(vectors_B)

In [10]:
# Keep only the comments with at least one positive answer, renumbered from 0 as an independent copy.
df_B_anti = df_B[df_B['sum_of_antisemitism']>0].reset_index(drop=True).copy()
df_B_anti.head(1)

Unnamed: 0,submission_id,submission_title,submission_text,comment_id_x,comment_text_x,comment_author_x,to_comment,comment_text_y,comment_author_y,comment_timestamp,...,"blame Jewish people collectively for societal problems, political events, or economic issues?","deny, minimize, or distort the Holocaust or other historical antisemitic events?","promote conspiracy theories that falsely portray Jewish people as a secretive, manipulative, or powerful group controlling world affairs?","use derogatory language, slurs, or dehumanizing descriptions when referring to Jewish people?","accuse Jewish people of exaggerated or false crimes, such as blood libel?",compare Jewish people or Israel to Nazis in a way intended to demonize or delegitimize?,deny Jewish people the right to self-determination or their identity as a people?,"advocate for discrimination, violence, or exclusion against Jewish people?","selectively hold Jewish people or Israel to a higher standard than other groups or nations, implying inherent wrongdoing?",sum_of_antisemitism
0,t3_16wvusf,Neturei Karta did all the thinking for me,I was extremely brainwashed about 10 years ago...,t1_k2zcci5,">We dont really despise them.\n\nNah, we do, a...",Fast-Promotion-2805,t1_k2z6w0g,>I know most jews absolutely despise them but ...,pinchasthegris,1696155065,...,0,0,0,0,0,0,1,0,0,1


In [22]:
# Resume embedding the antisemitic B comments, one request per comment, pausing 60 seconds after
# each batch of `step` comments and pickling everything collected so far.
# This cell depends on `vectors_B` already holding the results of earlier runs (loaded above),
# which is why the list initialisation is commented out.
embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko-multilingual@001")
task = "SEMANTIC_SIMILARITY"
step = 100
# vectors_B = []
# NOTE(review): 17381 is a hard-coded resume offset (row position in df_B_anti) - confirm before re-running.
start_point = 17381    

for index in range(start_point,df_B_anti.shape[0],step):
    print(index)
    # Batch boundaries; the last batch is clipped to the end of the frame.
    if index + step > df_B_anti.shape[0]:
        start = index
        end = df_B_anti.shape[0]
    else:
        start = index
        end = index + step
    for inner_index, row in df_B_anti[['comment_id_x','comment_text_x']][start:end].iterrows():
        print('inner_index ',inner_index)
        pre_text = [TextEmbeddingInput(row.comment_text_x, task)]
        vectors_B.append((row.comment_id_x,embedding_model.get_embeddings(pre_text)))       
    time.sleep(60)
    # NOTE(review): the first 8096 entries are copied as they are and only later entries are
    # reduced to (id, vector values); presumably the first 8096 were already converted in the
    # loaded pickle - confirm, since 8096 is tied to one specific earlier run.
    for_pickle = vectors_B[:8096]
    for_pickle += [(vec[0],vec[1][0].values) for vec in vectors_B[8096:]]
    # for_pickle = [(vec[0],vec[1][0].values) for vec in vectors_B]
    with open('vectors_Bv4.pickle', 'wb') as f:
        pickle.dump(for_pickle, f)
    

17381
inner_index  17381
inner_index  17382
inner_index  17383
inner_index  17384
inner_index  17385
inner_index  17386
inner_index  17387
inner_index  17388
inner_index  17389
inner_index  17390
inner_index  17391
inner_index  17392
inner_index  17393
inner_index  17394
inner_index  17395
inner_index  17396
inner_index  17397
inner_index  17398
inner_index  17399
inner_index  17400
inner_index  17401
inner_index  17402
inner_index  17403
inner_index  17404
inner_index  17405
inner_index  17406
inner_index  17407
inner_index  17408
inner_index  17409
inner_index  17410
inner_index  17411
inner_index  17412
inner_index  17413
inner_index  17414
inner_index  17415
inner_index  17416
inner_index  17417
inner_index  17418
inner_index  17419
inner_index  17420
inner_index  17421
inner_index  17422
inner_index  17423
inner_index  17424
inner_index  17425
inner_index  17426
inner_index  17427
inner_index  17428
inner_index  17429
inner_index  17430
inner_index  17431
inner_index  17432
inner_

In [18]:
df_B_filtered.head(2)

Unnamed: 0,submission_id,submission_title,submission_text,comment_id_x,comment_text_x,comment_author_x,to_comment,comment_text_y,comment_author_y,comment_timestamp,len_submission_text,len_comment_text_x,len_comment_text_y
0,t3_16wvusf,Neturei Karta did all the thinking for me,I was extremely brainwashed about 10 years ago...,t1_k2z76q7,I guess I'd use those two words somewhat inter...,Old_Calligrapher1563,t1_k2z6w0g,>I know most jews absolutely despise them but ...,pinchasthegris,1696150982,162,24,37
1,t3_16wvusf,Neturei Karta did all the thinking for me,I was extremely brainwashed about 10 years ago...,t1_k2z7ik6,>I know people generally find them to be extre...,pinchasthegris,t1_k2z76q7,I guess I'd use those two words somewhat inter...,Old_Calligrapher1563,1696151238,162,89,24


In [196]:
# Write the posts frame to 'df_posts.pickle' (overwrites the file if it already exists).
with open('df_posts.pickle', 'wb') as f:
    pickle.dump(df_posts, f)

In [194]:
import pickle 
# Read the pickled posts frame back into `df` and display its shape as a check.
with open('df_posts.pickle', 'rb') as f:
    df = pickle.load(f)
df.shape

(7853, 9)

In [122]:
# Embed the single sample text `text1` directly with the Vertex AI embedding model.
# Relies on `text1`, `task` and `embedding_model` from earlier cells.
pre_text = [TextEmbeddingInput(text1, task)]
gcp_embed = embedding_model.get_embeddings(pre_text)