In [1]:
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer

# Sentences to compare; the first one is the anchor for both comparisons.
texts = [
    "There's a kid on a skateboard.",
    "A kid is skateboarding.",
    "A kid is inside the house.",
]

# Fetch the tokenizer and encoder weights (downloaded automatically on first use).
tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
model = AutoModel.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")

# Turn the sentences into one padded/truncated batch of PyTorch tensors.
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Inference only, so gradient tracking is disabled; the pooled output is
# taken as the sentence embedding.
with torch.no_grad():
    embeddings = model(
        **inputs,
        output_hidden_states=True,
        return_dict=True,
    ).pooler_output

# SciPy's `cosine` is a distance, so similarity = 1 - distance.
# Similarities lie in [-1, 1]; higher means more similar.
cosine_sim_0_1, cosine_sim_0_2 = [
    1 - cosine(embeddings[0], embeddings[other]) for other in (1, 2)
]

print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[1], cosine_sim_0_1))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[2], cosine_sim_0_2))




tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Cosine similarity between "There's a kid on a skateboard." and "A kid is skateboarding." is: 0.943
Cosine similarity between "There's a kid on a skateboard." and "A kid is inside the house." is: 0.439


In [3]:
from fastfit import FastFitTrainer, FastFit

In [4]:
# Load a FastFit model from a hard-coded absolute path.
# NOTE(review): the path looks machine-specific — consider a configurable base directory.
# This rebinds the global `model`, which the first cell assigns to the SimCSE encoder.
model = FastFit.from_pretrained('/projects/academic/kjoseph/zijian/zijianan_blm_random_0')

In [1]:
from transformers import AutoTokenizer, pipeline




In [1]:
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np
from tqdm import tqdm_notebook as tqdm
def get_features(texts, model_name='princeton-nlp/sup-simcse-bert-base-uncased', batch_size=32):
    """Embed texts with a Hugging Face encoder, one batch at a time.

    Every text is tokenized (padded, truncated to 256 tokens) and run through
    the model without gradient tracking. The last-layer hidden state at
    position 0 of each sequence (the [CLS] token for BERT-style models) is
    used as that text's embedding.

    Args:
        texts: A single string or an iterable of strings. A bare string is
            treated as ONE text; without this, slicing it by ``batch_size``
            would split a long query into character chunks and embed each
            chunk as a separate row.
        model_name: Checkpoint identifier passed to
            ``AutoTokenizer.from_pretrained`` and ``AutoModel.from_pretrained``.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        ``numpy.ndarray`` with one row per input text. For empty input an
        array of shape ``(0, hidden_size)`` is returned instead of raising.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Local import so this function does not depend on the deprecated
    # `tqdm.tqdm_notebook` alias (it emits a TqdmDeprecationWarning when called).
    from tqdm.auto import tqdm

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Normalise the input to a list so batching always slices whole texts.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Load pre-trained tokenizer and model, and move the model to GPU if present.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # np.vstack rejects an empty list, so answer empty input explicitly.
    if not texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=256)
        # Inputs must live on the same device as the model.
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # Keep only the position-0 vector of each sequence and bring it back to host memory.
        all_embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    # Stack the per-batch arrays into a single (num_texts, hidden_size) array.
    return np.vstack(all_embeddings)

# Example usage
import pandas as pd

# Load the dataset from a CSV file in the working directory and pull the
# `modeling_text` column out as a plain Python list.
text = pd.read_csv('test_10000.csv')
texts = text['modeling_text'].values.tolist()

# Generate one embedding row per text, 32 texts per forward pass.
embeddings = get_features(texts, batch_size=32)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(0, len(texts), batch_size)):


  0%|          | 0/313 [00:00<?, ?it/s]

In [1]:
from transformers import BertModel, BertTokenizer
import torch
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer

def get_features(text, model_name='princeton-nlp/sup-simcse-bert-base-uncased'):
    """Embed a string (or list of strings) in a single forward pass.

    The input is tokenized (padded, truncated to 256 tokens) and run through
    the model without gradient tracking. The last-layer hidden state of the
    [CLS] token (position 0) of each sequence is returned as its embedding.

    Args:
        text: A string or list of strings accepted by the tokenizer.
        model_name: Checkpoint identifier passed to
            ``AutoTokenizer.from_pretrained`` and ``AutoModel.from_pretrained``.
            The argument used to be ignored in favour of a hard-coded SimCSE
            checkpoint; it is now honoured, and its default is that same
            SimCSE checkpoint so existing calls produce the same embeddings.

    Returns:
        ``numpy.ndarray`` with one row per input sequence.
    """
    # Load pre-trained tokenizer and model, and move the model to GPU if present.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Encode text to get token ids and attention masks.
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=256)

    # Inputs must live on the same device as the model.
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Forward pass without gradient tracking (inference only).
    with torch.no_grad():
        outputs = model(**inputs)

    # Keep the [CLS] (position 0) vector of each sequence as a host-side array.
    features = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    return features



In [2]:
import pandas as pd
# Read the CSV (path is relative to the working directory) into a DataFrame.
text = pd.read_csv('test_10000.csv')

In [3]:
# Extract the `modeling_text` column as a plain Python list.
texts = text['modeling_text'].values.tolist()

In [5]:
# Count the entries in `texts`.
len(texts)

10000

In [2]:
# Show the dimensions of `embeddings`.
embeddings.shape

(10000, 768)

In [3]:
import numpy as np

In [4]:
# Convert to an ndarray and drop every length-1 axis.
# NOTE(review): with a single embedding row, squeeze() would also drop the row
# axis and the later `embeddings.shape[1]` lookup would fail — confirm intended.
embeddings_array = np.array(embeddings).squeeze()

In [5]:
# Rebind `embeddings` to the squeezed array; the index-building cell reads this name.
embeddings = embeddings_array

In [6]:
import faiss
import numpy as np

# Build a FAISS IndexFlatL2 sized to the embedding width, then add every
# embedding row to it after casting to float32.
dimension = embeddings.shape[1]  # Dimension of the vectors
index = faiss.IndexFlatL2(dimension)  # L2 distance for similarity
index.add(np.array(embeddings).astype('float32')) 

In [7]:
# Preview only the first few texts: echoing the whole list floods the notebook
# output and stores every raw post in the saved file.
texts[:5]

['how do i dream when each night i defend my right to live? #ferguson',
 "time to demilitarize portland police. civil rights orgs agree--we shouldn't have these weapons on our streets. url: portlandmercury.com",
 '"i\'m a proud #nra member. and like countless other nra members, i\'m a veteran. for generations, nra members like me have stood up to terrorist organizations, nazis, fascists and evil dictators who threatened the well-being of the innocent." —veteran green beret jeff houston ',
 'rioters and looters burn down many cities, injure &amp;  kill innocent americans but the dems and msm say it’s okay and fear trump will blame them for spikes in coronavirus. trump plans rallies &amp; the same people call it dangerous. the world is upside down #voteredtosaveamerica2020',
 'on numerous occasions, i\'ve heard christian leaders talk about how "people are changing the definition of racism" that\'s "different from what we grew up with." the "new definition," they argue, comes from critica

In [10]:
query = "racism"  # Example query

# Embed the query and shape it as the (1, dim) float32 matrix FAISS expects.
query_vec = np.asarray(get_features(query), dtype='float32').reshape(1, -1)

# Perform the search
k = 100  # Number of nearest neighbors to find
# D holds the distances and I the row indices of the neighbours, one row per query.
# For IndexFlatL2 the reported distances are squared L2 distances.
D, I = index.search(query_vec, k)

print("Query Results:")
for i, (distance, idx) in enumerate(zip(D[0], I[0])):
    # FAISS reports a missing neighbour (fewer than k vectors indexed) as
    # label -1; skip it rather than silently printing texts[-1].
    if idx < 0:
        continue
    print(f"{i + 1}: {texts[idx]} (Distance: {distance})")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(0, len(texts), batch_size)):


  0%|          | 0/1 [00:00<?, ?it/s]

Query Results:
1: racism 
is
predictable (Distance: 26.24663734436035)
2: they’re racist. 

(open pic for uncropped)  (Distance: 29.85943031311035)
3: racism isn't exclusive to the right (Distance: 33.56657791137695)
4: being white is racist. (Distance: 33.83761215209961)
5: offended??

good, get bent!

if you think posting about black pride, asian pride, hispanic pride or any other race is ok, but somehow this is racist, you can gfy!

you are the true racist, by excluding whites!

#easterneuropean

#whiteandproud

#democratsarestupid

#fdemocrats  (Distance: 34.921932220458984)
6: dear white people, 

black people cannot be racist. prejudice yes but not racist. racism describes a system of disadvantage based on race.  (Distance: 36.48942565917969)
7: #ifiwaswhite i would say "white privilege" isn't real &amp; "reverse racism" is real. yes there people are real...😐😐😐  (Distance: 36.5029411315918)
8: i hear a lot this idea "everyone is racist." if true, wouldn't that mean racism isn't t