In [1]:
# installation of packages 
!pip install tiktoken
!pip install openai



In [3]:
from transformers import BertModel, BertTokenizer
import torch
import torch.nn as nn
import os
import pandas as pd
import numpy as np
import tiktoken

In [5]:
df = pd.read_csv("data/data_wide.csv", index_col=0)
df.head(3)

Unnamed: 0,seqn,gender,age,race,education,marital_status,pir,bmi,time1,time2,...,time2007,time2008,time2009,time2010,time2011,time2012,time2013,time2014,time2015,time2016
1,21009,1,55,3,3,1,3.79,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,21010,2,52,3,4,6,1.24,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,21012,1,63,4,3,6,0.89,0,1,0,...,1,1,0,0,0,1,1,0,0,0


In [13]:
time_columns = [col for col in df.columns if col.startswith('time')]

df[time_columns] = df[time_columns].astype(str)

def combine(row):
    combined_values = ' '.join(row[col] for col in time_columns)
    return combined_values

df['combined'] = df.apply(combine, axis=1)

embedding_encoding = "cl100k_base"
encoding = tiktoken.get_encoding(embedding_encoding)
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))

In [15]:
df = df.drop(columns=[col for col in df.columns if col.startswith('time')])
df.to_csv("data/data_wide_embedding_initial.csv")

In [17]:
df = pd.read_csv("data/data_wide_embedding_initial.csv", index_col=0)
df.head(2)

Unnamed: 0,seqn,gender,age,race,education,marital_status,pir,bmi,combined,n_tokens
1,21009,1,55,3,3,1,3.79,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,4031
2,21010,2,52,3,4,6,1.24,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,4031


In [21]:
################### GPT ##############################

# model: text-embedding-3-small
# default embedding dimension: 1536

from openai import OpenAI
api_key = ''
client = OpenAI(api_key=api_key)

def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[ 0].embedding

embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
max_tokens = 8191  # the maximum for text-embedding-3-small is 8191

# default dimension

df_gpt=df.copy()
df_gpt["embedding"] = df_gpt.combined.apply(lambda x: get_embedding(x, model=embedding_model))
df_gpt = df_gpt[[col for col in df.columns if not col.startswith('n_tokens') and not col.startswith('Time')]]
df_gpt.to_csv("data/data_wide_embedding_gpt1536.csv")
df_gpt.head(2)

# reduced dimension

# Normalize embeddings using L2 normalization
def normalize_l2(x):
    x = np.array(x)
    if x.ndim == 1:
        norm = np.linalg.norm(x)
        if norm == 0:
            return x
        return x / norm
    else:
        norm = np.linalg.norm(x, 2, axis=1, keepdims=True)
        return np.where(norm == 0, x, x / norm)

def get_embeddings(texts, model="text-embedding-3-small"):
    embeddings = []
    for text in texts:
        response = client.embeddings.create(
            model=model, input=text, encoding_format="float"
        )
        embeddings.append(response.data[0].embedding[:50])
    return np.array(embeddings)

def batch_process(texts, batch_size=10):
    n = len(texts)
    for i in range(0, n, batch_size):
        yield texts[i:i+batch_size]

SEQN = df['seqn'].tolist()
batched_texts = list(batch_process(df['combined']))
all_embeddings = []
all_SEQN = []

for batch in batched_texts:
    embeddings = get_embeddings(batch)
    all_embeddings.extend(embeddings)
    start_index = len(all_SEQN)
    all_SEQN.extend(SEQN[start_index:start_index + len(batch)])
    
# Convert all embeddings to numpy array and normalize
all_embeddings = np.array(all_embeddings)
norm_dim = normalize_l2(all_embeddings)

# Create a dataframe from the normalized embeddings
df_embeddings = pd.DataFrame(norm_dim)
df_embeddings['seqn'] = all_SEQN

# Reorder columns to have SEQN as the first column
cols = df_embeddings.columns.tolist()
cols = cols[-1:] + cols[:-1]
df_embeddings = df_embeddings[cols]

df_filtered = df.loc[:, ~df.columns.str.startswith('time')]

# merge two dataset
df_new = pd.merge(df_filtered, df_embeddings, on='seqn', how='left')

df_new.to_csv('data/data_wide_embedding_gpt50.csv', index=False)
df_new.head(2)

APIConnectionError: Connection error.

In [16]:
# GPT embedding modified 

from openai import OpenAI
api_key = ''
client = OpenAI(api_key=api_key)

def get_embedding(text, model="text-embedding-3-small"):
    if not isinstance(text, str):  # Handle non-string inputs
        return None
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding
    
#def get_embedding(text, model="text-embedding-3-small"):
   #text = text.replace("\n", " ")
   #return client.embeddings.create(input = [text], model=model).data[ 0].embedding

embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
max_tokens = 8191  # the maximum for text-embedding-3-small is 8191

# default dimension

df_gpt=df.copy()
df_gpt["embedding"] = df_gpt.combined.apply(lambda x: get_embedding(x, model=embedding_model))
df_gpt = df_gpt[[col for col in df.columns if not col.startswith('n_tokens') and not col.startswith('Time')]]
df_gpt.to_csv("data/data_wide_embedding_gpt1536.csv")
df_gpt.head(2)

# reduced dimension

# Normalize embeddings using L2 normalization
def normalize_l2(x):
    x = np.array(x)
    if x.ndim == 1:
        norm = np.linalg.norm(x)
        if norm == 0:
            return x
        return x / norm
    else:
        norm = np.linalg.norm(x, 2, axis=1, keepdims=True)
        return np.where(norm == 0, x, x / norm)

def get_embeddings(texts, model="text-embedding-3-small"):
    embeddings = []
    for text in texts:
        response = client.embeddings.create(
            model=model, input=text, encoding_format="float"
        )
        embeddings.append(response.data[0].embedding[:50])
    return np.array(embeddings)

def batch_process(texts, batch_size=10):
    n = len(texts)
    for i in range(0, n, batch_size):
        yield texts[i:i+batch_size]

SEQN = df['seqn'].tolist()
batched_texts = list(batch_process(df['combined']))
all_embeddings = []
all_SEQN = []

for batch in batched_texts:
    embeddings = get_embeddings(batch)
    all_embeddings.extend(embeddings)
    start_index = len(all_seqn)
    all_SEQN.extend(seqn[start_index:start_index + len(batch)])
    
# Convert all embeddings to numpy array and normalize
all_embeddings = np.array(all_embeddings)
norm_dim = normalize_l2(all_embeddings)

# Create a dataframe from the normalized embeddings
df_embeddings = pd.DataFrame(norm_dim)
df_embeddings['seqn'] = all_SEQN

# Reorder columns to have SEQN as the first column
cols = df_embeddings.columns.tolist()
cols = cols[-1:] + cols[:-1]
df_embeddings = df_embeddings[cols]

df_filtered = df.loc[:, ~df.columns.str.startswith('time')]

# merge two dataset
df_new = pd.merge(df_filtered, df_embeddings, on='seqn', how='left')

df_new.to_csv('data/data_wide_embedding_gpt50.csv', index=False)
df_new.head(2)

ValueError: Out of range float values are not JSON compliant

In [23]:
################### BERT ##############################

# model: bert-base-uncased
# default embedding dimension: 768

from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)


# default dimension

# Define a function to get BERT embeddings

def get_bert_embeddings(text):
    # Convert the text to the model input format
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    
    # Disable gradient calculation
    with torch.no_grad():
        # Get the model outputs
        outputs = model(**inputs)
    
    # Get the CLS token embeddings
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, 768)
    return cls_embedding.squeeze().detach().numpy()  # Shape: (768,)

df_bert=df.copy()
df_bert["embedding"] = df_bert['combined'].apply(get_bert_embeddings)

columns_to_drop = [col for col in df_bert.columns if col.startswith('n_tokens') or col.startswith('time')]
df_bert = df_bert.drop(columns=columns_to_drop)

df_bert.to_csv("data/data_wide_embedding_bert768.csv")
df_bert.head(2)

# reduce dimension


def reduce_dimension(embedding, dim=50):
    return embedding[:, :dim]

# Normalize embeddings using L2 normalization
def normalize_l2(x):
    x = np.array(x)
    if x.ndim == 1:
        norm = np.linalg.norm(x)
        if norm == 0:
            return x
        return x / norm
    else:
        norm = np.linalg.norm(x, 2, axis=1, keepdims=True)
        return np.where(norm == 0, x, x / norm)


# Convert texts to embedding and reduce the dimension
def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (1, 768)
    reduced_embedding = reduce_dimension(cls_embedding)  # Shape: (1, 50)
    return reduced_embedding.squeeze().detach().numpy()  # Shape: (50,)


# Process texts in batches
def batch_process(texts, batch_size=10):
    n = len(texts)
    for i in range(0, n, batch_size):
        yield texts[i:i+batch_size]

SEQN = df['seqn'].tolist()
batched_texts = list(batch_process(df['combined']))
all_embeddings = []
all_SEQN = []


for batch in batched_texts:
    batch_embeddings = []
    for text in batch:
        embedding = get_bert_embeddings(text)
        batch_embeddings.append(embedding)
    all_embeddings.extend(batch_embeddings)
    start_index = len(all_SEQN)
    all_SEQN.extend(SEQN[start_index:start_index + len(batch)])


# Convert all embeddings to numpy array and normalize
all_embeddings = np.array(all_embeddings)
norm_dim = normalize_l2(all_embeddings)

# Create a dataframe from the normalized embeddings
df_embeddings = pd.DataFrame(norm_dim)
df_embeddings['seqn'] = all_SEQN

# Reorder columns to have SEQN as the first column
cols = df_embeddings.columns.tolist()
cols = cols[-1:] + cols[:-1]
df_embeddings = df_embeddings[cols]

df_filtered = df.loc[:, ~df.columns.str.startswith('time')]

# merge two dataset
df_bert = pd.merge(df_filtered, df_embeddings, on='seqn', how='left')

df_bert.to_csv("data/data_wide_embedding_bert50.csv")
df_bert.head(2)

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Unnamed: 0,seqn,gender,age,race,education,marital_status,pir,bmi,combined,n_tokens,...,40,41,42,43,44,45,46,47,48,49
0,21009,1,55,3,3,1,3.79,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,4031,...,0.081482,0.136982,0.093444,-0.129943,0.029425,-0.0311,-0.691505,0.07768,0.048244,0.017918
1,21010,2,52,3,4,6,1.24,1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,4031,...,0.077118,0.127334,0.082427,-0.124168,0.030411,-0.032818,-0.699059,0.082167,0.031773,0.031621


In [None]:
################### Cohere ##############################

import time
import cohere
co = cohere.Client("") 


#####################
# split into two dataframe and run them separetly if needed

# num_rows = len(df)
# print(f"Number of rows: {num_rows}")

# split_point = num_rows // 2

# df1 = df.iloc[:split_point]
# df2 = df.iloc[split_point:]
#######################


# model: embed-english-v3.0
# default embedding dimension: 1024

# default dimension

def get_embeddings(texts, model='embed-english-v3.0', input_type="search_document"):
    output = co.embed(
        model=model,
        input_type=input_type,
        texts=texts)
    return output.embeddings

# Create a new DataFrame to store embeddings
df_cohere = df.drop(columns=['combined']).copy()
df_cohere['embedding'] = get_embeddings(df['combined'].tolist())

df_cohere.to_csv("E:/EntroLLM/data_wide_embedding_cohere1024.csv")


# reduce dimension

def reduce_dimension(embedding, dim=50):
    return embedding[:, :dim]


# Normalize embeddings using L2 normalization
def normalize_l2(x):
    x = np.array(x)
    if x.ndim == 1:
        norm = np.linalg.norm(x)
        if norm == 0:
            return x
        return x / norm
    else:
        norm = np.linalg.norm(x, 2, axis=1, keepdims=True)
        return np.where(norm == 0, x, x / norm)

# Get embeddings for given texts
def get_embeddings(texts, model='embed-english-v3.0', input_type="search_document",delay=0):
    output = co.embed(
        model=model,
        input_type=input_type,
        texts=texts)
    time.sleep(delay)
    reduced_embedding = reduce_dimension(np.array(output.embeddings))  # Shape: (batch_size, dim)
    return reduced_embedding

# Process texts in batches
def batch_process(texts, batch_size=10):
    n = len(texts)
    for i in range(0, n, batch_size):
        yield texts[i:i+batch_size]

# Assuming df is your dataframe
SEQN = df['SEQN'].tolist()
batched_texts = list(batch_process(df['combined'].tolist()))
all_embeddings = []
all_SEQN = []

for batch in batched_texts:
    embeddings = get_embeddings(batch)
    all_embeddings.append(embeddings)
    start_index = len(all_SEQN)
    all_SEQN.extend(SEQN[start_index:start_index + len(batch)])

# Convert all embeddings to numpy array and normalize
all_embeddings = np.vstack(all_embeddings)  # Stack embeddings vertically
norm_dim = normalize_l2(all_embeddings)  

# Create a dataframe from the normalized embeddings
df_embeddings = pd.DataFrame(norm_dim)
df_embeddings['SEQN'] = all_SEQN

# Reorder columns to have SEQN as the first column
cols = df_embeddings.columns.tolist()
cols = cols[-1:] + cols[:-1]
df_embeddings = df_embeddings[cols]

df_filtered = df.loc[:, ~df.columns.str.startswith('Time')]

# Merge two datasets
df_cohere = pd.merge(df_filtered, df_embeddings, on='SEQN', how='left')

df_cohere.to_csv("E:/EntroLLM/data_wide_embedding_cohere50.csv")
df_cohere.head(2)