In [44]:
import csv
import json
import openai
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge  
from sklearn.metrics import r2_score, make_scorer
import tiktoken
from openai.embeddings_utils import get_embedding
import os
from sklearn.model_selection import KFold, cross_val_predict
# Take the API key from the environment so it is never written into the notebook.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # This is the encoding for text-embedding-ada-002
max_tokens = 8000  # The maximum for text-embedding-ada-002 is 8191
input_datapath = "IntroBigFiveessays_anon/essays_anon_full.csv"
embeddings_output_path = "embeddings.csv"
results_output_path = "output_results.csv"

# Read data from CSV file and drop every row that contains a missing value.
df = pd.read_csv(input_datapath).dropna()

# Initialize encoding
encoding = tiktoken.get_encoding(embedding_encoding)

# Count the tokens of each text. Nothing is truncated here: texts that are
# too long are removed by the filter below. `assign` returns a fresh frame,
# so the new column is not written into a slice of the frame read above.
df = df.assign(n_tokens=df["text"].apply(lambda text: len(encoding.encode(text))))

# Omit rows where text is too long to embed. The explicit .copy() makes the
# filtered frame independent of its parent, so adding the "embedding" column
# below does not raise pandas' SettingWithCopyWarning.
df = df[df["n_tokens"] <= max_tokens].copy()

# Get embeddings: get_embedding is called once for every remaining row.
df["embedding"] = df["text"].apply(lambda text: get_embedding(text, engine=embedding_model))

# # Tokenize and truncate the text to keep only the first `max_tokens` tokens
# def tokenize_and_truncate(text, encoding, max_tokens=8000):
#     encoded_text = encoding.encode(text)
#     truncated_tokens = encoded_text[:max_tokens]
#     truncated_text = ','.join(str(token) for token in truncated_tokens)
#     return truncated_text

# # Truncate the text to the maximum number of tokens
# df["truncated_tokenized_text"] = df["text"].apply(lambda x: tokenize_and_truncate(x, encoding, max_tokens))

# # Calculate the number of tokens in the truncated text
# df["n_tokens"] = df["truncated_tokenized_text"].apply(lambda x: len(x.split(',')))

# # Get embeddings for the truncated text
# df["embedding"] = df["truncated_tokenized_text"].apply(lambda x: get_embedding(x, engine=embedding_model))

# Save embeddings, AUTHID, text, and n_tokens to a separate CSV file
# embeddings_df = df[['#AUTHID', 'text','truncated_tokenized_text', 'n_tokens', 'embedding']]

# Persist the author ids together with the embeddings as NumPy arrays.
# Each entry of the 'embedding' column becomes one row of the stacked matrix.
embeddings = np.vstack(df['embedding'].tolist())
auth_ids = df['#AUTHID'].to_numpy()

# Both arrays are written to one .npz archive under the keys
# 'auth_ids' and 'embeddings'.
np.savez('embeddings.npz', auth_ids=auth_ids, embeddings=embeddings)


# # load the embeddings (np.savez stored them under the keyword 'embeddings')
# with np.load('embeddings.npz') as data:
#     embeddings = data['embeddings']

# Regression inputs: X stacks one embedding per row of df, y holds the five
# trait columns in the order zEXT, zNEU, zAGR, zCON, zOPN.
X = np.vstack([np.array(vector) for vector in df["embedding"]])
y = df[['zEXT', 'zNEU', 'zAGR', 'zCON', 'zOPN']].to_numpy()

# Ridge regression model (default settings), evaluated with a shuffled
# 5-fold split whose seed is fixed so the folds are the same on every run.
reg_model = Ridge()
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-trait results, keyed by the name of the DataFrame column they become.
cv_r2_scores = {}
cv_predictions = {}

# Walk the trait names together with the matching column of y.
for trait_name, target in zip(['zEXT', 'zNEU', 'zAGR', 'zCON', 'zOPN'], y.T):
    # Out-of-fold prediction for every row, then R^2 over all rows at once.
    fold_predictions = cross_val_predict(reg_model, X, target, cv=cv)
    cv_predictions[f'cv_pred_{trait_name}'] = fold_predictions
    cv_r2_scores[f'cv_r2_score_{trait_name}'] = r2_score(target, fold_predictions)

# Attach the results to the DataFrame: first the R^2 columns (one scalar
# repeated on every row), then the per-row prediction columns.
for column_name, column_values in {**cv_r2_scores, **cv_predictions}.items():
    df[column_name] = column_values

# Save results to CSV file.
# Column layout: identifiers and token count, the observed trait scores, the
# per-trait cross-validated R^2 columns, then the cross-validated predictions.
trait_columns = ['zEXT', 'zNEU', 'zAGR', 'zCON', 'zOPN']
results_df = df[
    ['#AUTHID', 'text', 'n_tokens']
    + trait_columns
    + [f'cv_r2_score_{trait}' for trait in trait_columns]
    + [f'cv_pred_{trait}' for trait in trait_columns]
]
# `lineterminator` is the current spelling of the keyword; the older
# `line_terminator` was deprecated in pandas 1.5 and removed in pandas 2.0.
results_df.to_csv(results_output_path, index=False, quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n')

# Save the embeddings together with the text they were computed from.
embeddings_df = df[['#AUTHID', 'text', 'n_tokens', 'embedding']]
embeddings_df.to_csv(embeddings_output_path, index=False, quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n')


In [53]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge  
from sklearn.model_selection import train_test_split


# Load the author ids and the embedding matrix from the NPZ archive.
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code - only load archives produced by a trusted source.
# Presumably needed because the ids are stored as an object array; confirm.
with np.load('embeddings.npz', allow_pickle=True) as archive:
    auth_ids = archive['auth_ids']
    embeddings = archive['embeddings']

# Turn every row of the embedding matrix into a plain Python list so that it
# can be stored as a single cell of a DataFrame column.
embeddings = list(map(lambda row: row.tolist(), embeddings))

# One row per author id, with its embedding alongside.
df_embeddings = pd.DataFrame({'#AUTHID': auth_ids, 'embedding': embeddings})

# Read the essay data and drop every row that contains a missing value.
input_datapath = "IntroBigFiveessays_anon/essays_anon_full.csv"
df = pd.read_csv(input_datapath).dropna()

# Join essays and embeddings on '#AUTHID' (default inner join, so only ids
# present in both frames are kept).
merged_df = df.merge(df_embeddings, on='#AUTHID')

# Feature matrix (one stacked embedding per merged row) and trait targets.
X = np.vstack([np.array(vector) for vector in merged_df["embedding"]])
y = merged_df[['zEXT', 'zNEU', 'zAGR', 'zCON', 'zOPN']].to_numpy()

# Ridge regression fitted on every row of X. The predictions are made for
# those same rows, so the scores printed below are in-sample scores.
model = Ridge()
model.fit(X, y)
full_dataset_predictions = model.predict(X)

# Report R^2 per trait by pairing each column of y with the matching column
# of the predictions.
print("R^2 Scores on Full Dataset:")
for trait_name, observed, fitted in zip(
    ['zEXT', 'zNEU', 'zAGR', 'zCON', 'zOPN'], y.T, full_dataset_predictions.T
):
    print(f'{trait_name}: {r2_score(observed, fitted)}')

def _features_and_targets(frame):
    """Return the stacked embedding matrix and the trait matrix for `frame`.

    The first array has one row per row of `frame`, built from its
    'embedding' column; the second holds the columns zEXT, zNEU, zAGR,
    zCON and zOPN in that order.
    """
    features = np.vstack([np.array(vector) for vector in frame["embedding"]])
    targets = frame[['zEXT', 'zNEU', 'zAGR', 'zCON', 'zOPN']].to_numpy()
    return features, targets


# 80-20 train/test split of the merged rows; the seed is fixed so the same
# rows land in each part on every run.
train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=42)

# Embeddings and personality traits for each part of the split.
X_train, y_train = _features_and_targets(train_df)
X_test, y_test = _features_and_targets(test_df)

# A fresh Ridge model that only ever sees the training rows.
model_split = Ridge()
model_split.fit(X_train, y_train)

# Predictions for the held-out rows.
predictions_split = model_split.predict(X_test)

# Report R^2 per trait on the held-out rows.
print("R^2 Scores on Test Data (80-20 split):")
for trait_name, observed, predicted in zip(
    ['zEXT', 'zNEU', 'zAGR', 'zCON', 'zOPN'], y_test.T, predictions_split.T
):
    print(f'{trait_name}: {r2_score(observed, predicted)}')

R^2 Scores on Full Dataset:
zEXT: 0.20286945750828544
zNEU: 0.19291191932268892
zAGR: 0.16807171266611753
zCON: 0.1842493948764451
zOPN: 0.26487599614806734
R^2 Scores on Test Data (80-20 split):
zEXT: 0.0850096978541458
zNEU: 0.02329294286956285
zAGR: 0.02776647955763334
zCON: 0.028616450998607768
zOPN: 0.14848491251035767
