In [121]:
import pyro
import torch
import json
import pickle
import os 
from prodslda_cls import ProdSLDA

from sklearn.feature_extraction.text import CountVectorizer

MODEL_PATH = '/burg/nlp/users/zfh2000/style_results/pos_bigrams/2023-12-14_17_54_45/model_epoch5_20914.218841552734.pt'
DATA_DIR_PATH = '/burg/nlp/users/zfh2000/style_results/pos_bigrams/maxdf0.5_mindf5_DATA'

with open(os.path.join(DATA_DIR_PATH, 'bows.pickle'), 'rb') as in_file:
    bows = pickle.load(in_file)
        
with open(os.path.join(DATA_DIR_PATH, 'meta_vectorized.pickle'), 'rb') as in_file:
    meta_vectorized = pickle.load(in_file)    

with open(os.path.join(DATA_DIR_PATH, "raw_text.json"), 'r') as in_file:
    raw_text = json.load(in_file)    

with open(os.path.join(DATA_DIR_PATH, "authors_json.json"), 'r') as in_file:
    authors_json = json.load(in_file)    

with open(os.path.join(DATA_DIR_PATH, "meta_feature_to_names.json"), 'r') as in_file:
    meta_feature_to_names = json.load(in_file)

with open(os.path.join(DATA_DIR_PATH, "vectorizer.pickle"), 'rb') as in_file:
    vectorizer = pickle.load(in_file)

In [122]:
pyro.clear_param_store()

prodsdla = torch.load(MODEL_PATH)
prodsdla.eval()

ProdSLDA(
  (encoder): GeneralEncoder(
    (drop): Dropout(p=0, inplace=False)
    (fc1s): ModuleDict(
      (doc): Linear(in_features=9267, out_features=64, bias=True)
    )
    (fc2): Linear(in_features=64, out_features=64, bias=True)
    (fcmu): Linear(in_features=64, out_features=10, bias=True)
    (fclv): Linear(in_features=64, out_features=10, bias=True)
    (bnmu): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
    (bnlv): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  )
  (decoder): Decoder(
    (beta): Linear(in_features=10, out_features=9267, bias=False)
    (bn): BatchNorm1d(9267, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
    (drop): Dropout(p=0, inplace=False)
  )
  (style_encoder): GeneralEncoder(
    (drop): Dropout(p=0, inplace=False)
    (fc1s): ModuleDict(
      (pos_bigrams): Linear(in_features=324, out_features=64, bias=True)
    )
    (fc2): Linear(in_features=64, out_featur

In [123]:
def top_beta_document(model, vectorizer, top_k=20):
    betas_document = model.beta_document()
    features_to_betas = {}
    idx_to_name = {v:k for k,v in vectorizer.vocabulary_.items()}
    for feature, logits in betas_document.items():
        features_to_betas[feature] = []
        num_features = logits.shape[0]
        top_results = torch.topk(logits, top_k, dim=-1)
        
        ids = top_results.indices.cpu().numpy()
        values = top_results.values.cpu().numpy()
        
        for i in tqdm(range(num_features)):
            features_to_betas[feature].append({'values':values[i], 'top':[idx_to_name[idx] for idx in ids[i]]})
                
    return features_to_betas

def top_beta_meta(model, meta_feature_to_names, top_k=20):
    betas_metas = model.beta_meta()
    features_to_betas = {}
    for feature, logits in betas_metas.items():
        idx_to_name = {i:k for i,k in enumerate(meta_feature_to_names[feature])}
        features_to_betas[feature] = []
        num_features = logits.shape[0]
        top_results = torch.topk(logits, top_k, dim=-1)
        ids = top_results.indices.cpu().numpy()
        values = top_results.values.cpu().numpy()
        for i in tqdm(range(num_features)):
            features_to_betas[feature].append({'values':values[i], 'top':[idx_to_name[idx] for idx in ids[i]]})
        
    return features_to_betas 


In [124]:
top_words_per_latent = top_beta_document(prodsdla, vectorizer,  top_k=20)
top_meta_per_latent = top_beta_meta(prodsdla, meta_feature_to_names, top_k=20)

print('Document Term Info')
for latent, top in top_words_per_latent.items():
    print(f'\t{latent} ({len(top)}):')
    for i, results in enumerate(top):
        print(f'\t\t {latent} ({i}):\n{results["top"]}')
        print()

print('Meta Var Info')
for latent, top in top_meta_per_latent.items():

    print(f'\t{latent} ({len(top)}):')
    for i, results in enumerate(top):
        print(f'\t\t {latent} ({i}):\n{results["top"]}')
        print()


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 3308.85it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 36663.50it/s]

Document Term Info
	beta_topic (10):
		 beta_topic (0):
['fine', 'hope', 'sally', '28', '09', 'stuff', '08', 'hourahead', 'night', 'attached', 'jim', 'tickets', 'energy', 'making', 'presentation', 'kitchen', 'manual', '713', 'linda', 'original']

		 beta_topic (1):
['silva', 'geae', '962', '7566', 'eb3892', 'manis', 'x33278', '1575', 'freyre', 'bambos', 'ccampbell', 'giuseppe', 'vande', 'kupiecki', '4727', 'noncore', 'kayne', 'mckinsey', 'nassos', 'centilli']

		 beta_topic (2):
['fax', '713', 'john', 'market', 'bcc', 'help', 'yes', 'needs', 'hey', 'request', 'credit', 'questions', 'deals', 'address', 'updates', 'message', 'email', '646', 'management', 'www']

		 beta_topic (3):
['print', 'attachment', 'enron', 'report', 'asked', '07', 'format', 'sppc', 'wrong', 'job', 'waiting', 'kate', 'information', 'language', 'retain', 'time', 'sorry', 'attached', '00', 'basis']

		 beta_topic (4):
['fyi', 'need', 'just', 'meeting', 'sent', 'thanks', 'ferc', '2000', 'mark', 'revised', 'going', 'dr




In [118]:
bows['training'].shape

(66668, 9267)

In [127]:
import torch.nn.functional as F
DEVICE='cuda'
meta_vectorized['training'] = {k:v.toarray() for k,v in meta_vectorized['training'].items()}

author_to_result = {}

bows['training'] = bows['training'].toarray()

with torch.no_grad():

    for i in tqdm(range(len(raw_text['training']))):
        text = raw_text['training'][i]
        author = authors_json['training'][i]
        b = torch.tensor(torch.tensor(bows['training'][i]).unsqueeze(0).float().to(DEVICE))
        m = {k:torch.tensor(v[i]).unsqueeze(0).float().to(DEVICE) for k,v in meta_vectorized['training'].items()}
        # print(b)
        # print(m)

        theta, kappa =  prodsdla.guide(b, m)

        theta = F.softmax(theta,-1).detach().cpu()
        kappa = F.softmax(kappa,-1).detach().cpu()
        if author not in author_to_result:
            author_to_result[author] = []
            
        author_to_result[author].append((text, theta, kappa))


  b = torch.tensor(torch.tensor(bows['training'][i]).unsqueeze(0).float().to(DEVICE))
100%|█████████████████████████████████████████████████████████████████████████████████████████████| 66668/66668 [03:37<00:00, 306.93it/s]


In [None]:
author_to_result

In [None]:
author_list = sorted(author_to_result.keys())

In [None]:
num_topics = len(author_to_result[author_list[0]][0][1][0])
num_styles = len(author_to_result[author_list[0]][0][2][0])

topics_to_top_examples = {i:[] for i in range(num_topics)}
style_to_top_examples = {i:[] for i in range(num_styles)}

for a in tqdm(author_to_result):
    for text, theta, kappa in author_to_result[a]:
        for i, x in enumerate(theta[0]):
            topics_to_top_examples[i].append((x, a,text))
        for i, x in enumerate(kappa[0]):
            style_to_top_examples[i].append((x, a,text))
            
            
topics_to_top_examples = {i: sorted(v) for i,v in topics_to_top_examples.items()}        
style_to_top_examples = {i: sorted(v) for i,v in style_to_top_examples.items()}  
    

In [None]:
for i, v in topics_to_top_examples.items():
    print("TOPIC",i)
    print(v[-5:])
    print()

for i, v in style_to_top_examples.items():
    print("STYLE",i)
    print(v[-5:])
    print()

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def compute_projection(x):
    pca = PCA(n_components=2)
    pca.fit(x)
    return pca.transform(x)

def plot_projection(x, y, authors):
    color = ['red', 'blue', 'green', 'purple', 'orange', 'yellow', 'black', 'pink', 'brown', 'gray']
    
    # is_gpt = [a.startswith('gpt3.5_') for a in authors]
    # authors_norm = [a.replace('gpt3.5_','') for a in authors]

    nongpt_x = []
    nongpt_y = []
    human_labels = []

    gpt_x = []
    gpt_y = []
    gpt_labels = []

    for i, a in enumerate(authors):
        if a.startswith('gpt3.5_'):
            gpt_x.append(x[i])
            gpt_y.append(y[i])
            gpt_labels.append(a)
        else:
            nongpt_x.append(x[i])
            nongpt_y.append(y[i])
            human_labels.append(a)

    plt.figure(figsize=(10,10))
    plt.scatter(nongpt_x, nongpt_y, color='red', label='human')
    plt.scatter(gpt_x, gpt_y, color='blue', label='synthetic')

    # for i, label in enumerate(human_labels):
        # plt.annotate(label, (nongpt_x[i], nongpt_y[i]))

    # TODO: match specific authors to specific colors, use different shapes for gpt vs human

    plt.show()

In [None]:
thetas = []
kappas = []
a_labels = []
texts = []

for a in tqdm(sorted(author_to_result.keys())):
    for text, theta, kappa in author_to_result[a]:
        kappas.append(kappa[0])
        thetas.append(theta[0])
        a_labels.append(a)
        texts.append(text)
       

In [None]:
kappa_proj = compute_projection(np.stack(kappas))
plot_projection(kappa_proj[:,0], kappa_proj[:,1], a_labels)

In [None]:
theta_proj = compute_projection(np.stack(thetas))
plot_projection(theta_proj[:,0], theta_proj[:,1], a_labels)

In [None]:
import math

def plot_projection_comparison(x, y, authors):
    color = ['pink', 'blue','black'] #['red', 'blue', 'green', 'purple', 'orange', 'yellow', 'black', 'pink', 'brown', 'gray']


    
    is_gpt = [a for a in authors if a.startswith('gpt3.5_')]
    authors_paired = [a for a in authors if 'gpt3.5_'+a in is_gpt]

    plt.figure(figsize=(10,10))

    for c, chosen_a in enumerate(sorted(set(authors_paired))[:len(color)]):
        nongpt_x = []
        nongpt_y = []
    
        gpt_x = []
        gpt_y = []

        for i, a in enumerate(authors):
            if chosen_a not in a: continue
            if a.startswith('gpt3.5_'):
                gpt_x.append(x[i])
                gpt_y.append(y[i])
            else:
                nongpt_x.append(x[i])
                nongpt_y.append(y[i])

        

                
        
   
        plt.scatter(nongpt_x, nongpt_y, color=color[c], label='human', marker='.')
        plt.scatter(gpt_x, gpt_y, color=color[c], label='synthetic', marker='x')

        for i in range(len(gpt_x)):
            distances = [(math.dist([gpt_x[i], gpt_y[i]], [nongpt_x[j], nongpt_y[j]]), ([nongpt_x[j], nongpt_y[j]])) for j in range(len(nongpt_x))]
            distances = sorted(distances)
            plt.plot([gpt_x[i], distances[0][1][0]], [gpt_y[i], distances[0][1][1]], color=color[c], linestyle='--', linewidth=0.5)
    plt.show()


In [None]:
plot_projection_comparison(kappa_proj[:,0], kappa_proj[:,1], a_labels)

In [None]:
plot_projection_comparison(theta_proj[:,0], theta_proj[:,1], a_labels)