In [1]:
import json
def load_data(filename):
    """Read a UTF-8 encoded JSON file and return the decoded object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's contents.
    """
    with open(filename, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

In [2]:
def get_feature_set(json_data):
    """Collect every distinct entry found under any record's 'attribute' key.

    Args:
        json_data: Mapping whose values are mappings with an 'attribute'
            iterable.

    Returns:
        A set holding each attribute once, however many records list it.
    """
    return {
        feature
        for info in json_data.values()
        for feature in info['attribute']
    }

# Load the attribute file and gather every attribute it lists into one set.
data = load_data('output/occupation_attribute_100.json')
feature_set = get_feature_set(data)


In [28]:
# Display the collected feature vocabulary.
feature_set

{'acceptability',
 'acting',
 'actor',
 'actress',
 'address',
 'air',
 'album',
 'alvin',
 'american',
 'andy',
 'animated',
 'anime',
 'announcer',
 'appearance',
 'appeared',
 'appointed',
 'april',
 'architect',
 'architecture',
 'army',
 'arranger',
 'art',
 'artist',
 'ascent',
 'athlete',
 'author',
 'automobile',
 'award',
 'ballet',
 'band',
 'baptist',
 'berkeley',
 'best',
 'better',
 'bible',
 'blue',
 'board',
 'book',
 'boyd',
 'british',
 'broadcast',
 'broadcaster',
 'broadcasting',
 'broadway',
 'brower',
 'building',
 'bulgarian',
 'business',
 'businessman',
 'businesswoman',
 'cabinet',
 'canada',
 'canadian',
 'cannes',
 'career',
 'carpenter',
 'cathedral',
 'centre',
 'ceo',
 'ceylon',
 'chairman',
 'champion',
 'championship',
 'channel',
 'character',
 'chart',
 'chief',
 'child',
 'china',
 'chinese',
 'christian',
 'church',
 'cinema',
 'city',
 'classical',
 'climb',
 'climbed',
 'climber',
 'climbing',
 'club',
 'coach',
 'college',
 'column',
 'columnist',

# Feature co-occurrence probabilities from Wikipedia summaries

In [29]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
from tqdm import tqdm
import numpy as np

def build_probability_matrix_from_list(data, feature_set):
    """Estimate ordered feature co-occurrence probabilities from a list of texts.

    Each text is lower-cased, tokenized, reduced to its alphabetic tokens and
    lemmatized; only lemmas that belong to ``feature_set`` are kept. Within one
    text, every distinct ordered pair ``(a, b)`` where ``a`` appears at an
    earlier position than ``b`` is counted once. ``a == b`` is counted when the
    feature appears at least twice in that text.

    Args:
        data: Iterable of strings to scan.
        feature_set: Collection of feature words. It is used for membership
            tests and is sorted to define the matrix axes.

    Returns:
        A tuple ``(feature_list, probability_matrix, feature_counts,
        co_occurrence_counts)`` where:

        * ``feature_list`` is ``sorted(feature_set)``.
        * ``probability_matrix[i, j]`` is
          ``co_occurrence_counts[f_i][f_j] / sum(co_occurrence_counts[f_i].values())``,
          or 0 for a feature that never leads a pair.
        * ``feature_counts[f]`` is incremented once per counted pair led by
          ``f``, so it equals the row total used as the denominator above
          (it is not the number of texts containing ``f``).
        * ``co_occurrence_counts`` is a nested ``defaultdict`` holding only the
          pairs that were actually observed.
    """
    lemmatizer = WordNetLemmatizer()
    feature_counts = Counter()
    co_occurrence_counts = defaultdict(lambda: defaultdict(int))

    # Count each distinct ordered feature pair at most once per text.
    for summary in tqdm(data, desc="Processing occupations"):
        words = word_tokenize(summary.lower())
        lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word.isalpha()]
        features_in_summary = [word for word in lemmatized_words if word in feature_set]

        in_this_summary = set()
        n_features = len(features_in_summary)
        for i in range(n_features):
            first = features_in_summary[i]
            for j in range(i + 1, n_features):
                pair = (first, features_in_summary[j])
                if pair not in in_this_summary:
                    in_this_summary.add(pair)
                    feature_counts[first] += 1
                    co_occurrence_counts[first][pair[1]] += 1

    feature_list = sorted(feature_set)
    matrix_size = len(feature_list)
    probability_matrix = np.zeros((matrix_size, matrix_size))

    for i, f1 in enumerate(feature_list):
        # Read with .get(): indexing the nested defaultdicts here would insert
        # a zero entry for every (f1, f2) combination into the returned
        # counts, which then bloats anything that serialises them.
        row_counts = co_occurrence_counts.get(f1)
        if not row_counts:
            continue
        total_co_occurrences = sum(row_counts.values())
        if total_co_occurrences > 0:
            for j, f2 in enumerate(feature_list):
                probability_matrix[i, j] = row_counts.get(f2, 0) / total_co_occurrences

    return feature_list, probability_matrix, feature_counts, co_occurrence_counts



In [32]:
# Alternative input that was tried: 'output/occupation_data_100.json'
json_file = 'output/occupation_summaries_200.json'
data = load_data(json_file)
wiki_summary_list = list(data.values())
(
    feature_list,
    prob_matrix,
    feature_counts,
    co_occurrence_counts,
) = build_probability_matrix_from_list(wiki_summary_list, feature_set)

Processing occupations: 100%|██████████| 105840/105840 [01:44<00:00, 1015.71it/s]


In [33]:
import numpy as np

def save_outputs(feature_list, probability_matrix, feature_counts, co_occurrence_counts, output_prefix):
    """Write the four analysis artefacts to files that share ``output_prefix``.

    Files produced:
        ``<prefix>_feature_list.json``          the feature list
        ``<prefix>_probability_matrix.npy``     the matrix, via ``np.save``
        ``<prefix>_feature_counts.json``        ``dict(feature_counts)``
        ``<prefix>_co_occurrence_counts.json``  nested counts as plain dicts

    Args:
        feature_list: JSON-serialisable list of features.
        probability_matrix: Array handed to ``np.save``.
        feature_counts: Mapping of feature to count.
        co_occurrence_counts: Mapping of feature to an inner mapping of counts.
        output_prefix: Path prefix shared by all four files.
    """
    def _write_json(path, payload):
        # Every JSON artefact uses the same 4-space indented layout.
        with open(path, 'w') as sink:
            json.dump(payload, sink, indent=4)

    _write_json(f'{output_prefix}_feature_list.json', feature_list)

    np.save(f'{output_prefix}_probability_matrix.npy', probability_matrix)

    _write_json(f'{output_prefix}_feature_counts.json', dict(feature_counts))

    # Inner mappings are converted to plain dicts before being dumped.
    _write_json(
        f'{output_prefix}_co_occurrence_counts.json',
        {key: dict(inner) for key, inner in co_occurrence_counts.items()},
    )

# Persist the Wikipedia-derived results under the 'output/wiki_allow_same' prefix.
save_outputs(feature_list, prob_matrix, feature_counts, co_occurrence_counts, 'output/wiki_allow_same')

In [58]:
def save_sorted_probabilities(feature_list, probability_matrix, feature_counts, co_occurrence_counts, output_file):
    """Write one report line per ordered feature pair, highest probability first.

    Each line has the form::

        ('f1', 'f2'): Probability=0.1234, Count of f1=N, Count of f1 followed by f2=M

    Args:
        feature_list: Features in the same order as the matrix axes.
        probability_matrix: Square matrix (NumPy array or nested sequences)
            indexed as ``probability_matrix[i][j]``.
        feature_counts: Mapping of feature to count; missing features are
            reported as 0.
        co_occurrence_counts: Mapping of feature to an inner mapping of counts;
            missing entries are reported as 0. The mapping is only read, so
            ``defaultdict`` inputs are not grown by this call and plain dicts
            work as well.
        output_file: Path of the UTF-8 text report to (over)write.
    """
    report_rows = []

    # enumerate() yields the matrix position of each feature directly.
    for f1_idx, f1 in enumerate(feature_list):
        f1_count = feature_counts.get(f1, 0)
        followers = co_occurrence_counts.get(f1, {})
        for f2_idx, f2 in enumerate(feature_list):
            prob = probability_matrix[f1_idx][f2_idx]
            report_rows.append(((f1, f2), prob, f1_count, followers.get(f2, 0)))

    # Stable descending sort: ties keep their feature_list order.
    report_rows.sort(key=lambda row: row[1], reverse=True)

    # Feature names are written verbatim, so pin the encoding instead of
    # relying on the platform default.
    with open(output_file, 'w', encoding='utf-8') as file:
        for (f1, f2), prob, f1_count, co_occurrence in report_rows:
            file.write(
                f"{(f1, f2)}: Probability={prob:.4f}, Count of {f1}={f1_count}, "
                f"Count of {f1} followed by {f2}={co_occurrence}\n"
            )


In [59]:
# Write the Wikipedia pair report, ordered from highest to lowest probability.
output_file = 'wiki_allow_same_sorted_probabilities_report.txt'
save_sorted_probabilities(feature_list, prob_matrix, feature_counts, co_occurrence_counts, output_file)

# Feature co-occurrence probabilities from model generations

In [45]:
import json
from pathlib import Path

def load_data_from_jsonl(jsonl_files):
    """Load every JSON object from a sequence of JSON Lines files.

    Args:
        jsonl_files: Iterable of paths to UTF-8 encoded ``.jsonl`` files.

    Returns:
        A single list with the decoded objects of all files, in file order
        and then line order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    for file_path in jsonl_files:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Blank or whitespace-only lines (e.g. a trailing empty line)
                # are not records; json.loads would raise on them.
                if not line.strip():
                    continue
                data.append(json.loads(line))
    return data

unlabeled_path = Path('data/unlabeled')
labeled_path = Path('data/labeled')
# Unlabeled files come first, followed by the labeled ones.
jsonl_files = [*unlabeled_path.glob('*.jsonl'), *labeled_path.glob('*.jsonl')]
all_data = load_data_from_jsonl(jsonl_files)
# Only the 'output' field of each record is analysed.
gen_output_list = [record['output'] for record in all_data]


In [46]:
# Show which JSONL files were picked up by the two globs.
jsonl_files

[PosixPath('data/unlabeled/Alpaca-13B.jsonl'),
 PosixPath('data/unlabeled/Alpaca-65B.jsonl'),
 PosixPath('data/unlabeled/Alpaca-7B.jsonl'),
 PosixPath('data/unlabeled/ChatGPT.jsonl'),
 PosixPath('data/unlabeled/Dolly-12B.jsonl'),
 PosixPath('data/unlabeled/Pythia-12B.jsonl'),
 PosixPath('data/unlabeled/GPT-4.jsonl'),
 PosixPath('data/unlabeled/InstructGPT.jsonl'),
 PosixPath('data/unlabeled/MPT-Chat-7B.jsonl'),
 PosixPath('data/unlabeled/Stablelm-alpha-7B.jsonl'),
 PosixPath('data/unlabeled/Vicuna-13B.jsonl'),
 PosixPath('data/unlabeled/Vicuna-7B.jsonl'),
 PosixPath('data/labeled/ChatGPT.jsonl'),
 PosixPath('data/labeled/InstructGPT.jsonl'),
 PosixPath('data/labeled/PerplexityAI.jsonl')]

In [47]:
# Same feature_set as the Wikipedia run, so both matrices are built over sorted(feature_set).
feature_list_gen, prob_matrix_gen, feature_counts_gen, co_occurrence_counts_gen = build_probability_matrix_from_list(gen_output_list, feature_set)

Processing occupations: 100%|██████████| 6549/6549 [00:08<00:00, 811.47it/s] 


In [48]:
# Persist the generation-derived results under the 'output/gen_allow_same' prefix.
save_outputs(feature_list_gen, prob_matrix_gen, feature_counts_gen, co_occurrence_counts_gen, 'output/gen_allow_same')

In [60]:
# Write the generation pair report, ordered from highest to lowest probability.
output_file = 'gen_allow_same_sorted_probabilities_report.txt'
save_sorted_probabilities(feature_list_gen, prob_matrix_gen, feature_counts_gen, co_occurrence_counts_gen, output_file)

# Compare the Wikipedia and generation probability matrices

In [61]:
def compare_probability_matrices(wiki_matrix, gen_matrix, feature_list, output_file):
    """Rank feature pairs by ``gen_matrix - wiki_matrix`` and save the extremes.

    Args:
        wiki_matrix: Square matrix indexed as ``wiki_matrix[i][j]``.
        gen_matrix: Matrix with the same layout as ``wiki_matrix``.
        feature_list: Features labelling both axes; its length sets how many
            rows and columns are compared.
        output_file: Path of the JSON file that receives the result.

    Returns:
        A dict with two lists of ``((feature_i, feature_j), difference)``
        entries taken from the descending ranking: ``'most_positive'`` is its
        first 20 entries and ``'most_negative'`` its last 20.
    """
    size = len(feature_list)
    cells = [(row, col) for row in range(size) for col in range(size)]

    # Element-wise difference, generation minus Wikipedia.
    delta = np.zeros((size, size))
    for row, col in cells:
        delta[row][col] = gen_matrix[row][col] - wiki_matrix[row][col]

    # Stable descending ranking: ties keep their row-major order.
    ranked = sorted(
        (((feature_list[row], feature_list[col]), delta[row][col]) for row, col in cells),
        key=lambda item: item[1],
        reverse=True,
    )

    significant_differences = {
        'most_positive': ranked[:20],
        'most_negative': ranked[-20:],
    }

    with open(output_file, 'w') as sink:
        json.dump(significant_differences, sink, indent=4)

    return significant_differences

In [62]:
# Print the three sizes: the comparison below indexes both matrices by positions in feature_list.
print(len(prob_matrix))
print(len(feature_list))
print(len(prob_matrix_gen))

443
443
443


In [63]:

# Aliases that make the role of each matrix explicit at the call site.
wiki_prob_matrix = prob_matrix
gen_prob_matrix = prob_matrix_gen

output_file_path = 'comparison_results_allow_same.json'
comparison_results = compare_probability_matrices(
    wiki_prob_matrix,
    gen_prob_matrix,
    feature_list,
    output_file_path,
)


# Build probability buckets and compare them

In [73]:
import pandas as pd

def flatten_matrix(matrix, feature_list):
    """Turn a square matrix into row-major ``(row_feature, col_feature, value)`` tuples.

    Args:
        matrix: 2-D array indexed as ``matrix[i, j]``.
        feature_list: Labels for both axes; its length bounds both indices.

    Returns:
        A list with one tuple per ``(i, j)`` position, rows first.
    """
    size = len(feature_list)
    return [
        (feature_list[row], feature_list[col], matrix[row, col])
        for row in range(size)
        for col in range(size)
    ]

# Long-format (Feature1, Feature2, probability) rows for each corpus.
wiki_flattened = flatten_matrix(prob_matrix, feature_list)
gen_flattened = flatten_matrix(prob_matrix_gen, feature_list_gen)

wiki_df = pd.DataFrame(wiki_flattened, columns=['Feature1', 'Feature2', 'Wiki_Prob'])
gen_df = pd.DataFrame(gen_flattened, columns=['Feature1', 'Feature2', 'Gen_Prob'])

# Keep only the pairs whose probability is non-zero in both corpora.
comparison_df = wiki_df.merge(gen_df, on=['Feature1', 'Feature2'], how='inner')
comparison_df = comparison_df[(comparison_df['Wiki_Prob'] != 0) & (comparison_df['Gen_Prob'] != 0)]

# Quantile buckets numbered from 1, computed independently for each column,
# plus the signed change from Wikipedia to generation.
comparison_df = comparison_df.assign(
    Wiki_Bucket=lambda frame: pd.qcut(frame['Wiki_Prob'], 5, labels=False) + 1,
    Gen_Bucket=lambda frame: pd.qcut(frame['Gen_Prob'], 5, labels=False) + 1,
    Change=lambda frame: frame['Gen_Prob'] - frame['Wiki_Prob'],
)
comparison_df['Change_Direction'] = comparison_df['Change'].apply(
    lambda delta: 'Increase' if delta > 0 else ('Decrease' if delta < 0 else 'No Change')
)

In [74]:
# Display the pairs that are non-zero in both corpora, with buckets and change.
comparison_df

Unnamed: 0,Feature1,Feature2,Wiki_Prob,Gen_Prob,Wiki_Bucket,Gen_Bucket,Change,Change_Direction
444,acting,acting,0.013720,0.024896,5,5,0.011175,Increase
445,acting,actor,0.015059,0.021935,5,5,0.006875,Increase
446,acting,actress,0.009742,0.015025,5,5,0.005283,Increase
447,acting,address,0.000116,0.000110,1,1,-0.000007,Decrease
448,acting,air,0.001029,0.000219,2,1,-0.000809,Decrease
...,...,...,...,...,...,...,...,...
196243,york,writer,0.006027,0.003320,5,4,-0.002707,Decrease
196244,york,writing,0.005166,0.004006,4,4,-0.001159,Decrease
196245,york,wrote,0.005308,0.002862,4,3,-0.002446,Decrease
196246,york,year,0.014373,0.009501,5,5,-0.004872,Decrease


In [75]:
# 1. Count how many pairs fall into each (Wiki_Bucket, Gen_Bucket) combination:
#    rows are the Wikipedia bucket, columns the generation bucket, and
#    combinations that never occur are shown as 0.
bucket_movement = comparison_df.groupby(['Wiki_Bucket', 'Gen_Bucket']).size().unstack(fill_value=0)
print(bucket_movement)

Gen_Bucket       1     2     3     4      5
Wiki_Bucket                                
1            10019  4523  2155   859    288
2             5124  5939  4045  2049    687
3             2117  4852  5583  3828   1463
4              558  2203  4874  6591   3618
5               64   325  1158  4509  11788


In [76]:
# Summarise the Gen - Wiki change separately for each Wikipedia bucket (1-5).
for i in range(1, 6):
    print(i)
    # 'Change' is a single numeric Series; describe() ignores `include` for a
    # Series, so no dtype filter is passed.
    print(comparison_df[comparison_df['Wiki_Bucket'] == i]['Change'].describe())

1
count    17844.000000
mean         0.000690
std          0.001905
min         -0.000648
25%         -0.000043
50%          0.000188
75%          0.000741
max          0.045911
Name: Change, dtype: float64
2
count    17844.000000
mean         0.000725
std          0.002984
min         -0.001540
25%         -0.000553
50%         -0.000073
75%          0.000907
max          0.075675
Name: Change, dtype: float64
3
count    17843.000000
mean         0.000648
std          0.003918
min         -0.002940
25%         -0.001224
50%         -0.000444
75%          0.001025
max          0.085000
Name: Change, dtype: float64
4
count    17844.000000
mean         0.000675
std          0.005249
min         -0.005624
25%         -0.002177
50%         -0.000777
75%          0.001656
max          0.128205
Name: Change, dtype: float64
5
count    17844.000000
mean         0.002067
std          0.010190
min         -0.151061
25%         -0.003473
50%         -0.000219
75%          0.005090
max          0.2

In [77]:
# Variant of the comparison that keeps pairs with a zero generation
# probability: only Wiki_Prob must be non-zero, and Gen_Prob is bucketed with
# pd.cut (5 bins over its value range) instead of pd.qcut.
comparison_df2 = (
    wiki_df.merge(gen_df, on=['Feature1', 'Feature2'], how='inner')
    .loc[lambda frame: frame['Wiki_Prob'] != 0]
    .assign(
        Wiki_Bucket=lambda frame: pd.qcut(frame['Wiki_Prob'], 5, labels=False) + 1,
        Gen_Bucket=lambda frame: pd.cut(frame['Gen_Prob'], bins=5, labels=False) + 1,
        Change=lambda frame: frame['Gen_Prob'] - frame['Wiki_Prob'],
        Change_Direction=lambda frame: frame['Change'].apply(
            lambda delta: 'Increase' if delta > 0 else ('Decrease' if delta < 0 else 'No Change')
        ),
    )
)

comparison_df2

Unnamed: 0,Feature1,Feature2,Wiki_Prob,Gen_Prob,Wiki_Bucket,Gen_Bucket,Change,Change_Direction
0,acceptability,acceptability,0.027027,0.000000,5,1,-0.027027,Decrease
114,acceptability,deputy,0.027027,0.000000,5,1,-0.027027,Decrease
123,acceptability,economic,0.027027,0.000000,5,1,-0.027027,Decrease
129,acceptability,elected,0.027027,0.000000,5,1,-0.027027,Decrease
130,acceptability,election,0.027027,0.000000,5,1,-0.027027,Decrease
...,...,...,...,...,...,...,...,...
196244,york,writing,0.005166,0.004006,5,1,-0.001159,Decrease
196245,york,wrote,0.005308,0.002862,5,1,-0.002446,Decrease
196246,york,year,0.014373,0.009501,5,1,-0.004872,Decrease
196247,york,yoga,0.000041,0.000000,1,1,-0.000041,Decrease


In [None]:
# Count pairs per (Wiki_Bucket, Gen_Bucket) combination.
# NOTE(review): this reads comparison_df, not the comparison_df2 built in the
# cell above, so it repeats the earlier bucket-movement table — confirm
# whether comparison_df2 was intended here.
bucket_movement = comparison_df.groupby(['Wiki_Bucket', 'Gen_Bucket']).size().unstack(fill_value=0)
print(bucket_movement)

Gen_Bucket       1     2     3     4      5
Wiki_Bucket                                
1            10019  4523  2155   859    288
2             5124  5939  4045  2049    687
3             2117  4852  5583  3828   1463
4              558  2203  4874  6591   3618
5               64   325  1158  4509  11788


In [79]:
# Summarise the Gen - Wiki change separately for each Wikipedia bucket (1-5).
# NOTE(review): this reads comparison_df, not comparison_df2, so it repeats
# the earlier per-bucket summary — confirm whether comparison_df2 was intended.
for i in range(1, 6):
    print(i)
    # 'Change' is a single numeric Series; describe() ignores `include` for a
    # Series, so no dtype filter is passed.
    print(comparison_df[comparison_df['Wiki_Bucket'] == i]['Change'].describe())

1
count    17844.000000
mean         0.000690
std          0.001905
min         -0.000648
25%         -0.000043
50%          0.000188
75%          0.000741
max          0.045911
Name: Change, dtype: float64
2
count    17844.000000
mean         0.000725
std          0.002984
min         -0.001540
25%         -0.000553
50%         -0.000073
75%          0.000907
max          0.075675
Name: Change, dtype: float64
3
count    17843.000000
mean         0.000648
std          0.003918
min         -0.002940
25%         -0.001224
50%         -0.000444
75%          0.001025
max          0.085000
Name: Change, dtype: float64
4
count    17844.000000
mean         0.000675
std          0.005249
min         -0.005624
25%         -0.002177
50%         -0.000777
75%          0.001656
max          0.128205
Name: Change, dtype: float64
5
count    17844.000000
mean         0.002067
std          0.010190
min         -0.151061
25%         -0.003473
50%         -0.000219
75%          0.005090
max          0.2

In [80]:
# Wikipedia probability mass left after dropping pairs that are zero in either corpus.
print(comparison_df['Wiki_Prob'].sum())

333.4381765430599


In [81]:
# Generation probability mass left after dropping pairs that are zero in either corpus.
print(comparison_df['Gen_Prob'].sum())

419.1609584040059


In [82]:
# comparison_df2 keeps every pair with a non-zero Wiki_Prob. Each matrix row
# with any co-occurrence is normalised by its own total, so this sum is
# (up to rounding) the number of features that lead at least one pair.
print(comparison_df2['Wiki_Prob'].sum())

442.99999999999994


In [83]:
# Generation probability mass over the pairs whose Wikipedia probability is non-zero.
print(comparison_df2['Gen_Prob'].sum())

419.1609584040059
