In [4]:
import json
def load_data(filename):
    """Read a UTF-8 encoded JSON file and return the decoded Python object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(filename, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

In [None]:
def get_feature_set(json_data):
    """Collect every attribute mentioned by any entry into a single set.

    Args:
        json_data: Mapping whose values are mappings with an ``'attribute'``
            key holding an iterable of features.

    Returns:
        set: The union of all ``'attribute'`` iterables.
    """
    return {
        feature
        for info in json_data.values()
        for feature in info['attribute']
    }

# Load the occupation -> info mapping and gather every listed attribute into
# one set; `feature_set` is the vocabulary used by the co-occurrence analysis.
# NOTE: `data` is rebound to a different JSON file by a later cell.
data = load_data('output/occupation_attribute_100.json')
feature_set = get_feature_set(data)


# Co-occurrence statistics from Wikipedia summaries

In [28]:
from collections import defaultdict, Counter
from itertools import combinations

import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tqdm import tqdm

def build_probability_matrix_from_list(data, feature_set):
    """Estimate "feature A is followed by feature B" probabilities from texts.

    Each text is lower-cased, tokenized, reduced to alphabetic tokens,
    lemmatized, and filtered down to lemmas contained in ``feature_set``.
    Within one text, every distinct ordered pair ``(a, b)`` where ``a``
    occurs at an earlier position than ``b`` is counted once (this includes
    ``(a, a)`` when ``a`` appears more than once).

    Args:
        data: Iterable of text strings.
        feature_set: Collection of feature words; must support ``in`` and
            ``sorted``.

    Returns:
        tuple: ``(feature_list, probability_matrix, feature_counts,
        co_occurrence_counts)`` where

        * ``feature_list`` is ``sorted(feature_set)`` and defines the row and
          column order of the matrix;
        * ``probability_matrix[i, j]`` is ``count(f_i, f_j)`` divided by the
          sum of all pair counts starting with ``f_i``. Self-pairs are part of
          that denominator but the diagonal itself stays 0, so a row may sum
          to less than 1;
        * ``feature_counts[a]`` is incremented once per counted pair starting
          with ``a`` (i.e. it equals the row total, not the number of texts
          containing ``a``);
        * ``co_occurrence_counts[a][b]`` is the number of texts contributing
          the pair ``(a, b)``. Only observed pairs are stored; missing pairs
          still read as 0 because the mapping is a nested ``defaultdict``.
    """
    lemmatizer = WordNetLemmatizer()
    feature_counts = Counter()
    co_occurrence_counts = defaultdict(lambda: defaultdict(int))

    for summary in tqdm(data, desc="Processing occupations"):
        words = word_tokenize(summary.lower())
        lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word.isalpha()]
        features_in_summary = [word for word in lemmatized_words if word in feature_set]

        # combinations() yields (earlier, later) position pairs; dict.fromkeys
        # drops repeats while keeping first-seen order, so every distinct
        # ordered pair is counted once per text.
        pairs_in_summary = dict.fromkeys(combinations(features_in_summary, 2))
        for first, second in pairs_in_summary:
            feature_counts[first] += 1
            co_occurrence_counts[first][second] += 1

    feature_list = sorted(feature_set)
    feature_index = {feature: idx for idx, feature in enumerate(feature_list)}
    matrix_size = len(feature_list)
    probability_matrix = np.zeros((matrix_size, matrix_size))

    # Walk only the observed pairs. Looking up every (f1, f2) combination on
    # the nested defaultdict would silently insert a zero entry for each
    # unseen pair and bloat the returned mapping to matrix_size ** 2 entries.
    for f1, followers in co_occurrence_counts.items():
        # Entries are created only by an increment, so the total is positive.
        total_co_occurrences = sum(followers.values())
        row = feature_index[f1]
        for f2, pair_count in followers.items():
            if f1 != f2:
                probability_matrix[row, feature_index[f2]] = pair_count / total_co_occurrences

    return feature_list, probability_matrix, feature_counts, co_occurrence_counts



In [29]:
# Alternative input kept for reference: 'output/occupation_data_100.json'
json_file = 'output/occupation_summaries_200.json'
# NOTE: rebinds `data`, which previously held the attribute mapping.
data = load_data(json_file)
wiki_summary_list = list(data.values())
feature_list, prob_matrix, feature_counts, co_occurrence_counts = build_probability_matrix_from_list(
    wiki_summary_list,
    feature_set,
)

Processing occupations: 100%|██████████| 105840/105840 [01:47<00:00, 983.23it/s] 


In [10]:
import numpy as np

def save_outputs(feature_list, probability_matrix, feature_counts, co_occurrence_counts, output_prefix):
    """Persist the analysis results as four files sharing ``output_prefix``.

    Files written:
        * ``<prefix>_feature_list.json`` - the feature list;
        * ``<prefix>_probability_matrix.npy`` - the matrix, via ``np.save``;
        * ``<prefix>_feature_counts.json`` - ``feature_counts`` as a dict;
        * ``<prefix>_co_occurrence_counts.json`` - the nested counts as
          plain dicts.

    All JSON files use a 4-space indent.
    """
    with open(f'{output_prefix}_feature_list.json', 'w') as sink:
        json.dump(feature_list, sink, indent=4)

    # Binary NumPy format keeps the full float precision of the matrix.
    np.save(f'{output_prefix}_probability_matrix.npy', probability_matrix)

    with open(f'{output_prefix}_feature_counts.json', 'w') as sink:
        json.dump(dict(feature_counts), sink, indent=4)

    with open(f'{output_prefix}_co_occurrence_counts.json', 'w') as sink:
        # Unwrap the inner mappings into plain dicts before serializing.
        plain_counts = {}
        for feature, followers in co_occurrence_counts.items():
            plain_counts[feature] = dict(followers)
        json.dump(plain_counts, sink, indent=4)

# Persist the Wikipedia-derived results under the 'output/wiki' prefix
# (three JSON files plus the probability matrix as .npy).
save_outputs(feature_list, prob_matrix, feature_counts, co_occurrence_counts, 'output/wiki')

In [11]:
def save_sorted_probabilities(feature_list, probability_matrix, feature_counts, co_occurrence_counts, output_file):
    """Write a text report of all ordered feature pairs, most probable first.

    For every ordered pair of different features the report holds one line
    with the pair, its probability (4 decimals), the count stored for the
    first feature, and the pair's co-occurrence count. Lines are ordered by
    probability, descending; ties keep their ``feature_list`` order.

    Args:
        feature_list: Features in the row/column order of the matrix.
        probability_matrix: Indexable as ``matrix[i][j]``.
        feature_counts: Mapping feature -> count.
        co_occurrence_counts: Nested mapping ``[first][second]`` -> count.
        output_file: Path of the report to (over)write.
    """
    position = {name: pos for pos, name in enumerate(feature_list)}

    records = [
        (
            (first, second),
            probability_matrix[position[first]][position[second]],
            feature_counts[first],
            co_occurrence_counts[first][second],
        )
        for first in feature_list
        for second in feature_list
        if first != second
    ]

    # sorted() is stable, so equal probabilities stay in collection order.
    ranked = sorted(records, key=lambda record: record[1], reverse=True)

    with open(output_file, 'w') as report:
        for pair, prob, first_count, pair_count in ranked:
            report.write(
                f"{pair}: Probability={prob:.4f}, Count of {pair[0]}={first_count}, "
                f"Count of {pair[0]} followed by {pair[1]}={pair_count}\n"
            )


In [12]:
# Write the Wikipedia pair report. The path is relative to the current
# working directory, unlike the 'output/'-prefixed files saved earlier.
output_file = 'wiki_sorted_probabilities_report.txt'
save_sorted_probabilities(feature_list, prob_matrix, feature_counts, co_occurrence_counts, output_file)

# Co-occurrence statistics from generated outputs

In [30]:
import json
from pathlib import Path

def load_data_from_jsonl(jsonl_files):
    """Load every JSON object from a collection of JSON Lines files.

    Args:
        jsonl_files: Iterable of paths; each file holds one JSON document
            per line and is read as UTF-8.

    Returns:
        list: The decoded objects, in file order and then line order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    for file_path in jsonl_files:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Blank or whitespace-only lines (e.g. a trailing empty line)
                # are not JSON documents; json.loads would raise on them.
                if not line.strip():
                    continue
                data.append(json.loads(line))
    return data

# NOTE(review): despite its name, `labeled_path` points at 'data/unlabeled'.
labeled_path = Path('data/unlabeled')
# Path.glob yields entries in no guaranteed order; sorting makes the load
# order (and therefore the order of the collected outputs) reproducible.
jsonl_files = sorted(labeled_path.glob('*.jsonl'))
all_data = load_data_from_jsonl(jsonl_files)
# Keep only the 'output' field of each record for the text analysis.
gen_output_list = [entry['output'] for entry in all_data]


In [31]:
# Same statistics as for Wikipedia, computed on the generated outputs with the
# same feature_set. The progress bar still reads "Processing occupations"
# because that description is hard-coded inside the function.
feature_list_gen, prob_matrix_gen, feature_counts_gen, co_occurrence_counts_gen = build_probability_matrix_from_list(gen_output_list, feature_set)

Processing occupations: 100%|██████████| 6000/6000 [00:07<00:00, 818.90it/s] 


In [32]:
# Persist the generated-text results under the 'output/gen' prefix.
save_outputs(feature_list_gen, prob_matrix_gen, feature_counts_gen, co_occurrence_counts_gen, 'output/gen')

In [33]:
# Write the generated-text pair report (relative to the working directory).
# NOTE: rebinds `output_file`, previously the Wikipedia report path.
output_file = 'gen_sorted_probabilities_report.txt'
save_sorted_probabilities(feature_list_gen, prob_matrix_gen, feature_counts_gen, co_occurrence_counts_gen, output_file)

# Compare the Wikipedia and generated probability matrices

In [34]:
def compare_probability_matrices(wiki_matrix, gen_matrix, feature_list, output_file, top_n=20):
    """Find the feature pairs whose probabilities differ most between corpora.

    The difference for a pair is ``wiki_matrix[i][j] - gen_matrix[i][j]``, so
    positive values mean the pair is more probable in the Wikipedia matrix.
    Diagonal entries (a feature paired with itself) are ignored.

    Args:
        wiki_matrix: 2-D array-like, at least ``n x n`` for
            ``n = len(feature_list)``.
        gen_matrix: 2-D array-like with the same requirements; it must use
            the same feature ordering as ``wiki_matrix``.
        feature_list: Feature names in row/column order.
        output_file: Path of the JSON file to (over)write with the result.
        top_n: How many pairs to keep at each end of the ranking
            (default 20).

    Returns:
        dict: ``{'most_positive': [...], 'most_negative': [...]}`` where each
        entry is ``((feature_i, feature_j), difference)``. Both lists are in
        descending order of difference, so the most negative pair is the last
        element of ``'most_negative'``.

    Raises:
        ValueError: If ``top_n`` is negative.
        IndexError: If a matrix is not 2-D or is smaller than ``n x n``.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    matrix_size = len(feature_list)
    differences = np.zeros((matrix_size, matrix_size))

    if matrix_size:
        wiki = np.asarray(wiki_matrix)
        gen = np.asarray(gen_matrix)
        for label, matrix in (("wiki_matrix", wiki), ("gen_matrix", gen)):
            if matrix.ndim != 2 or min(matrix.shape) < matrix_size:
                raise IndexError(
                    f"{label} must be a 2-D matrix of at least "
                    f"{matrix_size} x {matrix_size} entries"
                )
        # One vectorized subtraction instead of an element-wise Python loop.
        differences[:, :] = wiki[:matrix_size, :matrix_size] - gen[:matrix_size, :matrix_size]

    flat_differences = [
        ((feature_list[i], feature_list[j]), differences[i][j])
        for i in range(matrix_size)
        for j in range(matrix_size)
        if i != j  # a feature is never compared with itself
    ]
    flat_differences.sort(key=lambda item: item[1], reverse=True)

    significant_differences = {
        'most_positive': flat_differences[:top_n],
        # A plain [-top_n:] would return the whole list when top_n == 0.
        'most_negative': flat_differences[-top_n:] if top_n else [],
    }

    with open(output_file, 'w') as f:
        json.dump(significant_differences, f, indent=4)

    return significant_differences

In [35]:
# Sanity check: the matrix should have exactly one row per feature, so both
# printed numbers are expected to match.
print(len(prob_matrix))
print(len(feature_list))

443
443


In [36]:

wiki_prob_matrix = prob_matrix
gen_prob_matrix = prob_matrix_gen

# The comparison is element-wise, so row/column i must refer to the same
# feature in both matrices; fail loudly if the two orderings ever diverge.
assert feature_list == feature_list_gen, "wiki and generated matrices use different feature orderings"

output_file_path = 'comparison_results.json'
comparison_results = compare_probability_matrices(wiki_prob_matrix, gen_prob_matrix, feature_list, output_file_path)


# Bucket the pair probabilities into quintiles and compare

In [None]:
import pandas as pd

def flatten_matrix(matrix, feature_list):
    """List the strictly positive off-diagonal entries of ``matrix``.

    Args:
        matrix: 2-D array indexable as ``matrix[i, j]``.
        feature_list: Feature names in row/column order.

    Returns:
        list: ``(row_feature, column_feature, value)`` tuples in row-major
        order.
    """
    positive_entries = []
    for row, row_feature in enumerate(feature_list):
        for col, col_feature in enumerate(feature_list):
            if row == col:
                continue  # skip the diagonal
            value = matrix[row, col]
            if not value > 0:
                continue  # keep strictly positive entries only
            positive_entries.append((row_feature, col_feature, value))
    return positive_entries

# Long-form tables: one row per feature pair with a positive probability.
wiki_flattened = flatten_matrix(prob_matrix, feature_list)
gen_flattened = flatten_matrix(prob_matrix_gen, feature_list_gen)

wiki_df = pd.DataFrame(wiki_flattened, columns=['Feature1', 'Feature2', 'Wiki_Prob'])
gen_df = pd.DataFrame(gen_flattened, columns=['Feature1', 'Feature2', 'Gen_Prob'])

# The inner join keeps only pairs with a positive probability in BOTH corpora;
# each probability column is then split into 5 equal-frequency buckets (1-5).
comparison_df = wiki_df.merge(gen_df, on=['Feature1', 'Feature2'], how='inner').assign(
    Wiki_Bucket=lambda frame: pd.qcut(frame['Wiki_Prob'], 5, labels=False) + 1,
    Gen_Bucket=lambda frame: pd.qcut(frame['Gen_Prob'], 5, labels=False) + 1,
)


In [46]:
# Display the merged comparison table.
# NOTE(review): the recorded output includes 'Change' and 'Change_Direction',
# which are only added by a later cell, so this cell was run out of order.
comparison_df

Unnamed: 0,Feature1,Feature2,Wiki_Prob,Gen_Prob,Wiki_Bucket,Gen_Bucket,Change,Change_Direction
0,acting,actor,0.015059,0.021979,5,5,0.006920,Increase
1,acting,actress,0.009742,0.015448,5,5,0.005706,Increase
2,acting,address,0.000116,0.000126,1,1,0.000009,Increase
3,acting,air,0.001029,0.000126,2,1,-0.000903,Decrease
4,acting,album,0.003610,0.006029,4,4,0.002419,Increase
...,...,...,...,...,...,...,...,...
85303,york,world,0.008073,0.012391,5,5,0.004318,Increase
85304,york,writer,0.006027,0.003401,5,4,-0.002625,Decrease
85305,york,writing,0.005166,0.004252,4,4,-0.000914,Decrease
85306,york,wrote,0.005308,0.003037,4,4,-0.002271,Decrease


In [44]:
# 1. Count how many pairs fall into each (Wiki_Bucket, Gen_Bucket) combination.
#    Rows are the Wikipedia bucket, columns the generated bucket; combinations
#    that never occur are filled with 0.
bucket_movement = comparison_df.groupby(['Wiki_Bucket', 'Gen_Bucket']).size().unstack(fill_value=0)
print(bucket_movement)

Gen_Bucket      1     2     3     4      5
Wiki_Bucket                               
1            9383  4258  2205   892    325
2            4885  5516  3963  2007    689
3            2119  4664  5158  3661   1461
4             618  2261  4517  6095   3569
5              73   366  1218  4387  11018


In [45]:
# Signed change per pair (generated minus Wikipedia) plus a direction label.
comparison_df['Change'] = comparison_df['Gen_Prob'] - comparison_df['Wiki_Prob']
comparison_df['Change_Direction'] = comparison_df['Change'].apply(
    lambda delta: 'Increase' if delta > 0 else ('Decrease' if delta < 0 else 'No Change')
)

# Rows whose Wikipedia probability lies in the highest (5) / lowest (1) bucket.
wiki_bucket_5_rows = comparison_df[comparison_df['Wiki_Bucket'] == 5]
wiki_bucket_1_rows = comparison_df[comparison_df['Wiki_Bucket'] == 1]

# 2./3. How many pairs moved in each direction, per original bucket.
bucket_5_change_stats = wiki_bucket_5_rows.groupby('Change_Direction').size()
bucket_1_change_stats = wiki_bucket_1_rows.groupby('Change_Direction').size()

# Distribution summary (count, mean, quartiles, ...) of the change itself.
bucket_5_change_values = wiki_bucket_5_rows['Change'].describe()
bucket_1_change_values = wiki_bucket_1_rows['Change'].describe()

# Print the results
for heading, report in (
    ("Bucket 5 Change Stats:", bucket_5_change_stats),
    ("\nBucket 5 Change Values:", bucket_5_change_values),
    ("\nBucket 1 Change Stats:", bucket_1_change_stats),
    ("\nBucket 1 Change Values:", bucket_1_change_values),
):
    print(heading)
    print(report)


Bucket 5 Change Stats:
Change_Direction
Decrease    8788
Increase    8274
dtype: int64

Bucket 5 Change Values:
count    17062.000000
mean         0.002226
std          0.011289
min         -0.151061
25%         -0.003586
50%         -0.000254
75%          0.005340
max          0.490012
Name: Change, dtype: float64

Bucket 1 Change Stats:
Change_Direction
Decrease     5019
Increase    12044
dtype: int64

Bucket 1 Change Values:
count    17063.000000
mean         0.000779
std          0.002287
min         -0.000676
25%         -0.000041
50%          0.000212
75%          0.000825
max          0.110528
Name: Change, dtype: float64
