In [1]:
import pandas as pd
import numpy as np
import random
from random import randint
import json

In [2]:
def reset_data(source_path='Data/initial_data_1cat.json',
               pivoted_data_path="Data/pivoted_data.json"):
    """Rebuild the per-word table from the nested source JSON.

    Reads ``source_path``, flattens the mapping stored under
    ``Category -> words`` and reshapes it into one row per word key with one
    column per word property. The table is written to ``pivoted_data_path``
    as JSON Lines (one record per line).

    Parameters
    ----------
    source_path : str, optional
        JSON file containing the nested ``Category``/``words`` structure.
        Defaults to the path the notebook has always used.
    pivoted_data_path : str, optional
        Destination of the JSON Lines file. Defaults to the path the notebook
        has always used.

    Returns
    -------
    str
        ``pivoted_data_path``, i.e. the file that was just written.
    """
    data = pd.read_json(source_path)

    # Flatten the nested 'words' mapping into a single row whose column labels
    # have the form "<word_key>.<property>".
    flat_words = pd.json_normalize(data['Category']['words'])

    # Turn those labels into rows: after the transpose the label sits in the
    # index (moved to an 'index' column) and the value in column 0.
    long_words = flat_words.transpose().reset_index()

    # Separate the word key from the property name.
    # NOTE(review): this assumes every label contains exactly one '.', i.e.
    # neither word keys nor property names contain dots -- confirm.
    long_words[['word_key', 'property']] = long_words['index'].str.split('.', expand=True)

    # One row per word, one column per property.
    pivoted_data = long_words.pivot(index='word_key', columns='property', values=0).reset_index()

    pivoted_data.to_json(pivoted_data_path, orient='records', lines=True)

    # Return the variable (not a second literal) so the reported path can
    # never drift from the path that was actually written.
    return pivoted_data_path

In [3]:
def strengths(runs, iterations):
    """Simulate frequency-weighted word draws and save each run as JSON.

    For every run the pivoted word table is regenerated with ``reset_data``.
    Then ``iterations`` draws are made: a word is picked with probability
    proportional to its ``frequency`` and one of that word's exemplars is
    picked uniformly at random.

    Parameters
    ----------
    runs : int
        Number of independent runs; one output file is written per run.
    iterations : int
        Number of draws per run.

    Returns
    -------
    None
        Each run is written to
        ``Outputs/Old/strengths_r<run>_i<iterations>.json`` as a mapping from
        draw number to ``word_key``, ``exemplar``, ``exemplar_index`` and
        ``frequency``.
    """
    for a in range(runs):
        # Draw records of this run, keyed by draw number. (Named `draws` so it
        # does not shadow the enclosing function.)
        draws = {}

        # Reset data
        pivoted_data_path = reset_data()

        # The table is not modified while drawing, so read it once per run
        # instead of once per draw.
        df = pd.read_json(pivoted_data_path, orient='records', lines=True)
        words_data = df['word_key'].tolist()
        frequencies = df['frequency'].astype(int).tolist()

        # word_key -> (exemplars, frequency). setdefault keeps the first row
        # for a key, which is the row a boolean filter + .iloc[0] would select.
        word_info = {}
        for word, exemplars, frequency in zip(words_data, df['exemplars'].tolist(), frequencies):
            word_info.setdefault(word, (exemplars, frequency))

        for i in range(iterations):
            # Picking a word based on its frequency
            chosen_word = random.choices(words_data, weights=frequencies, k=1)[0]
            exemplars_list, frequency = word_info[chosen_word]

            # Choose an exemplar randomly without weighting
            exemplar_index = randint(0, len(exemplars_list) - 1)
            new_exemplar = exemplars_list[exemplar_index]

            # Save exemplar data
            draws[i] = {"word_key": chosen_word, "exemplar": new_exemplar,
                        "exemplar_index": exemplar_index, "frequency": frequency}

        # Use `iterations` rather than the loop variable: it gives the same
        # name for iterations > 0 and still works when no draw was made.
        strengths_path = "Outputs/Old/strengths_r" + str(a+1) + "_i" + str(iterations) + ".json"

        with open(strengths_path, 'w') as f:
            json.dump(draws, f)

In [4]:
# Seed the shared `random` generator (used by both random.choices and randint)
# so the simulated draws, and every table derived from them, are reproducible
# on Restart & Run All.
random.seed(42)
strengths(1, 10000)

In [5]:
# The saved JSON maps each draw number to its record, so read_json yields one
# column per draw; transposing gives one row per draw. Because every draw
# mixes a string with numbers, the transposed columns are all dtype `object`;
# infer_objects() restores numeric dtypes so later rounding and merging
# operate on real floats and ints.
strengths_data = pd.read_json('Outputs/Old/strengths_r1_i10000.json').transpose().infer_objects()
strengths_data

Unnamed: 0,word_key,exemplar,exemplar_index,frequency
0,F7T11,0.2,0,7
1,F9T4,-0.6,4,9
2,F5T10,-0.9,0,5
3,F6T4,0.8,1,6
4,F6T9,0.4,4,6
...,...,...,...,...
9995,F4T3,-0.1,3,4
9996,F8T6,1.2,6,8
9997,F3T3,-0.4,2,3
9998,F6T14,-1.5,2,6


In [6]:
# Count how often each (word, exemplar, exemplar position, frequency)
# combination was drawn; the count becomes 'exemplar_frequency'.
final_strengths_data = (
    strengths_data[['word_key', 'exemplar', 'exemplar_index', 'frequency']]
    .value_counts()
    .reset_index()
)
final_strengths_data.columns = ['word_key', 'exemplars', 'exemplar_index', 'frequency', 'exemplar_frequency']
# Round to one decimal so the float merge key compares equal to the master table's.
final_strengths_data['exemplars'] = final_strengths_data['exemplars'].round(1)

# Master table: its unnamed first CSV column is exposed as 'index'.
master_df = pd.read_csv("Data/master_df.csv").rename(columns={"Unnamed: 0": "index"})
master_df['exemplars'] = master_df['exemplars'].round(1)

# Attach the master columns to the counts and order the rows by master 'index'.
final_strengths_data = pd.merge(
    final_strengths_data,
    master_df,
    how="left",
    on=['word_key', 'exemplars', 'exemplar_index', 'frequency'],
).sort_values(by=['index'])

# Exemplar strength = (rows - draws of this exemplar) / rows.
# NOTE(review): the denominator is the number of rows of this table, not the
# number of simulated draws -- confirm that this is intended.
final_strengths_data = final_strengths_data.assign(
    exemplar_strength=lambda table: (len(table) - table['exemplar_frequency']) / len(table)
)

# Override the frequency of word 'F9T5' (described as the "last word") to 8.
# NOTE(review): the reason for this override is not visible here -- confirm.
final_strengths_data.loc[final_strengths_data['word_key'].eq('F9T5'), 'frequency'] = 8

final_strengths_data

Unnamed: 0,word_key,exemplars,exemplar_index,frequency,exemplar_frequency,index,exemplar_strength
472,F10T1,0.2,0,10,12,0,0.975560
478,F10T1,-1.0,1,10,11,1,0.977597
98,F10T1,-1.2,2,10,24,2,0.951120
454,F10T1,0.2,3,10,14,3,0.971487
161,F10T1,0.5,4,10,22,4,0.955193
...,...,...,...,...,...,...,...
348,F9T5,0.1,3,8,18,486,0.963340
310,F9T5,0.1,4,8,19,487,0.961303
211,F9T5,0.4,5,8,21,488,0.957230
162,F9T5,0.2,6,8,22,489,0.955193


In [7]:
# Collapse the per-exemplar table to one row per word. The per-exemplar columns
# are collected into lists (nothing is summed), in the current row order of
# final_strengths_data; 'frequency' is taken from the first row of each group
# on the assumption that it is the same for every row of a word.
grouped_data = (
    final_strengths_data
    .groupby(['word_key'])
    .agg({
        'exemplars': lambda values: values.tolist(),
        'frequency': 'first',
        'exemplar_frequency': lambda counts: counts.tolist(),
        'exemplar_strength': lambda scores: scores.tolist(),
    })
    .reset_index()
)

# Write one compact JSON object per word and per line (JSON Lines).
with open('Outputs/Old/strengths.json', 'w') as f:
    for row_label, record in grouped_data.iterrows():
        # Fixed field order so every line has the same layout.
        payload = {
            field: record[field]
            for field in ("word_key", "exemplars", "frequency",
                          "exemplar_frequency", "exemplar_strength")
        }
        f.write(json.dumps(payload, separators=(',', ':')) + "\n")

In [8]:
# Display the per-word table whose rows were written to Outputs/Old/strengths.json.
grouped_data

Unnamed: 0,word_key,exemplars,frequency,exemplar_frequency,exemplar_strength
0,F10T1,"[0.2, -1.0, -1.2, 0.2, 0.5, 1.2, -2.1, 1.9, 1....",10,"[12, 11, 24, 14, 22, 25, 16, 26, 28, 25]","[0.9755600814663951, 0.9775967413441955, 0.951..."
1,F10T2,"[1.4, -1.3, -0.3, -0.2, 0.2, -0.2, 0.9, -0.2, ...",10,"[8, 19, 22, 18, 16, 20, 22, 20, 17, 19]","[0.9837067209775967, 0.9613034623217923, 0.955..."
2,F10T3,"[-0.4, 0.8, -0.2, -0.4, 1.1, -0.2, 1.5, 0.5, -...",10,"[14, 24, 18, 19, 22, 18, 18, 18, 19, 15]","[0.9714867617107943, 0.9511201629327902, 0.963..."
3,F11T1,"[-0.2, -1.3, -0.5, -0.9, 0.6, 1.5, 0.7, -1.4, ...",11,"[18, 13, 20, 21, 17, 16, 18, 16, 28, 25, 27]","[0.9633401221995926, 0.9735234215885947, 0.959..."
4,F12T1,"[-1.5, 0.8, 0.5, -1.6, -0.5, 1.5, -0.2, 0.7, -...",12,"[22, 20, 13, 26, 19, 21, 21, 21, 26, 26, 18, 19]","[0.955193482688391, 0.9592668024439919, 0.9735..."
...,...,...,...,...,...
87,F9T1,"[-1.2, -0.5, 1.6, -1.2, -0.9, -0.6, 0.9, 2.2, ...",9,"[24, 18, 25, 15, 22, 29, 24, 20, 27]","[0.9511201629327902, 0.9633401221995926, 0.949..."
88,F9T2,"[0.8, 0.4, 0.2, 1.1, -0.8, -2.0, -1.2, 1.3, -0.6]",9,"[20, 27, 21, 19, 27, 22, 20, 23, 16]","[0.9592668024439919, 0.945010183299389, 0.9572..."
89,F9T3,"[0.7, 0.6, 0.1, 0.6, -0.6, -0.7, 0.9, 1.2, -0.5]",9,"[24, 11, 21, 19, 19, 24, 15, 10, 21]","[0.9511201629327902, 0.9775967413441955, 0.957..."
90,F9T4,"[-0.9, -0.4, -1.6, -0.4, -0.6, -0.9, -0.5, 2.3...",9,"[27, 22, 26, 25, 28, 14, 21, 23, 17]","[0.945010183299389, 0.955193482688391, 0.94704..."
