# Jensen-Shannon Score Sampling

In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy.spatial import distance
from scipy.spatial.distance import jensenshannon
import sqlite3
from sklearn.neighbors import KernelDensity
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# Step 1 — Helper Functions

In [2]:
def random_sampler(dataframe, num_samples, seed):
    '''
    Draw a reproducible random sample of rows, without replacement.

    Arguments
    ---------
    dataframe: pandas DataFrame to sample rows from.
    num_samples: number of rows to draw; must not exceed len(dataframe).
    seed: seed for the random draw; the same seed on the same dataframe
    always returns the same rows.

    Returns
    -------
    A new dataframe containing the sampled rows.
    '''
    # The seed is handed to pandas as `random_state` instead of calling
    # np.random.seed(), so the process-wide NumPy random state is left alone.
    return dataframe.sample(n=num_samples, replace=False, random_state=seed, axis=0)

In [3]:
def calculate_jsd(samplea, sampleb, column, bandwidth=0.05,
                  grid_start=-1.6, grid_stop=-0.6, grid_points=100):
    """Calculates the Jensen-Shannon distance between two samples using their
    density curves.

    A Gaussian kernel density estimate is fitted to `column` of each sample,
    both estimates are evaluated on the same evenly spaced grid, and the
    Jensen-Shannon distance between the two density vectors is returned.

    Arguments
    ---------
    samplea, sampleb: dataframes (or other objects indexable by `column`)
    holding the two samples, which can be of different sizes.
    column: name of the numeric column to compare.
    bandwidth: bandwidth of the Gaussian kernel (default 0.05).
    grid_start, grid_stop, grid_points: bounds and number of points of the
    grid on which both densities are evaluated (default: 100 points
    from -1.6 to -0.6).

    Returns
    -------
    jsd: float representing the Jensen-Shannon distance between the two distributions.
    """
    # reshape inputs into the (n_samples, 1) layout that KernelDensity.fit is given
    values_a = np.array(samplea[column]).reshape(-1, 1)
    values_b = np.array(sampleb[column]).reshape(-1, 1)

    # fit kernel density
    kde_a = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(values_a)
    kde_b = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(values_b)

    # calculate log-densities on a grid shared by both samples
    grid = np.linspace(grid_start, grid_stop, grid_points).reshape(-1, 1)
    log_densities_a = kde_a.score_samples(grid)
    log_densities_b = kde_b.score_samples(grid)

    # get densities by exponentiating
    densities_a = np.exp(log_densities_a)
    densities_b = np.exp(log_densities_b)

    # return JSD
    return jensenshannon(densities_a, densities_b)

# Step 2 — Calculate Jensen-Shannon Scores

In [4]:
# Labels used in the score table -> values of the `frequency_bin` column.
_FREQUENCY_BINS = {'High': '>100', 'Medium': '10-100', 'Low': '<10'}

# Word length -> number of high-frequency words drawn per iteration.
_HIGH_SAMPLE_SIZES = {5: 32, 6: 48, 7: 48, 8: 32}

# Word length -> prefix of the SQL table names.
_LENGTH_NAMES = {5: 'five', 6: 'six', 7: 'seven', 8: 'eight'}

# Score ranges that define the medium/low candidate pools (bin1 ... bin4).
# NOTE(review): both ends are excluded here, whereas pd.cut (used to count the
# high-frequency sample) includes the upper edge; a score lying exactly on an
# edge is therefore counted but never drawn. Kept as in the original analysis.
_SCORE_BIN_RANGES = [(-1.4, -1.2), (-1.2, -1.0), (-1.0, -0.8), (-0.8, -0.6)]


def _comparison_keys():
    """List the (length1, length2, frequency1, frequency2) comparisons, in output row order."""
    frequencies = list(_FREQUENCY_BINS)
    lengths = list(_HIGH_SAMPLE_SIZES)
    keys = []
    for freq_pos, frequency in enumerate(frequencies):
        for length_pos, length in enumerate(lengths):
            # same length, this frequency against every later frequency
            for other_frequency in frequencies[freq_pos + 1:]:
                keys.append((length, length, frequency, other_frequency))
            # same frequency, this length against every longer length
            for other_length in lengths[length_pos + 1:]:
                keys.append((length, other_length, frequency, frequency))
    return keys


def _count_per_score_bin(binned_scores, num_bins):
    """Count how many values of a pd.cut result fall in each bin, keeping empty bins as 0."""
    codes = binned_scores.cat.codes.to_numpy()
    # code -1 marks a score outside every bin; those rows are not counted
    return np.bincount(codes[codes >= 0], minlength=num_bins)


def _sample_matching_bins(pool, bin_counts, seed):
    """Draw bin_counts[k] rows of `pool` from score range k and stack them in bin order."""
    parts = []
    for (lower, upper), count in zip(_SCORE_BIN_RANGES, bin_counts):
        in_bin = pool[(pool['real_wbps'] > lower) & (pool['real_wbps'] < upper)]
        parts.append(random_sampler(in_bin, int(count), seed))
    return pd.concat(parts)


def _write_samples_to_db(db_name, tables):
    """Write every dataframe in `tables` to the SQLite file `db_name`, replacing existing tables."""
    connection = sqlite3.connect(db_name)
    try:
        for table_name, frame in tables.items():
            prepared = frame.copy()
            # 'NA' has to be a known category before it can fill a categorical column
            for column in prepared.select_dtypes(include=['category']).columns:
                prepared[column] = prepared[column].cat.add_categories(['NA'])
            prepared.fillna(value='NA', inplace=True)
            # categorical columns are stored as plain text
            for column in prepared.select_dtypes(include=['category']).columns:
                prepared[column] = prepared[column].astype(str)
            prepared.to_sql(table_name, connection, if_exists='replace', index=False)
        connection.commit()
    finally:
        # close the file even when one of the writes fails
        connection.close()


def mass_function(num_iterations=1000, stimuli_path="final.csv"):
    """Repeatedly draw matched word samples and score how similar their
    `real_wbps` distributions are.

    For every iteration n (1 ... num_iterations), using n as the seed of every draw:
      1. a high-frequency sample is drawn for each word length (5-8);
      2. for each length, medium- and low-frequency samples are drawn so that
         they contain as many words per score bin as the high-frequency sample;
      3. the Jensen-Shannon distance is computed for 30 pairs of samples;
      4. the 12 samples are written to the SQLite file words<n>.db, one table
         per sample (e.g. `five_low_sample`).

    Arguments
    ---------
    num_iterations: number of sampling rounds (default 1000).
    stimuli_path: CSV file with the stimuli; the columns `frequency_bin`,
    `length` and `real_wbps` are read from it (default "final.csv").

    Returns
    -------
    jsd_df: dataframe with one row per comparison, described by the columns
    `length1`, `length2`, `frequency1`, `frequency2`, followed by one
    `score<n>` column per iteration.
    """
    #Recall final dataframe of generated proto-lexicon stimuli
    final = pd.read_csv(stimuli_path)

    # candidate words for every (frequency, length) combination
    pools = {
        (frequency, length): final[(final['frequency_bin'] == bin_label)
                                   & (final['length'] == length)]
        for frequency, bin_label in _FREQUENCY_BINS.items()
        for length in _HIGH_SAMPLE_SIZES
    }

    # edges used to bin the scores of the high-frequency samples
    bin_edges = np.arange(-1.4, -0.4, 0.2)
    comparisons = _comparison_keys()
    score_columns = {}

    for n in range(1, num_iterations + 1):
        samples = {}
        for length, size in _HIGH_SAMPLE_SIZES.items():
            ### HIGH ###
            high_sample = random_sampler(pools[('High', length)], size, n)
            high_sample = high_sample.assign(
                score_bin=pd.cut(high_sample['real_wbps'], bins=bin_edges))
            bin_counts = _count_per_score_bin(high_sample['score_bin'], len(bin_edges) - 1)
            samples[('High', length)] = high_sample

            ### MEDIUM / LOW ### same number of words per score bin as the high sample
            for frequency in ('Medium', 'Low'):
                samples[(frequency, length)] = _sample_matching_bins(
                    pools[(frequency, length)], bin_counts, n)

        ### JENSEN-SHANNON SCORES ###
        score_columns["score" + str(n)] = [
            calculate_jsd(samples[(frequency1, length1)],
                          samples[(frequency2, length2)],
                          'real_wbps')
            for length1, length2, frequency1, frequency2 in comparisons
        ]

        #create sql tables
        tables = {
            f"{_LENGTH_NAMES[length]}_{frequency.lower()}_sample": samples[(frequency, length)]
            for frequency in ('Low', 'Medium', 'High')
            for length in _LENGTH_NAMES
        }
        _write_samples_to_db("words" + str(n) + ".db", tables)

    # Build the result in one go: key columns first, then every score column.
    key_frame = pd.DataFrame(comparisons,
                             columns=['length1', 'length2', 'frequency1', 'frequency2'])
    jsd_df = pd.concat([key_frame, pd.DataFrame(score_columns)], axis=1)
    return jsd_df

In [5]:
# Run the full resampling; as a side effect this writes one words<n>.db SQLite file per iteration.
jsd_thousand = mass_function()

In [6]:
# Show the score table: one row per comparison, one score<n> column per iteration.
jsd_thousand

Unnamed: 0,length1,length2,frequency1,frequency2,score1,score2,score3,score4,score5,score6,...,score991,score992,score993,score994,score995,score996,score997,score998,score999,score1000
0,5,5,High,Medium,0.156275,0.064474,0.132025,0.126677,0.120441,0.109812,...,0.137802,0.098569,0.087634,0.140246,0.133148,0.113755,0.122102,0.159009,0.051646,0.138327
1,5,5,High,Low,0.172204,0.096396,0.126371,0.155723,0.127556,0.09917,...,0.075647,0.133396,0.139702,0.112005,0.136296,0.103779,0.135332,0.14544,0.088966,0.137525
2,5,6,High,High,0.201178,0.153666,0.175308,0.172442,0.125117,0.225968,...,0.192585,0.153008,0.239436,0.153648,0.169845,0.116102,0.166397,0.263797,0.168896,0.205015
3,5,7,High,High,0.223128,0.174351,0.197546,0.1694,0.175868,0.236286,...,0.215762,0.175156,0.196396,0.192491,0.150553,0.16434,0.23023,0.263114,0.233905,0.208192
4,5,8,High,High,0.288197,0.290759,0.295392,0.27601,0.258602,0.346818,...,0.293759,0.294264,0.355978,0.270984,0.258749,0.261813,0.337667,0.375555,0.268144,0.265501
5,6,6,High,Medium,0.113266,0.113073,0.063625,0.071252,0.052084,0.070631,...,0.105354,0.110407,0.106522,0.068268,0.136767,0.12187,0.102994,0.117246,0.103168,0.102688
6,6,6,High,Low,0.133385,0.116903,0.092298,0.072568,0.080184,0.120843,...,0.11196,0.116087,0.131447,0.097583,0.109777,0.091036,0.096797,0.142102,0.112801,0.111172
7,6,7,High,High,0.151872,0.145446,0.133289,0.141872,0.188396,0.099714,...,0.14822,0.130858,0.132022,0.124008,0.159962,0.132039,0.155616,0.165015,0.171392,0.199013
8,6,8,High,High,0.204188,0.194234,0.148039,0.122395,0.155242,0.157576,...,0.167479,0.205408,0.186554,0.14153,0.189214,0.160429,0.251611,0.183366,0.163882,0.1778
9,7,7,High,Medium,0.122657,0.062018,0.056011,0.08496,0.125654,0.098792,...,0.070024,0.068059,0.095019,0.107304,0.085245,0.092912,0.100007,0.077941,0.080783,0.052627


# Step 3 — Test Sample Word Data Banks

**Each of the 1000 samples has a relational database called words#.db that has 12 tables: one for each of the 12 samples. Each table contains the full data for the words in that sample. The list of tables is below. To access a table from a sample, run the test code below but change the # and the name of the table to the ones needed.**

- `five_low_sample`
- `six_low_sample`
- `seven_low_sample`
- `eight_low_sample`
- `five_medium_sample`
- `six_medium_sample`
- `seven_medium_sample`
- `eight_medium_sample`
- `five_high_sample`
- `six_high_sample`
- `seven_high_sample`
- `eight_high_sample`


In [7]:
#test
# Change these two values to inspect another sample / table.
sample_number = 254
table_name = "five_low_sample"

connection = sqlite3.connect(f"words{sample_number}.db")
# Table names cannot be bound as SQL parameters, so the name is formatted in;
# it must be one of the table names listed above.
query = f"SELECT * FROM {table_name}"
try:
    data = pd.read_sql_query(query, connection)
finally:
    # release the database file even if the query fails
    connection.close()
print(data)

      word word_shape word_phono  length frequency_bin  real_wbps  real_mbps  \
0    núbil      CVCVC      nuBil       5           <10  -1.230044  -1.049088   
1   fuelle     CVVCCV      fweje       5           <10  -1.271759  -1.071325   
2   brocha     CCVCCV      bRoCa       5           <10  -1.121532  -0.957581   
3   tallar     CVCCVC      tajaR       5           <10  -1.073525  -1.129436   
4   charco     CCVCCV      CaRko       5           <10  -1.159107  -1.013246   
5    sesgo      CVCCV      sesGo       5           <10  -1.072232  -1.008023   
6    velón      CVCVC      belon       5           <10  -1.007879  -0.968154   
7    vatio      CVCVV      batjo       5           <10  -1.054183  -0.980786   
8    adobo      VCVCV      aDoBo       5           <10  -1.163426  -1.041204   
9    tauro      CVVCV      tawRo       5           <10  -1.162379  -1.073423   
10   añoso      VCVCV      aYoso       5           <10  -1.120044  -0.943943   
11   nevar      CVCVC      neBaR       5

# Step 4 — Select Best Sample

**The best sample is the one with the lowest combined sum of Jensen-Shannon scores.**

In [8]:
# Step 1: Select every score column by name. Selecting by position is fragile:
# the four key columns occupy positions 0-3, so a slice starting at position 5
# leaves score1 out of the comparison.
score_column_names = [name for name in jsd_thousand.columns if str(name).startswith("score")]
selected_columns = jsd_thousand[score_column_names]

# Step 2: Calculate column sums (one total per sample)
column_sums = selected_columns.sum(axis=0)

# Step 3: Find the name of the column with the lowest sum
lowest_column_index = column_sums.idxmin()

print(lowest_column_index)

score469
