In [None]:
#This script attempts to minimize the amount of labelling that's needed across different runs of the same model with the same number of topics.
#This is a very simple method that ensures all topics are assigned to a "cluster"
#It calculates how many words each pair of topics has in common, assigning "likely pairs"
#It also sees which topics are "likely pairs" with each other
#Overall it is very good at pulling out near-identical clusters in NMF, and pointing out where attention may be needed
#The parameters of the last few cells are tuned to NMF (whose runs are usually very similar), but will likely show WAY more variation for LDA. Tune these from Cell 17 onward.

In [1]:
import pandas as pd
import numpy as np

In [2]:
#Run configuration: these three values must describe the layout of the CSV loaded below.
num_words=30 #number of terms per topic; also the number of CSV rows occupied by each model run
num_models=10 #number of model runs stacked vertically in the CSV
num_topics=44 #number of topics per run; the parser reads columns named 'Topic 0' ... 'Topic 43'
#Must be a csv with num_topics topic columns, where every consecutive block of num_words rows is a different model run's terms for that topic
#Each cell is parsed downstream as text shaped like ('word', numeric_weighting)
df_path='../../data/prd/RND Topic Modelling/10RunsModelNSF44topics.csv'

df=pd.read_csv(df_path)

In [3]:

def parse_dataframe_into_topic_term_sets(df,num_models,num_words,num_topics):
    """Create the set of words composing each topic for each model run.

    df is expected to stack the model runs vertically: rows
    [m*num_words, (m+1)*num_words) belong to model m, and the column named
    'Topic i' holds topic i. Each cell is the text of a (word, weight) tuple,
    e.g. "('brain', 0.53)"; only the word is kept.

    Returns a list of length num_models. Element m is a list of num_topics
    sets, where set i holds the words of topic i in model run m.

    Raises ValueError if df has fewer than num_models*num_words rows (slicing
    would otherwise silently produce truncated or empty topics), and KeyError
    if a 'Topic i' column is missing.
    """
    rows_needed=num_models*num_words
    if len(df)<rows_needed:
        raise ValueError('Expected at least {} rows ({} models x {} words) but the dataframe has {}'.format(rows_needed,num_models,num_words,len(df)))

    def extract_term(cell):
        #"('word', 0.53)" -> "word". Both quote styles are stripped because Python
        #writes a word containing an apostrophe with double quotes: ("earth's", 0.4)
        return cell.split(',')[0].strip('\'"( ')

    all_models=[]
    for model in range(num_models):
        #Every num_words rows indicates a new model run
        mini_model=df.iloc[model*num_words:(model+1)*num_words]
        #One set for each topic and its num_words components
        all_models.append([set(mini_model['Topic {}'.format(i)].apply(extract_term)) for i in range(num_topics)])
    return all_models
#all_models is a list of length num_models, where each element is a list of sets, one set per topic--the first set is topic 0, the second is topic 1, etc.
all_models=parse_dataframe_into_topic_term_sets(df,num_models, num_words, num_topics)


In [4]:
def hamming_distance_sets(model1,model2,model1_id,model2_id):
    """For each topic in model1, find the topic in model2 that shares the most words with it.

    model1 and model2 are sequences of sets of words, one set per topic.
    model1_id and model2_id are only used to build the labels--e.g. model1_id=1 and
    model2_id=2 produce 'Model 1, Topic i' and 'Model 2, Topic j'.

    Despite the function name, the score is the size of the set intersection
    (a similarity: larger means closer), not a Hamming distance.

    It's not guaranteed that each match is unique--e.g. Topic 1 and Topic 3 in model1
    might both align best with Topic 7 in model2. When several topics in model2 tie,
    the lowest-numbered one is kept.

    Returns a list of length len(model1). Each entry is a list
    [model1 label, model2 label, number of shared words], or [nan, nan, nan] when the
    topic shares no word with any topic in model2.
    """
    pairs=[]
    #Looking at each topic in model1
    for i, focal_terms in enumerate(model1):
        best_overlap=0 #0 words in common
        pair=[np.nan,np.nan,np.nan] #placeholder when nothing overlaps; a list, like the real matches
        #Look at each topic in model2 and keep the one with the longest intersection
        for j, candidate_terms in enumerate(model2):
            overlap=len(focal_terms & candidate_terms)
            #Strictly greater, so the first topic reaching the best overlap wins ties
            if overlap>best_overlap:
                best_overlap=overlap
                pair=['Model {}, Topic {}'.format(model1_id,i),'Model {}, Topic {}'.format(model2_id,j),best_overlap]
        #Whatever the greatest overlapping pair for this topic of model1, add it to pairs
        pairs.append(pair)
    return pairs

#Example: Differences between Model 0's topic 0 and Model 1's Topic 1
#results=hamming_distance_sets(all_models[0],all_models[1],0,1)
#Was each topic in model 0 linked to a unique topic,
#or was there an overlap in the best match?
#amount_of_overlap=num_topics-len(np.unique([r[1] for r in results]))
#print('Number of topics matched to more than one topic in other model')
#amount_of_overlap

In [5]:

def calculate_closest_topic_each_model(all_models):
    """Find the closest topic pairs for every ordered pair of distinct models.

    Each model is compared against every other model (never against itself), in both
    directions, so the returned list has n_models^2 - n_models entries. Each entry is
    the result of hamming_distance_sets for one (focal model, other model) pair.
    """
    model_ids=range(len(all_models))
    return [
        hamming_distance_sets(all_models[focal_id],all_models[other_id],focal_id,other_id)
        for focal_id in model_ids
        for other_id in model_ids
        if focal_id != other_id #Don't compare a model to itself
    ]

#One entry per ordered pair of distinct models, so the length is n_models^2 - n_models. Each entry is the comparison of one model's topics to another's.
all_results=calculate_closest_topic_each_model(all_models)

In [6]:
#With all_results, we now know, for any two models, which of their topics are closest to each other.
#But we would expect similar topics to appear in every model--not just in one pair of models or another.
#How can we determine which topics are similar ACROSS all models?
'''
#Similar topics will be the match for each other
#Example: For what focal topics was Model 1, Topic 1 the match?
matches=[x[0] for model in all_results for x in model if x[1]=='Model 1, Topic 1']
#Okay, for those focal topics, what focal topics were THEY the match of?
matches_2=[x[0] for model in all_results for x in model if x[1] in matches]
"Ten is perfect agreement"
len(set(matches_2)) 
'''

'''
#Example: We know in our sample dataset that all Topic 30s for each model are brain sciences, and should be the closest topic to each other at every index
#Code replicates for 19
subset=[x[30] for x in all_results] #This is topic 30's comparison to every other topic 30. 
#The first index is the focal topic, the one we want to find a match to. The second is the clsoest match for the chosen comparison topic
s_1=[x[0] for x in subset] #This is the list of topics that will be given a match
s_1.extend([x[1] for x in subset]) #This is a list of the topics that s_1 were matched to. 
#Make this a set to get the number of unique items matched to one another--that is, if x is matched to y, does y also match to x?
#Perfect agreement--where everyone in the same topic is matched to another--is equal to n_models, and anything more than that means not perfect agreement
print(len(set(s_1))) #Perfect agreement)
'''

def find_matches_and_matches_of_matches(j, results=None):
    """Return OVERALL MATCHES: the set of topics whose closest match is a topic that itself matched j.

    j is a label such as 'Model 1, Topic 28'.
    results is a list shaped like all_results: one list per model comparison, whose
    entries are [focal topic label, closest topic label, shared words]. When omitted,
    the notebook-level all_results is used.

    j itself is part of the result only when a match is reciprocated (some topic whose
    closest match is j is also the closest match of j). If no topic has j as its closest
    match, the result is an empty set.
    """
    if results is None:
        results=all_results
    #Topics whose closest match is j. A set, so the membership test below is a hash lookup.
    matches={x[0] for model in results for x in model if x[1]==j}
    #Topics whose closest match is one of those topics
    #The smaller the set, the more overlap between topics
    return {x[0] for model in results for x in model if x[1] in matches}

#Example
"""
print('set of overall matches to Model 1, Topic 28: that is, Topics matched to Model 1, Topic 28, or to matches of Model 1, Topic 28')
print(find_matches_and_matches_of_matches('Model 1, Topic 28'))
print('What differences are there between overall matches to Model 1 Topic 28 and Model 2, Topic 28?')
#There is no difference between the matches and matches of matches of Model 2, Topic 28 or Model 1, Topic 28, suggesting convergence
len(find_matches_and_matches_of_matches('Model 2, Topic 28').symmetric_difference(find_matches_and_matches_of_matches('Model 1, Topic 28')))
"""

"\nprint('set of overall matches to Model 1, Topic 28: that is, Topics matched to Model 1, Topic 28, or to matches of Model 1, Topic 28')\nprint(find_matches_and_matches_of_matches('Model 1, Topic 28'))\nprint('What differences are there between overall matches to Model 1 Topic 28 and Model 2, Topic 28?')\n#There is no difference between the matches and matches of matches of Model 2, Topic 28 or Model 1, Topic 28, suggesting convergence\nlen(find_matches_and_matches_of_matches('Model 2, Topic 28').symmetric_difference(find_matches_and_matches_of_matches('Model 1, Topic 28')))\n"

In [7]:
#Find the set of overall matches for each topic in each model
#One set of overall matches per (model, topic) label, ordered topic by topic and, within a topic, model by model
all_sets=[
    find_matches_and_matches_of_matches('Model {}, Topic {}'.format(model_id,topic_id))
    for topic_id in range(num_topics)
    for model_id in range(num_models)
]
#A set of frozensets removes exact duplicate sets. Do NOT use numpy.unique for this--it does not work.
true_unique_sets=set(map(frozenset,all_sets))

In [8]:
#One set was built per (model, topic) label, so this count equals num_topics * num_models
print('Overall number of sets of topics than tend to be most similar to each other')
print(len(all_sets)) #num_topics * num_models
print('Number of duplicate sets')
#Sets that are exact copies of another set. In the recorded run this was 369 of 440, i.e. well over half.
len(all_sets)-len(true_unique_sets)

Overall number of sets of topics than tend to be most similar to each other
440
Number of duplicate sets


369

In [17]:
#Most of our sets were exactly identical, but since we care about overall topics that are OVERALL similar, allow a margin of error of size threshold, rather than set it at 0


#Comparing a model set_1 from all_sets to those in unique sets, is it different enough from those sets to be included as a unique set?

def consolidate_near_duplicates_and_remove_singletons(true_unique_sets,threshold_new_topic=.4,threshold_same_topic=.1):
    """Greedily reduce the exactly-unique topic sets to a list of sufficiently different clusters.

    true_unique_sets: collection of (frozen)sets of topic labels.
    threshold_new_topic: a set starts a new cluster when its symmetric difference to
        every cluster kept so far is larger than this fraction of its own size.
    threshold_same_topic: a set that did not start a cluster is merged (union) into
        every kept cluster whose symmetric difference to it is smaller than this
        fraction of its own size, unless one of the two already contains the other.
    Sets with exactly one element never start a cluster.

    Returns a list of frozensets. Index 0 is always an empty placeholder, so real
    clusters start at index 1 and the printed length includes the placeholder.

    The candidates are visited in a fixed order (largest first, then alphabetically)
    because the outcome depends on the visiting order, and the iteration order of a
    set of frozensets of strings is not stable between Python sessions.

    NOTE(review): two sets where neither contains the other differ by at least 2
    elements, so with threshold_same_topic=.1 the merge step can only trigger for
    sets of more than 20 labels--confirm this is intended.
    """
    #Empty placeholder so min(differences) always has something to compare against
    unique_sets=[frozenset()]

    print('Length of truly unique sets, unconsolidataed: {}'.format(len(true_unique_sets)))
    ordered_candidates=sorted(true_unique_sets,key=lambda s:(-len(s),sorted(map(str,s))))
    for set_1 in ordered_candidates:
        #How does set_1 differ from the clusters kept so far?
        #A small difference indicates it's pretty similar and doesn't need to be included as a totally different topic.
        differences=[len(set_1.symmetric_difference(set_2)) for set_2 in unique_sets]
        closest=min(differences)
        #Singletons are too short to form a cluster. Otherwise, if even the *closest* kept cluster
        #differs by more than the threshold, this set is different enough to be kept as a new cluster.
        if len(set_1)!=1 and closest>threshold_new_topic*len(set_1):
            unique_sets.append(set_1)
        #Not a new cluster and not an exact copy of a kept one: let it contribute to very similar clusters
        elif closest>0:
            for i, known in enumerate(unique_sets):
                is_near_duplicate=len(set_1.symmetric_difference(known))<threshold_same_topic*len(set_1)
                if is_near_duplicate and not set_1.issubset(known) and not known.issubset(set_1):
                    unique_sets[i]=frozenset(set_1.union(known))
    print('Length of consolidated unique sets: {}'.format(len(unique_sets)))
    return unique_sets

#Consolidate with the default thresholds. Note that unique_sets[0] is the empty placeholder the function starts from.
unique_sets=consolidate_near_duplicates_and_remove_singletons(true_unique_sets)

#What was filtered out or consolidated with another topic
#true_unique_sets - set(unique_sets)

Length of truly unique sets, unconsolidataed: 71
Length of consolidated unique sets: 49


In [12]:
#Topic labels shared by each pair of distinct clusters; pairs with nothing in common are skipped
duplicates=[
    unique_sets[later] & unique_sets[earlier]
    for later in range(len(unique_sets))
    for earlier in range(later)
    if unique_sets[later] & unique_sets[earlier]
]
#Keep every overlap that is NOT a subset of at least one overlap in the list
#NOTE(review): this only drops an overlap contained in all the others--confirm that is the intended filter
duplicate_minors=[
    overlap for overlap in duplicates
    if not all(overlap.issubset(other) for other in duplicates)
]

In [None]:
#Topics that appear in more than one cluster:
#Primarily NSF or other broad STEM topics--which sometimes bleed into instrumentation e.g. nsf, instrumentation, facility, vessel
#Fluid dynamics
#Instrumentation and imaging--vessel, ocean, oceanography pops up frequently, what's that about--also overlaps with imaging topic
#Patient care: overlap with imaging again
#Oceans, carbon, co2, marine, soil
#Software development

#Devices, sensors, design, overlapping with imaging, wireless, etc.--a few topics that are more sensors and imaging so a lot of overlap
#Membranes, gas, fouling, surfaces
#Magnetism

In [21]:
#Every topic label that appears in more than one cluster.
#set().union(...) also works when duplicate_minors is empty, where set.union(*[]) raises a TypeError.
set().union(*duplicate_minors)

{'Model 0, Topic 1',
 'Model 0, Topic 10',
 'Model 0, Topic 32',
 'Model 0, Topic 35',
 'Model 0, Topic 37',
 'Model 0, Topic 38',
 'Model 0, Topic 4',
 'Model 0, Topic 40',
 'Model 0, Topic 41',
 'Model 0, Topic 43',
 'Model 1, Topic 1',
 'Model 1, Topic 10',
 'Model 1, Topic 11',
 'Model 1, Topic 32',
 'Model 1, Topic 36',
 'Model 1, Topic 38',
 'Model 1, Topic 4',
 'Model 1, Topic 42',
 'Model 1, Topic 43',
 'Model 2, Topic 1',
 'Model 2, Topic 10',
 'Model 2, Topic 34',
 'Model 2, Topic 36',
 'Model 2, Topic 37',
 'Model 2, Topic 39',
 'Model 2, Topic 4',
 'Model 2, Topic 41',
 'Model 2, Topic 42',
 'Model 2, Topic 43',
 'Model 3, Topic 0',
 'Model 3, Topic 1',
 'Model 3, Topic 10',
 'Model 3, Topic 32',
 'Model 3, Topic 36',
 'Model 3, Topic 37',
 'Model 3, Topic 39',
 'Model 3, Topic 4',
 'Model 3, Topic 42',
 'Model 3, Topic 43',
 'Model 4, Topic 1',
 'Model 4, Topic 10',
 'Model 4, Topic 3',
 'Model 4, Topic 34',
 'Model 4, Topic 36',
 'Model 4, Topic 38',
 'Model 4, Topic 4',


In [14]:
def calculate_unassigned_topics(num_models, num_topics, unique_sets):
    """Print any topics that are not found in any topic cluster.

    Builds every expected label 'Model m, Topic t' for m in range(num_models) and
    t in range(num_topics), and reports those that appear in none of the sets in
    unique_sets. Returns nothing; the result is only printed.
    """
    expected_tokens={'Model {}, Topic {}'.format(x,y) for x in range(num_models) for y in range(num_topics)}
    #set().union(...) also handles an empty unique_sets, where set.union(*[]) raises a TypeError
    assigned=set().union(*unique_sets)
    diff=expected_tokens-assigned
    if diff:
        print('Some topics within models were not assigned to any topic cluster:')
        print(diff)
    else:
        print('All topics were successfully assigned to at least one topic cluster')

In [20]:
#Sanity check: report any (model, topic) label that appears in none of the clusters in unique_sets
calculate_unassigned_topics(num_models,num_topics,unique_sets)

All topics were successfully assigned to at least one topic cluster


In [19]:
#Print the combined vocabulary of every cluster; unique_sets[0] is skipped
for cluster_number, topic_cluster in enumerate(unique_sets[1:]):
    print('Cluster {}'.format(cluster_number))
    #Recover (model index, topic index) from labels shaped like 'Model 3, Topic 12'
    model_topic_ids=[]
    for label in topic_cluster:
        tokens=label.replace(',','').split()
        model_topic_ids.append((int(tokens[1]),int(tokens[-1])))
    #Word sets of every member topic, then their union
    term_sets=[all_models[model_id][topic_id] for model_id,topic_id in model_topic_ids]
    cluster_terms=set.union(*term_sets)
    print('Associated terms')
    print(sorted(cluster_terms))
    #More than 1.3x the words of a single topic: the member topics may not all be the same topic
    if len(cluster_terms)>num_words*1.3:
        print('This cluster has a larger number of words than expected--you may want to examine these clusters manually to ensure they are truly identical')
        print(list(topic_cluster))
    print()


Cluster 0
Associated terms
['activity', 'animal', 'behavior', 'behavioral', 'brain', 'circuit', 'cognition', 'cognitive', 'cortical', 'disorder', 'fmri', 'function', 'functional', 'human', 'image', 'imaging', 'individual', 'information', 'mechanism', 'memory', 'motor', 'nervous', 'neural', 'neuron', 'neuronal', 'neuroscience', 'perception', 'processing', 'response', 'sensory', 'signal', 'understanding', 'visual']

Cluster 1
Associated terms
['adhesion', 'bilayer', 'biochemical', 'biological', 'biology', 'cancer', 'cell', 'cellular', 'chemical', 'culture', 'desalination', 'differentiation', 'disease', 'division', 'drug', 'filtration', 'force', 'fouling', 'fuel', 'function', 'gas', 'growth', 'high', 'imaging', 'immune', 'ion', 'lipid', 'lipid_bilayer', 'liquid', 'living', 'mechanical', 'mechanism', 'membrane', 'microscopy', 'mixture', 'molecular', 'nanopore', 'organ', 'peptide', 'permeability', 'pi', 'pore', 'porous', 'process', 'proton', 'purification', 'receptor', 'regulate', 'response