In [1]:
import pandas as pd
import networkx as nx

# Example SPO DataFrame structure
df = pd.DataFrame({
    'subject': ['entity_1', 'entity_2', 'entity_3', 'class_B'],
    'predicate': ['subclass_of', 'subclass_of', 'instance_of','subclass_of'],
    'object': ['class_A', 'class_B', 'class_C', 'class_C']
})

df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   subject    4 non-null      object
 1   predicate  4 non-null      object
 2   object     4 non-null      object
dtypes: object(3)
memory usage: 224.0+ bytes


Unnamed: 0,subject,predicate,object
0,entity_1,subclass_of,class_A
1,entity_2,subclass_of,class_B
2,entity_3,instance_of,class_C
3,class_B,subclass_of,class_C


In [2]:
# Compute PageRank
# Create directed edges for subclass and instance_of relationships
edges = df[df['predicate'].isin(['subclass_of', 'instance_of'])][['subject', 'object']].values
G = nx.DiGraph()
G.add_edges_from(edges)
pagerank = nx.pagerank(G)


In [3]:
# Get the top 10 classes based on PageRank score
top_classes = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:10]
top_classes = [x[0] for x in top_classes]
print(f'Top 10 important classes: {top_classes}')


Top 10 important classes: ['class_C', 'class_A', 'class_B', 'entity_1', 'entity_2', 'entity_3']


In [4]:
# Find closest important class for each entity
def assign_to_closest_class(entity, top_classes):
    distances = {cls: nx.shortest_path_length(G, entity, cls) for cls in top_classes if nx.has_path(G, entity, cls)}
    return min(distances, key=distances.get) if distances else None

df['closest_class'] = df['subject'].apply(lambda x: assign_to_closest_class(x, top_classes))
df['closest_class_object'] = df['object'].apply(lambda x: assign_to_closest_class(x, top_classes))


In [5]:
# Filter for subjects and objects belonging to important classes
df_filtered = df[(df['closest_class'].notnull()) & (df['closest_class_object'].notnull())]

# Count predicate occurrences
predicate_counts = df_filtered.groupby('predicate').size().reset_index(name='count').sort_values(by='count', ascending=False)
important_predicate = predicate_counts.iloc[0]['predicate']
print(f'Most important predicate: {important_predicate}')


Most important predicate: subclass_of


# larger data

In [6]:
df=pd.read_csv('./data/4_spo_e.csv')
metaclass=['second-order class','metaclass','type of object']
df = df[~df['objectLabel'].isin(metaclass)]
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 829132 entries, 0 to 829668
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Unnamed: 0      829132 non-null  int64 
 1   subjectLabel    829131 non-null  object
 2   predicateLabel  829132 non-null  object
 3   objectLabel     829132 non-null  object
 4   subject         829132 non-null  object
 5   object          829132 non-null  object
 6   source          829132 non-null  object
dtypes: int64(1), object(6)
memory usage: 50.6+ MB


Unnamed: 0.1,Unnamed: 0,subjectLabel,predicateLabel,objectLabel,subject,object,source
0,0,Mohs' hardness,instance of,Wikidata property related to mineralogy,P1088,Q24041781,P1088
1,1,Mohs' hardness,related property,hardness,P1088,P5483,P1088
2,2,Mohs' hardness,Wikidata item of this property,Mohs scale of mineral hardness,P1088,Q41472,P1088
3,3,Mohs' hardness,Wikidata property example,diamond,P1088,Q5283,P1088
4,4,Mohs' hardness,Wikidata property example,quartz,P1088,Q43010,P1088


In [7]:
# Compute PageRank
# Create directed edges for subclass and instance_of relationships
edges = df[df['predicateLabel'].isin(['subclass of', 'instance of'])][['subject', 'object']].values
G = nx.DiGraph()
G.add_edges_from(edges)
pagerank = nx.pagerank(G)

In [8]:
# Get the top 10 classes based on PageRank score
top_classes = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:20]
top_classes = [x[0] for x in top_classes]
print(f'Top 20 important classes: {top_classes}')

Top 20 important classes: ['Q4164871', 'Q214339', 'Q12737077', 'Q54050', 'Q5003624', 'Q11835431', 'Q41207', 'Q28640', 'Q12089225', 'Q29028649', 'Q2424752', 'Q271669', 'Q106589819', 'Q63981612', 'Q1354775', 'Q4989906', 'Q151885', 'Q1807498', 'Q178659', 'Q11060274']


### machine detection
table: df_report

| entity_y | entityLable_y | COUNTA of |
|---|---|---|
| Q4164871 | position | 10006 |
| Q5003624 | memorial | 8814 |
| Q11835431 | engraving | 8622 |
| Q41207 | coin | 7838 |
| Q12089225 | mineral species | 6500 |
| Q28640 | profession | 6286 |
| Q2424752 | product | 5896 |
| Q63981612 | product category | 4271 |
| Q12737077 | occupation | 3906 |
| Q2142903 | jewelry | 3394 |
| Q1065579 | costume accessory | 2801 |
| Q17339814 | group or class of chemical substances | 850 |
| Q16887380 | group | 817 |
| Q151885 | concept | 324 |
| Q4989906 | monument | 1 |
| Q214339 | role | 1 |
| Q1807498 | social position | 1 |
| Q178659 | illustration | 1 |
| Q1354775 | memory space | 1 |
| Q11060274 | print | 1 |

In [9]:
# manual select top class by remove abstract classes
top_classes=[
    'Q5003624',
    'Q11835431',
    'Q41207',
    'Q12089225',
    'Q28640',
    'Q2424752',
    'Q2142903',
    'Q1065579',
    'Q151885',
]
df1 = pd.DataFrame({'top_class':top_classes})
df1.info()
df1.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   top_class  9 non-null      object
dtypes: object(1)
memory usage: 200.0+ bytes


Unnamed: 0,top_class
0,Q5003624
1,Q11835431
2,Q41207
3,Q12089225
4,Q28640


In [10]:
# list of sucject
df2=df[['subject','subjectLabel']].drop_duplicates()
df2.columns=['entity','entityLable']
df2.info()
df2.head()

<class 'pandas.core.frame.DataFrame'>
Index: 711529 entries, 0 to 829668
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   entity       711529 non-null  object
 1   entityLable  711528 non-null  object
dtypes: object(2)
memory usage: 16.3+ MB


Unnamed: 0,entity,entityLable
0,P1088,Mohs' hardness
14,Q7946,mineral
15,P5483,hardness
16,Q55594526,Category:Pages using Wikidata property P1088
17,Q12089225,mineral species


In [11]:
# list of object
df3=df[['object','objectLabel']].drop_duplicates()
df3.columns=['entity','entityLable']
df3.info()
df3.head()

<class 'pandas.core.frame.DataFrame'>
Index: 6386 entries, 0 to 829654
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   entity       6386 non-null   object
 1   entityLable  6386 non-null   object
dtypes: object(2)
memory usage: 149.7+ KB


Unnamed: 0,entity,entityLable
0,Q24041781,Wikidata property related to mineralogy
1,P5483,hardness
2,Q41472,Mohs scale of mineral hardness
3,Q5283,diamond
4,Q43010,quartz


In [12]:
# entity set is union between subjects and objects
df4 = pd.concat([df2, df3]).drop_duplicates()
df4.info()
df4.head()

<class 'pandas.core.frame.DataFrame'>
Index: 713891 entries, 0 to 829652
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   entity       713891 non-null  object
 1   entityLable  713890 non-null  object
dtypes: object(2)
memory usage: 16.3+ MB


Unnamed: 0,entity,entityLable
0,P1088,Mohs' hardness
14,Q7946,mineral
15,P5483,hardness
16,Q55594526,Category:Pages using Wikidata property P1088
17,Q12089225,mineral species


In [13]:
# displat mapping betweeen entity and entity label
Left_join = pd.merge(df1,  
                     df4,  
                     left_on='top_class',
                     right_on='entity',
                     how ='left') 
Left_join

Unnamed: 0,top_class,entity,entityLable
0,Q5003624,Q5003624,memorial
1,Q11835431,Q11835431,engraving
2,Q41207,Q41207,coin
3,Q12089225,Q12089225,mineral species
4,Q28640,Q28640,profession
5,Q2424752,Q2424752,product
6,Q2142903,Q2142903,jewelry
7,Q1065579,Q1065579,costume accessory
8,Q151885,Q151885,concept


In [14]:
# Find closest important class for each entity
def assign_to_closest_class(entity, top_classes):
    try:
        distances = {cls: nx.shortest_path_length(G, entity, cls) for cls in top_classes if nx.has_path(G, entity, cls)}
        return min(distances, key=distances.get)
    except:
        return None

# df['closest_class'] = df['subject'].apply(lambda x: assign_to_closest_class(x, top_classes))
# df['closest_class_object'] = df['object'].apply(lambda x: assign_to_closest_class(x, top_classes))

In [15]:
# for each entity, find closest important class 
df4['closest_class'] = df4['entity'].apply(lambda x: assign_to_closest_class(x, top_classes))
df4.info()
df4.head()

<class 'pandas.core.frame.DataFrame'>
Index: 713891 entries, 0 to 829652
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   entity         713891 non-null  object
 1   entityLable    713890 non-null  object
 2   closest_class  55910 non-null   object
dtypes: object(3)
memory usage: 21.8+ MB


Unnamed: 0,entity,entityLable,closest_class
0,P1088,Mohs' hardness,
14,Q7946,mineral,Q2424752
15,P5483,hardness,
16,Q55594526,Category:Pages using Wikidata property P1088,
17,Q12089225,mineral species,Q12089225


In [16]:
# df4.to_csv('entity_closet_class.csv')

In [17]:
# map class name
df5 = pd.merge(df4.dropna(subset=['closest_class',]),  
                     df4[['entity','entityLable']],  
                     left_on='closest_class',
                     right_on='entity',
                     how ='left') 
df5.info()
df5.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55910 entries, 0 to 55909
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   entity_x       55910 non-null  object
 1   entityLable_x  55910 non-null  object
 2   closest_class  55910 non-null  object
 3   entity_y       55910 non-null  object
 4   entityLable_y  55910 non-null  object
dtypes: object(5)
memory usage: 2.1+ MB


Unnamed: 0,entity_x,entityLable_x,closest_class,entity_y,entityLable_y
0,Q7946,mineral,Q2424752,Q2424752,product
1,Q12089225,mineral species,Q12089225,Q12089225,mineral species
2,Q118793301,virtual talent,Q151885,Q151885,concept
3,Q1001313,filigree,Q2142903,Q2142903,jewelry
4,Q108074373,Wooden boards (CBL Cpt 803),Q2142903,Q2142903,jewelry


In [18]:
df_entity_report=pd.pivot_table(
    df5.reset_index(),
    index=['entity_y', 'entityLable_y'],
    columns=[],
    values=['index'],
    aggfunc={
        'index': ["count"],
    },
    fill_value=0
)
df_entity_report.columns = ['count']
# Sort the result in descending order by 'count'
df_entity_report = df_entity_report.sort_values(by='count', ascending=False).reset_index()
df_entity_report

Unnamed: 0,entity_y,entityLable_y,count
0,Q5003624,memorial,8811
1,Q11835431,engraving,8718
2,Q28640,profession,7791
3,Q41207,coin,7629
4,Q2424752,product,6742
5,Q12089225,mineral species,6520
6,Q151885,concept,3475
7,Q2142903,jewelry,3396
8,Q1065579,costume accessory,2828


In [19]:
df5.to_csv('./data/old_7_entity_closet_class.csv')

### filter only entity that has class

In [20]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 829132 entries, 0 to 829668
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Unnamed: 0      829132 non-null  int64 
 1   subjectLabel    829131 non-null  object
 2   predicateLabel  829132 non-null  object
 3   objectLabel     829132 non-null  object
 4   subject         829132 non-null  object
 5   object          829132 non-null  object
 6   source          829132 non-null  object
dtypes: int64(1), object(6)
memory usage: 50.6+ MB


Unnamed: 0.1,Unnamed: 0,subjectLabel,predicateLabel,objectLabel,subject,object,source
0,0,Mohs' hardness,instance of,Wikidata property related to mineralogy,P1088,Q24041781,P1088
1,1,Mohs' hardness,related property,hardness,P1088,P5483,P1088
2,2,Mohs' hardness,Wikidata item of this property,Mohs scale of mineral hardness,P1088,Q41472,P1088
3,3,Mohs' hardness,Wikidata property example,diamond,P1088,Q5283,P1088
4,4,Mohs' hardness,Wikidata property example,quartz,P1088,Q43010,P1088


In [21]:
df_entity=df5[['entity_x','entityLable_x','entity_y','entityLable_y']]
df_entity.columns=['entity','entityLabel','class','classLabel']
df_entity.info()
df_entity.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55910 entries, 0 to 55909
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   entity       55910 non-null  object
 1   entityLabel  55910 non-null  object
 2   class        55910 non-null  object
 3   classLabel   55910 non-null  object
dtypes: object(4)
memory usage: 1.7+ MB


Unnamed: 0,entity,entityLabel,class,classLabel
0,Q7946,mineral,Q2424752,product
1,Q12089225,mineral species,Q12089225,mineral species
2,Q118793301,virtual talent,Q151885,concept
3,Q1001313,filigree,Q2142903,jewelry
4,Q108074373,Wooden boards (CBL Cpt 803),Q2142903,jewelry


In [22]:
df6=pd.merge(
    df[['predicateLabel','subject','object','source']],  
    df_entity[['entity','class']],  
    left_on='subject',
    right_on='entity',
    how ='left'
) 
df6

Unnamed: 0,predicateLabel,subject,object,source,entity,class
0,instance of,P1088,Q24041781,P1088,,
1,related property,P1088,P5483,P1088,,
2,Wikidata item of this property,P1088,Q41472,P1088,,
3,Wikidata property example,P1088,Q5283,P1088,,
4,Wikidata property example,P1088,Q43010,P1088,,
...,...,...,...,...,...,...
829127,subclass of,Q6822438,Q99731758,Q99731758,Q6822438,Q2424752
829128,subclass of,Q11777657,Q99731758,Q99731758,Q11777657,Q2424752
829129,subclass of,Q12122841,Q99731758,Q99731758,Q12122841,Q2424752
829130,subclass of,Q16989064,Q99731758,Q99731758,Q16989064,Q2424752


In [23]:
df7=pd.merge(
    df6[['predicateLabel','subject','object','entity','class','source']],  
    df_entity[['entity','class']],  
    left_on='object',
    right_on='entity',
    how ='left'
)
df7

Unnamed: 0,predicateLabel,subject,object,entity_x,class_x,source,entity_y,class_y
0,instance of,P1088,Q24041781,,,P1088,,
1,related property,P1088,P5483,,,P1088,,
2,Wikidata item of this property,P1088,Q41472,,,P1088,,
3,Wikidata property example,P1088,Q5283,,,P1088,Q5283,Q12089225
4,Wikidata property example,P1088,Q43010,,,P1088,Q43010,Q12089225
...,...,...,...,...,...,...,...,...
829127,subclass of,Q6822438,Q99731758,Q6822438,Q2424752,Q99731758,Q99731758,Q2424752
829128,subclass of,Q11777657,Q99731758,Q11777657,Q2424752,Q99731758,Q99731758,Q2424752
829129,subclass of,Q12122841,Q99731758,Q12122841,Q2424752,Q99731758,Q99731758,Q2424752
829130,subclass of,Q16989064,Q99731758,Q16989064,Q2424752,Q99731758,Q99731758,Q2424752


In [24]:
df8 = df7.dropna(subset=['class_x','class_y'])
df8.to_csv('./data/7_entity_with_class.csv')
df8.info()
df8.head()

<class 'pandas.core.frame.DataFrame'>
Index: 67920 entries, 210 to 829130
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   predicateLabel  67920 non-null  object
 1   subject         67920 non-null  object
 2   object          67920 non-null  object
 3   entity_x        67920 non-null  object
 4   class_x         67920 non-null  object
 5   source          67920 non-null  object
 6   entity_y        67920 non-null  object
 7   class_y         67920 non-null  object
dtypes: object(8)
memory usage: 4.7+ MB


Unnamed: 0,predicateLabel,subject,object,entity_x,class_x,source,entity_y,class_y
210,subclass of,Q1001313,Q2142903,Q1001313,Q2142903,Q1001313,Q2142903,Q2142903
211,said to be the same as,Q1001313,Q4421576,Q1001313,Q2142903,Q1001313,Q4421576,Q2424752
214,instance of,Q108074373,Q1001313,Q108074373,Q2142903,Q1001313,Q1001313,Q2142903
215,instance of,Q86743627,Q1001313,Q86743627,Q2142903,Q1001313,Q1001313,Q2142903
220,fabrication method,Q120714259,Q1001313,Q120714259,Q2142903,Q1001313,Q1001313,Q2142903


In [25]:
df9 =pd.merge(
    df8,
    df_entity.set_index('entity')[['entityLabel']],
    left_on='class_x',
    right_index=True
)
df9

Unnamed: 0,predicateLabel,subject,object,entity_x,class_x,source,entity_y,class_y,entityLabel
210,subclass of,Q1001313,Q2142903,Q1001313,Q2142903,Q1001313,Q2142903,Q2142903,jewelry
211,said to be the same as,Q1001313,Q4421576,Q1001313,Q2142903,Q1001313,Q4421576,Q2424752,jewelry
214,instance of,Q108074373,Q1001313,Q108074373,Q2142903,Q1001313,Q1001313,Q2142903,jewelry
215,instance of,Q86743627,Q1001313,Q86743627,Q2142903,Q1001313,Q1001313,Q2142903,jewelry
220,fabrication method,Q120714259,Q1001313,Q120714259,Q2142903,Q1001313,Q1001313,Q2142903,jewelry
...,...,...,...,...,...,...,...,...,...
793746,made from material,Q111879910,Q897,Q111879910,Q41207,Q897,Q897,Q2424752,coin
793747,made from material,Q111879919,Q897,Q111879919,Q41207,Q897,Q897,Q2424752,coin
793748,made from material,Q111879922,Q897,Q111879922,Q41207,Q897,Q897,Q2424752,coin
793749,made from material,Q111879923,Q897,Q111879923,Q41207,Q897,Q897,Q2424752,coin


In [26]:
df10 =pd.merge(
    df9,
    df_entity.set_index('entity')[['entityLabel']],
    left_on='class_y',
    right_index=True
)
df10=df10[['entityLabel_x','predicateLabel','entityLabel_y','entity_x','class_x','entity_y','class_y','source']]

df10.to_parquet('7_spo_closest_class.parquet')

In [2]:
import pandas as pd
df10 = pd.read_parquet('7_spo_closest_class.parquet')
df10.info()
df10.head()

<class 'pandas.core.frame.DataFrame'>
Index: 67920 entries, 210 to 560091
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   entityLabel_x   67920 non-null  object
 1   predicateLabel  67920 non-null  object
 2   entityLabel_y   67920 non-null  object
 3   entity_x        67920 non-null  object
 4   class_x         67920 non-null  object
 5   entity_y        67920 non-null  object
 6   class_y         67920 non-null  object
 7   source          67920 non-null  object
dtypes: object(8)
memory usage: 4.7+ MB


Unnamed: 0,entityLabel_x,predicateLabel,entityLabel_y,entity_x,class_x,entity_y,class_y,source
210,jewelry,subclass of,jewelry,Q1001313,Q2142903,Q2142903,Q2142903,Q1001313
214,jewelry,instance of,jewelry,Q108074373,Q2142903,Q1001313,Q2142903,Q1001313
215,jewelry,instance of,jewelry,Q86743627,Q2142903,Q1001313,Q2142903,Q1001313
220,jewelry,fabrication method,jewelry,Q120714259,Q2142903,Q1001313,Q2142903,Q1001313
82078,jewelry,instance of,jewelry,Q11735356,Q2142903,Q2142903,Q2142903,Q11735356


In [3]:
df11=df10[~df10['predicateLabel'].isin(['subclass of','instance of'])].reset_index()
df_report=pd.pivot_table(
    df11,
    index=['entityLabel_x'],
    columns=['entityLabel_y'],
    values=['index'],
    aggfunc={
        'index': ["count"],
    },
    fill_value=0
)
df_report.columns = df_report.columns.get_level_values(2) 
df_report

entityLabel_y,coin,concept,costume accessory,engraving,jewelry,memorial,mineral species,product,profession
entityLabel_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
coin,5,0,0,0,1,0,0,1131,0
concept,0,14,0,0,0,0,1,2,6
costume accessory,0,7,6,0,2,0,18,192,0
engraving,0,0,0,2750,0,0,0,5,0
jewelry,0,7,2,0,5,0,135,1388,0
memorial,0,0,0,0,0,5,0,606,1
mineral species,0,16,0,0,0,0,129,508,0
product,1,15,14,0,11,0,186,718,13
profession,2,8,2,3,5,0,0,37,15


In [6]:
df_predicate_report=pd.pivot_table(
    df11,
    index=['predicateLabel'],
    columns=[],
    values=['index'],
    aggfunc={
        'index': ["count"],
    },
    fill_value=0
)
df_predicate_report.columns =['count']
df_predicate_report=df_predicate_report.sort_values(by=['count'], ascending=False).head(20).reset_index()

# Conclusion

In [7]:
# top_classes
df_entity_report

NameError: name 'df_entity_report' is not defined

In [8]:
# top predicates
df_predicate_report

Unnamed: 0,predicateLabel,count
0,made from material,5861
1,genre,1061
2,named after,510
3,has part(s),164
4,has use,45
5,solid solution series with,38
6,part of,27
7,type locality (geology),26
8,field of this occupation,25
9,culture,24


# graph entropy

In [19]:
import pandas as pd
import numpy as np

df=df10.copy()
# Assume `df` is the DataFrame containing the SPO triples

# Step 1: Calculate frequency of each predicate
predicate_counts = df['predicateLabel'].value_counts()
total_predicates = predicate_counts.sum()

# Calculate probability distribution for each predicate
predicate_probs = predicate_counts / total_predicates

# Step 2: Degree diversity: Count unique entities connected by each predicate
# Unique subjects and objects connected by each predicate
predicate_diversity = df.groupby('predicateLabel').agg({
    'entityLabel_x': pd.Series.nunique,
    'entityLabel_y': pd.Series.nunique
}).rename(columns={'entityLabel_x': 'unique_subjects', 'entityLabel_y': 'unique_objects'})

# Sum unique subjects and objects for diversity
predicate_diversity['degree_diversity'] = predicate_diversity['unique_subjects'] + predicate_diversity['unique_objects']

# Step 3: Calculate entropy for each predicate
# Define a function to compute entropy
def calculate_entropy(prob):
    return -prob * np.log2(prob) if prob > 0 else 0

# Apply entropy calculation to the predicate probability distribution
predicate_entropy = predicate_probs.apply(calculate_entropy)

# Combine entropy with degree diversity
predicate_entropy_df = pd.DataFrame({
    'predicate': predicate_counts.index,
    'frequency': predicate_counts.values,
    'entropy': predicate_entropy.values,
    'degree_diversity': predicate_diversity['degree_diversity'].values
})

# Step 4: Filter for high-entropy predicates
# Select predicates with high entropy and degree diversity, based on a threshold
# (Adjust threshold as necessary for your data)
entropy_threshold = predicate_entropy_df['entropy'].mean()*0.00001  # Example threshold
degree_diversity_threshold = predicate_entropy_df['degree_diversity'].mean()  # Example threshold

high_entropy_predicates = predicate_entropy_df[
    (predicate_entropy_df['entropy'] >= entropy_threshold) &
    (predicate_entropy_df['degree_diversity'] >= degree_diversity_threshold)
]

# Display selected high-entropy predicates
# print("High-entropy predicates for selection:")
# print(high_entropy_predicates[['predicate', 'frequency', 'entropy', 'degree_diversity']])
high_entropy_predicates

Unnamed: 0,predicate,frequency,entropy,degree_diversity
2,made from material,5861,0.305012,6
3,genre,1061,0.093733,9
5,has part(s),164,0.020993,7
6,has use,45,0.006996,11
7,solid solution series with,38,0.006044,7
10,field of this occupation,25,0.004199,9
13,facet of,20,0.003454,14
14,depicts,17,0.002995,8
15,"product or material produced or sold, or servi...",17,0.002995,6
19,model item,10,0.001874,18


In [24]:
def calculate_predicate_entropy(df):
    # Group by predicateLabel, class_x, and class_y to get frequency of each unique (predicate, class_x, class_y) combination
    predicate_class_freq = df.groupby(['predicateLabel', 'class_x', 'class_y']).size().reset_index(name='frequency')

    # Calculate total frequency for each predicate to use in probability calculation
    predicate_total_freq = predicate_class_freq.groupby('predicateLabel')['frequency'].sum().reset_index(name='total_frequency')

    # Merge to add total frequency to each (predicate, class_x, class_y) entry
    predicate_class_freq = predicate_class_freq.merge(predicate_total_freq, on='predicateLabel')

    # Calculate probability for each (predicate, class_x, class_y) entry
    predicate_class_freq['P_ij'] = predicate_class_freq['frequency'] / predicate_class_freq['total_frequency']

    # Calculate entropy component for each entry
    predicate_class_freq['entropy_component'] = -predicate_class_freq['P_ij'] * np.log(predicate_class_freq['P_ij'])

    # Sum entropy components for each predicate to get total entropy H(p)
    predicate_entropy = predicate_class_freq.groupby('predicateLabel')['entropy_component'].sum().reset_index(name='entropy')

    return predicate_entropy

def select_high_entropy_predicates(predicate_entropy, threshold=None):
    # If no specific threshold is given, use mean + standard deviation as the default threshold
    if threshold is None:
        threshold = predicate_entropy['entropy'].mean() + predicate_entropy['entropy'].std()

    # Filter predicates with entropy above the threshold
    high_entropy_predicates = predicate_entropy[predicate_entropy['entropy'] > threshold*0.1]
    return high_entropy_predicates

# Run entropy calculation and selection
predicate_entropy = calculate_predicate_entropy(df)
high_entropy_predicates = select_high_entropy_predicates(predicate_entropy)
high_entropy_predicates.sort_values(by=['entropy',])

Unnamed: 0,predicateLabel,entropy
10,genre,0.675459
26,named after,0.711503
32,practiced by,0.758937
25,model item,0.801819
45,uses,0.823959
28,opposite of,0.836988
16,industry,1.039721
5,fabrication method,1.073543
7,field of this occupation,1.16716
30,partially coincident with,1.213008
