In [None]:
import pandas as pd
from collections import defaultdict
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from itertools import combinations
import nltk
from nltk.stem import PorterStemmer
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import deque
nltk.download("punkt")
nltk.download("stopwords")
from graphviz import Graph, Digraph
import json
from langdetect import detect
import numpy as np
import random

In [None]:
# Source table for the per-field train/test split performed further down.
train_df = pd.read_csv("en_only_df.csv")

In [None]:
# Normalise each field label: drop surrounding square brackets, then
# surrounding single quotes, then any leftover whitespace (in that order).
train_df['fields_of_study'] = train_df['fields_of_study'].apply(
    lambda raw_label: raw_label.strip('[]').strip("'").strip()
)

In [None]:
# Distinct (cleaned) field labels, in order of first appearance in train_df.
areas = train_df['fields_of_study'].unique()

In [None]:
# Hand-picked list of field names (not referenced by the cells visible here).
sciences = [
    "Biology",
    "Medicine",
    "Chemistry",
    "Engineering",
    "Computer science",
    "Geology",
    "Mathematics",
    "Physics",
    "Materials science",
    "Environmental science",
]

In [None]:
# Per-field split of train_df:
#   * rows 0..79 of each field go to the training frame `df`
#   * rows 101..134 of each field go to the evaluation frame `test_df`
# NOTE(review): rows 80..100 of every field are used by neither split -
# confirm the gap is intentional.
#
# The slices are collected in lists and concatenated once; growing a
# DataFrame with pd.concat inside the loop re-copies every accumulated row on
# each iteration.
train_parts = []
test_parts = []
for area in areas:
    area_rows = train_df[train_df['fields_of_study'] == area]  # filter once per field
    train_parts.append(area_rows.iloc[:80])
    test_parts.append(area_rows.iloc[101:135])

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(train_parts) if train_parts else pd.DataFrame()
test_df = pd.concat(test_parts) if test_parts else pd.DataFrame()

# One label per test row: every row taken from train_df is tagged with 1.
num_study = [1] * len(test_df)

In [None]:
# Second source table; a few rows per field label are appended to test_df below.
inter_df = pd.read_csv("en_inter_df.csv")

In [None]:
# Take the first two rows of every distinct field label in inter_df and
# append them to test_df in a single concat (instead of re-copying test_df
# once per label inside the loop).
inter_parts = [
    inter_df[inter_df['fields_of_study'] == area].iloc[:2]
    for area in inter_df['fields_of_study'].unique()
]
test_df = pd.concat([test_df, *inter_parts])

# Rows added by this cell are the ones not yet labelled in num_study; tag them with 2.
two = len(test_df) - len(num_study)
num_study += [2] * two

In [None]:
# Attach the per-row labels (1 for rows from train_df, 2 for rows from inter_df).
test_df["num_of_study"] = num_study

In [None]:
# Keyword extraction stack: KeyBERT driven by a MiniLM sentence encoder,
# with a Porter stemmer applied to the extracted keyphrases afterwards.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model=sentence_model)
ps = PorterStemmer()

# Accumulators filled by the extraction loops below.
cooccurrence_counts = defaultdict(int)  # ((kw1, kw2), field) -> count
keyword_set = set()                     # every stemmed keyword seen so far

In [None]:
# For every training document: extract up to 20 keyphrases, stem them, and
# count each unordered pair of distinct stems once per document under the
# document's field label.
for row_pos in range(len(df)):
    row = df.iloc[row_pos]
    doc = row['text']
    field = row['fields_of_study'].strip('[]')
    keywords = kw_model.extract_keywords(
        doc,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=20,
        use_mmr=True,
        diversity=0.2,
    )
    # extract_keywords yields (phrase, score) entries; only the phrase is stemmed.
    unique_stems = {ps.stem(entry[0]) for entry in keywords}
    keyword_set.update(unique_stems)
    # Sorting first gives every pair a canonical (k1 < k2) orientation.
    for pair in combinations(sorted(unique_stems), 2):
        cooccurrence_counts[(pair, field)] += 1

In [None]:
class Discipline_Graph():
    """Undirected keyword graph whose nodes and edges carry per-field counts.

    Attributes:
        nodes: ``{keyword: {field: count}}``.
        edges: ``{keyword: {neighbor: {field: count}}}``; every edge is stored
            in both directions.
    """

    def __init__(self):
        self.nodes = {}
        self.edges = {}

    def add_node(self, source, field, num):
        """Add ``num`` to the ``field`` counter of node ``source``.

        The node and/or the field counter are created on first use.
        """
        field_counts = self.nodes.setdefault(source, {})
        field_counts[field] = field_counts.get(field, 0) + num

    def _bump_edge(self, source, target, field, num):
        """Add ``num`` to the ``field`` counter of the directed entry source -> target."""
        field_counts = self.edges.setdefault(source, {}).setdefault(target, {})
        field_counts[field] = field_counts.get(field, 0) + num

    def add_edge(self, source, target, field, num):
        """Add ``num`` to the ``field`` counter of the undirected edge source -- target.

        Raises:
            ValueError: if either endpoint has not been added with ``add_node``.
        """
        if source not in self.nodes or target not in self.nodes:
            raise ValueError("Both nodes must exist before adding an edge.")

        self._bump_edge(source, target, field, num)
        # A self-loop has a single adjacency entry; updating the "reverse"
        # direction as well would count it twice.
        if source != target:
            self._bump_edge(target, source, field, num)

    def bfs(self, start_node, max_hops=3):
        """Breadth-first walk from ``start_node`` covering at most ``max_hops`` hops.

        Every queued node carries its own dict of field counts, obtained by
        summing the ``nodes`` attributes of the nodes on the path that first
        reached it (the start node's own attributes are not included). The
        ``"test"`` field is removed from that dict before it is evaluated.

        Each queue entry owns a private copy of its dict. Sharing one dict
        between entries would let counts of nodes that are not on a path leak
        into that path's total, and would keep changing the returned
        ``field_dict`` after it had been recorded.

        Args:
            start_node: keyword to start from. A keyword without any edges
                yields an all-zero result instead of raising.
            max_hops: largest hop distance that is visited (default 3).

        Returns:
            ``(hop_node, hop_value, field_dict)`` where ``hop_value[h]`` is the
            largest path total seen at hop ``h`` (0 if none was positive),
            ``hop_node[h]`` is the node that achieved it (0 if none), and
            ``field_dict`` is the path dict of the most recent node that set a
            new per-hop maximum (empty if there was none).
        """
        visited = set()
        field_dict = {}
        hop_value = [0] * (max_hops + 1)  # best path total per hop
        hop_node = [0] * (max_hops + 1)   # node achieving that total per hop
        queue = deque([(start_node, 0, {})])

        while queue:
            cur_node, cur_hop, path_dict = queue.popleft()
            if cur_node in visited:
                continue
            visited.add(cur_node)

            # Counts contributed under the placeholder "test" field are ignored.
            path_dict.pop("test", None)
            path_total = sum(path_dict.values())
            if hop_value[cur_hop] < path_total:
                hop_value[cur_hop] = path_total
                hop_node[cur_hop] = cur_node
                field_dict = path_dict

            if cur_hop == max_hops:
                continue  # neighbors would lie beyond the hop limit

            for neighbor in self.edges.get(cur_node, {}):
                if neighbor in visited:
                    continue
                # Private copy: extend this path only, never a sibling's.
                next_path = dict(path_dict)
                for field, num in self.nodes[neighbor].items():
                    next_path[field] = next_path.get(field, 0) + num
                queue.append((neighbor, cur_hop + 1, next_path))

        return hop_node, hop_value, field_dict

    def measure_score(self, start_node, alpha=1.5, beta=2):
        """Score how many fields the walk from ``start_node`` spans, and how evenly.

        With ``values`` the counts in the ``field_dict`` returned by ``bfs``
        and ``n`` the number of fields, the score is
        ``sum(values) * n**alpha / (1 + cv)**beta`` where ``cv`` is the
        coefficient of variation (population std / mean) of ``values``.
        When fewer than two fields are present the score is 0.

        Args:
            start_node: keyword to start the walk from.
            alpha: exponent applied to the number of fields (default 1.5).
            beta: exponent applied to the ``1 + cv`` penalty (default 2).

        Returns:
            ``(score, hop_node, field_dict)``; the last two come from ``bfs``.
        """
        hop_node, hop_value, field_dict = self.bfs(start_node)
        n = len(field_dict)
        if n < 2:
            return 0, hop_node, field_dict

        values = list(field_dict.values())
        total = sum(values)
        mean = total / n
        sigma = np.std(values)
        cv = sigma / mean if mean > 0 else 0
        n_weight = n ** alpha
        skew_penalty = (1 + cv) ** beta
        final_score = total * n_weight / skew_penalty
        return final_score, hop_node, field_dict

    def display_graph(self):
        """Print every node and every directed adjacency entry with its field counts."""
        print("Nodes and their attributes:")
        for node, attributes in self.nodes.items():
            print(f"  {node}: {attributes}")

        print("\nEdges and their attributes:")
        for source, targets in self.edges.items():
            for target, attributes in targets.items():
                print(f"  ({source} -> {target}): {attributes}")

In [None]:
# Turn the co-occurrence table into a graph: both keywords of a pair become
# nodes credited with the pair's count for that field, joined by an edge
# carrying the same count.
g = Discipline_Graph()

for ((first_kw, second_kw), field_name), pair_count in cooccurrence_counts.items():
    for keyword in (first_kw, second_kw):
        g.add_node(keyword, field_name, pair_count)
    g.add_edge(first_kw, second_kw, field_name, pair_count)
g.display_graph()

In [None]:
def measure_inter_score(g,test_keywords):
    """Return the best-scoring keyword of a document according to ``g.measure_score``.

    Each keyword is scored in turn; a keyword replaces the current best when
    its score is greater than or equal to the best so far (starting from 0),
    so on ties the later keyword wins.

    Returns:
        ``(keyword, score, hop_nodes, field_dict)`` of the winner, or
        ``(None, 0, [], {})`` when no keyword qualified.
    """
    best = (None, 0, [], {})
    for candidate in test_keywords:
        candidate_score, hop_nodes, fields = g.measure_score(candidate)
        if candidate_score >= best[1]:
            best = (candidate, candidate_score, hop_nodes, fields)
    return best

In [None]:
# Score every test document: extract and stem its keyphrases, wire them into
# the graph under the placeholder field "test", then keep the best-scoring
# keyword together with its hop nodes and field distribution.
# NOTE(review): the "test" nodes/edges are added to the shared graph `g` and
# never removed, so they remain present while later documents are scored and
# re-running this cell adds them again - confirm this is intended.
scores=[]
dicts=[]
node_list=[]
num_field=[]
for i in range(len(test_df)):
    test_cooccurrence_counts = defaultdict(int)
    text=test_df.iloc[i]['text']
    field="test"
    test_keywords=kw_model.extract_keywords(text,keyphrase_ngram_range=(1, 2),stop_words='english',top_n=20,use_mmr=True, diversity=0.2)
    test_words=[ps.stem(words[0]) for words in test_keywords]
    keyword_set.update(set(test_words))
    for k1,k2 in combinations(sorted(set(test_words)), 2):
        test_cooccurrence_counts[(k1, k2)] += 1
    for word_tuple,num in test_cooccurrence_counts.items():
        g.add_node(word_tuple[0],field,num)
        g.add_node(word_tuple[1],field,num)
        g.add_edge(word_tuple[0],word_tuple[1],field,num)
    t_words,score,nodes,field_dict=measure_inter_score(g,test_words)
    # Slot 0 of the hop list is reserved for the start keyword. When the
    # document produced no keywords, measure_inter_score returns an empty
    # list and there is no slot to fill (indexing it would raise IndexError).
    if nodes:
        nodes[0]=t_words
    node_list.append(nodes)
    scores.append(score)
    dicts.append(field_dict)
    num_field.append(len(field_dict.keys()))