In [None]:
''' This sets up the Django environment '''
# Statement order matters in this cell: the environment variables are set
# first, then django.setup() runs, and only afterwards are the settings object
# and the project's models imported.
import os
import django
from django.db.models import Count, Q, Prefetch, Exists, OuterRef
from collections import defaultdict

# NOTE(review): PROJECTPATH is not referenced anywhere else in the visible notebook.
PROJECTPATH = ""
# setdefault() only takes effect when DJANGO_SETTINGS_MODULE is not already set.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "mus.settings")
# Lets synchronous ORM calls run inside the notebook's event loop (see link).
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"  # https://docs.djangoproject.com/en/4.1/topics/async/#async-safety
django.setup()

# Imported after django.setup(); presumably PureOpenAlex.models needs the app
# registry to be loaded first -- TODO confirm before reordering these imports.
from django.conf import settings
from loguru import logger
import asyncio
from pymongo import MongoClient
from PureOpenAlex.models import DBUpdate

# The connection string is read from the Django settings object; a missing
# MONGOURL setting raises AttributeError here.
MONGOURL = settings.MONGOURL

# Synchronous client plus a handle on the database used throughout the notebook.
MONGODB = MongoClient(MONGOURL)
db = MONGODB["metadata_unification_system"]

In [25]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, HTML
from rich import print

# PUBLICATIONS
# Load the data; 'Unnamed: 0' is dropped right after reading the file.
df = pd.read_csv('dataframe.csv').drop(columns=['Unnamed: 0'])

# Build one frame per selectable group.
# 'All' holds the first nine columns for every row. Each remaining column is
# used as a boolean row mask (df[df[col]]) to select the rows of that group.
result_dict = {}
result_dict['All'] = df.iloc[:, :9]
for col in df.columns[9:]:
    result_dict[col] = df[df[col]].iloc[:, :9]

# Year bounds across all frames, computed once and reused for both the
# slider's allowed range and its initial selection.
year_min = min(frame['year'].min() for frame in result_dict.values())
year_max = max(frame['year'].max() for frame in result_dict.values())

# Distinct publication types in first-seen order. dict.fromkeys de-duplicates
# like a set but keeps a stable order, so the dropdown options no longer change
# order between kernel restarts.
publication_types = list(dict.fromkeys(
    pub_type
    for frame in result_dict.values()
    for pub_type in frame['type'].unique()
))

# Create widgets
text_item = widgets.Label('''Overview of publications by UT authors. \n
                            Use widgets below to filter by group, year, and type. \n''')

group_dropdown = widgets.Dropdown(
    options=list(result_dict.keys()),
    description='Group:'
)

year_slider = widgets.IntRangeSlider(
    value=[year_min, year_max],
    min=year_min,
    max=year_max,
    step=1,
    description='Year Range:'
)

type_dropdown = widgets.Dropdown(
    options=['All'] + publication_types,
    description='Type:'
)

# Create output widget
output = widgets.Output()

# Define update function
def update_output(*args):
    """Redraw the summary for the current widget selection.

    Used as the observer callback of the three filter widgets (the change
    payload in ``*args`` is ignored) and also called once directly.
    Reads ``group_dropdown``, ``year_slider`` and ``type_dropdown``, filters the
    matching frame from ``result_dict`` and renders the totals plus the top-25
    publisher and journal tables into ``output``.
    """
    with output:
        output.clear_output(wait=True)

        group = group_dropdown.value
        year_from, year_to = year_slider.value
        selected_type = type_dropdown.value

        # Start from the frame of the chosen group and narrow it down.
        frame = result_dict[group]
        frame = frame[frame['year'].between(year_from, year_to)]  # inclusive on both ends
        if selected_type != 'All':
            frame = frame[frame['type'] == selected_type]

        # Rankings: publishers by row count, journals by (journal, publisher) pair.
        publisher_ranking: pd.Series = frame['publisher'].value_counts().head(25)
        pair_sizes = frame.groupby(['journal', 'publisher']).size()
        journal_ranking = (
            pair_sizes
            .sort_values(ascending=False)
            .head(25)
            .reset_index(name='count')
        )

        # Display results
        print(f"Group: {group}")
        print(f"Year range: {year_from}-{year_to}")
        print(f"Type: {selected_type}")
        print(f"Total publications: {len(frame)}")
        print("\nTop Publishers:")
        display(HTML(publisher_ranking.to_frame().to_html()))
        print("\nTop Journals:")
        display(HTML(journal_ranking.to_html(index=False)))

        # Display interactive table (placeholder: nothing is rendered here yet)

# Re-render the summary whenever any of the filter widgets changes its value.
for control in (group_dropdown, year_slider, type_dropdown):
    control.observe(update_output, names='value')

# Show the description and controls stacked above the output area.
layout_children = [text_item, group_dropdown, year_slider, type_dropdown, output]
display(widgets.VBox(layout_children))

# Render once up front so the output area is populated before any interaction.
update_output()

VBox(children=(Label(value='Overview of publications by UT authors. \n\n                            Use widget…

In [None]:
from mus_wizard import models, constants
from mus_wizard.harvester import openalex, oai_pmh
from mus_wizard.database import matching, mongo_client
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase, AsyncIOMotorCollection

# Look up one hard-coded author id in the author and works collections.
authorid = 'A5045181048'
mongoclient = mongo_client.MusMongoClient()

# Handles to the collections exposed by the project's Mongo client wrapper.
openalex_authors: AsyncIOMotorCollection = mongoclient.authors_openalex
pure_authors: AsyncIOMotorCollection = mongoclient.openaire_cris_persons
openalex_works: AsyncIOMotorCollection = mongoclient.works_openalex
pure_works: AsyncIOMotorCollection = mongoclient.openaire_cris_publications

# NOTE(review): the collections above are annotated as Motor (async)
# collections. If that annotation is accurate, find_one() returns an awaitable
# rather than a document, so author_details would not hold the author record
# until it is awaited -- confirm and `await` it in the notebook if so.
author_details = openalex_authors.find_one({'id': authorid})
# The result of find() is not assigned; as the last expression of the cell it
# is only echoed as the cell output. NOTE(review): presumably a cursor that
# still has to be iterated to get the works -- confirm.
openalex_works.find({'authorships.author.id': authorid})

In [None]:
from rich import print
from rich.console import Console
cons = Console()
# Testing retrieval of cerif data from oai_pmh endpoint
import httpx
import xmltodict
import time

def get_person_affiliations(values:list) -> tuple[str, list[dict]]:
    """Convert a parsed ``cerif:Affiliation`` node into ``('affiliations', value)``.

    Args:
        values: The affiliation node. Handled shapes:
            * list of dicts - several affiliations;
            * dict - one affiliation (handled the same way as in
              ``get_org_part_of``, which also accepts a single dict);
            * str - passed through unchanged as the value.

    Returns:
        A 2-tuple ``('affiliations', value)``. ``value`` is a list of
        ``{'internal_repository_id', 'name'}`` dicts, or the raw string when
        ``values`` was a string. Every branch returns a 2-tuple, because the
        caller unpacks the result as ``keyname, fullvalue``.
    """
    if isinstance(values, str):
        # Previously the bare string was returned, which broke tuple unpacking.
        return 'affiliations', values
    if isinstance(values, dict):
        # A single affiliation: treat it as a one-element list.
        values = [values]

    affiliations = []
    if isinstance(values, list):
        for entry in values:
            org_unit = entry.get('cerif:OrgUnit') if isinstance(entry, dict) else None
            if not isinstance(org_unit, dict):
                # No organisation attached to this entry; nothing to record.
                continue
            name_node = org_unit.get('cerif:Name')
            # The name is either a dict carrying '#text', a plain string, or absent.
            name = name_node.get('#text') if isinstance(name_node, dict) else name_node
            affiliations.append({
                'internal_repository_id': org_unit.get('@id'),
                'name': name,
            })
    return 'affiliations', affiliations

def get_org_identifiers(values:list) -> tuple[str, dict[str, list]]:
    """Group the entries of a ``cerif:Identifier`` node by their ``@type``.

    Args:
        values: A list of identifier dicts, or a single identifier dict when
            the organisation has exactly one identifier. Each dict is read for
            its ``'@type'`` and ``'#text'`` keys.

    Returns:
        ``('identifiers', mapping)`` where ``mapping`` is a ``defaultdict(list)``
        from identifier type to the list of values of that type. Any other
        input shape yields an empty mapping.
    """
    identifiers = defaultdict(list)
    if isinstance(values, dict):
        # A lone identifier used to be dropped silently; wrap it instead.
        values = [values]
    if isinstance(values, list):
        for entry in values:
            identifiers[entry.get('@type')].append(entry.get('#text'))
    return 'identifiers', identifiers

def get_org_part_of(value:dict|list) -> tuple[str, dict]|tuple[str, list[dict]]:
    try:
        if isinstance(value, dict):
            part_of = {
                'internal_repository_id': value.get('cerif:OrgUnit').get('@id'),
                'name': value.get('cerif:OrgUnit').get('cerif:Name').get('#text'),
            }
        elif isinstance(value, list):
            part_of = []
            for v in value:
                part_of.append({
                    'internal_repository_id': v.get('cerif:OrgUnit').get('@id'),
                    'name': v.get('cerif:OrgUnit').get('cerif:Name').get('#text'),
                })
    except Exception as e:
        print(f'error parsing {value}: {e}')
        part_of = None
    return 'part_of', part_of

# Item type -> key under a record's 'metadata' dict that holds the item itself
# (process_cerif reads i['metadata'].get(cerif_item_mapping[type])).
# 'products' and 'datasets' both point at 'cerif:Product'.
# NOTE(review): unlike cerif_mapping and cerif_keylist there is no
# 'ec_funded_resources' entry here, so process_cerif would raise KeyError for
# that type as soon as it has a record to process.
cerif_item_mapping = {
    'persons':'cerif:Person',
    'orgs': 'cerif:OrgUnit',
    'works': 'cerif:Publication',
    'products': 'cerif:Product',
    'patents': 'cerif:Patent',
    'datasets': 'cerif:Product',
    'projects': 'cerif:Project',
    'funding':'cerif:Funding',
}

# Per item type: how process_cerif turns a source item into a flat result dict.
# The meaning of key and value depends on the value's type:
#   * str      -> key is the OUTPUT field, value is the SOURCE key in the item
#                 (result[key] = item.get(value)).
#   * dict     -> key is the SOURCE key of a nested node; the dict maps
#                 output field -> key inside that node.
#   * callable -> key is the SOURCE key; the callable receives that node and
#                 returns (output_field, value).
# Entries are processed in insertion order, which determines the key order of
# each result dict. Types mapped to {} produce empty results.
cerif_mapping = {
    'persons':{
        'internal_repository_id':'@id',
        'cerif:PersonName':{'family_names':'cerif:FamilyNames', 'first_names':'cerif:FirstNames'},
        'orcid':'cerif:ORCID',
        'scopus_id':'cerif:ScopusAuthorID',
        'scopus_affil_id':'cerif:ScopusAffiliationID',
        'cerif:Affiliation': get_person_affiliations,
        'researcher_id': 'cerif:ResearcherID',
        'isni': 'cerif:ISNI',
        'cris-id': 'cerif:CRIS-ID',
        'uuid':'cerif:UUID',
        'uri':'cerif:URI',
        'url':'cerif:URL',
    },
    'orgs': {
        'internal_repository_id':'@id',
        'cerif:Identifier':get_org_identifiers,
        'cerif:Type':{'type':'#text'},
        'cerif:PartOf':get_org_part_of,
        'cerif:Name':{'name':'#text'},
        'acronym':'cerif:Acronym',
    },
    # Not mapped yet: these types currently yield an empty dict per record.
    'works': {},
    'products': {},
    'patents': {},
    'datasets': {},
    'projects': {},
    'funding': {},
    'ec_funded_resources': {},
}

# Per item type: the source keys that the mapping already knows about.
# check_keys() reports every 'cerif:'-prefixed key of an item that is NOT in
# the list for its type, so an empty list reports all such keys.
# Each key is listed once ('cerif:PersonName' and 'cerif:Affiliation' used to
# appear twice under 'persons'; membership checks are unaffected).
cerif_keylist = {
    'persons': [
        '@id',
        'cerif:PersonName',
        'cerif:Affiliation',
        'cerif:ORCID',
        'cerif:ScopusAuthorID',
        'cerif:ScopusAffiliationID',
        'cerif:ResearcherID',
        'cerif:ISNI',
        'cerif:CRIS-ID',
        'cerif:UUID',
        'cerif:URI',
        'cerif:URL',
    ],
    'orgs': ['cerif:Identifier', 'cerif:Type', 'cerif:PartOf', 'cerif:Name', 'cerif:Acronym'],
    'works': [],
    'products': [],
    'patents': [],
    'datasets': [],
    'projects': [],
    'funding': [],
    'ec_funded_resources': [],
}
def check_keys(item, keylist) -> list[str]:
    """Return the ``cerif:``-prefixed keys of ``item`` that are not in ``keylist``.

    Keys without the ``cerif:`` prefix are never reported. The result keeps the
    iteration order of ``item``.
    """
    return [
        key
        for key in item.keys()
        if key.startswith('cerif:') and key not in keylist
    ]

def process_cerif(type:str, data:list[dict]) -> tuple[list[dict], set[str]]:
    """Flatten harvested records of one item type using ``cerif_mapping``.

    Args:
        type: Item type; used as the key into ``cerif_mapping``,
            ``cerif_keylist`` and ``cerif_item_mapping``. (The name shadows the
            builtin inside this function; it is kept for compatibility.)
        data: Harvested records. Each is expected to be a dict whose
            ``'metadata'`` dict holds the item under the key given by
            ``cerif_item_mapping[type]``.

    Returns:
        ``(results, keys_missing)``: one flattened dict per usable record, and
        the set of ``cerif:`` keys seen on items that ``cerif_keylist[type]``
        does not list.

    Records without a usable item (no metadata, or the item is not a dict) are
    skipped instead of raising.
    """
    # TODO: handle nested fieldnames
    keys_missing = set()
    results = []
    mapping = cerif_mapping[type]
    keylist = cerif_keylist[type]
    for record in data:
        metadata = record.get('metadata') if isinstance(record, dict) else None
        item = metadata.get(cerif_item_mapping[type]) if isinstance(metadata, dict) else None
        if not isinstance(item, dict):
            continue

        result = {}
        for key, value in mapping.items():
            if isinstance(value, str):
                # key = output field, value = source key.
                result[key] = item.get(value)
            elif isinstance(value, dict):
                # key = source key of a nested node; value maps output -> inner key.
                node = item.get(key)
                for out_key, inner_key in value.items():
                    if isinstance(node, dict):
                        result[out_key] = node.get(inner_key)
                    elif isinstance(node, str) and inner_key == '#text':
                        # xmltodict yields a bare string for an element
                        # without attributes; that string is its text.
                        result[out_key] = node
                    else:
                        result[out_key] = None
            elif callable(value):
                # key = source key; the parser returns (output field, value).
                node = item.get(key)
                if node:
                    keyname, fullvalue = value(node)
                    result[keyname] = fullvalue

        keys_missing.update(check_keys(item, keylist))
        results.append(result)
    return results, keys_missing

def get_results(url:str, max_retries=10, retry_delay:float=5) -> list[dict]:
    """Fetch every record of an OAI-PMH ``ListRecords`` request, following resumption tokens.

    Args:
        url: Initial request URL. Everything from ``'&metadataPrefix'`` onwards
            is dropped when building the follow-up (resumption) URLs.
        max_retries: Maximum number of consecutive failed fetches of the same
            page before giving up. Pass ``None`` to retry forever (the previous
            behaviour).
        retry_delay: Seconds to wait between retries.

    Returns:
        The ``record`` entries of all pages, in harvest order.

    Raises:
        RuntimeError: when a page still cannot be fetched or parsed after
            ``max_retries`` retries.
    """
    def fetch_response(url):
        # Returns the parsed ListRecords node, or None on any fetch/parse error.
        try:
            r = httpx.get(url)
            parsed = xmltodict.parse(r.text)
            return parsed['OAI-PMH']['ListRecords']
        except Exception as e:
            print(f'error fetching {url}: {e}')
            return None

    results = []
    resume_url = url.split('&metadataPrefix')[0]
    failures = 0
    while True:
        response = fetch_response(url)
        if not response:
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise RuntimeError(f'giving up on {url} after {max_retries} retries')
            time.sleep(retry_delay)
            continue
        failures = 0

        items = response.get('record')
        if items is None:
            items = []  # page without records: do not append a None entry
        elif not isinstance(items, list):
            items = [items]  # a single record is parsed as a dict, not a list
        results.extend(items)

        token = response.get('resumptionToken')
        if isinstance(token, dict):
            print(f"{token.get('@cursor')}/{token.get('@completeListSize')}")
            token_text = token.get('#text')
        else:
            # Either absent (None) or a bare string without attributes.
            token_text = token
        if not token_text:
            # No token, or an empty token element: this was the last page.
            return results
        url = f"{resume_url}&resumptionToken={token_text}"

base_url = 'https://ris.utwente.nl/ws/oai?verb=' # Use env variable
# Request verbs of the endpoint, keyed by what they return.
verbs = {
    'itemsets':'ListSets',
    'schemas':'ListMetadataFormats',
    'records':'ListRecords',
    'identify':'Identify',
}

# All known item type -> set name pairs; only `itemsets` below is harvested.
all_itemsets = {'persons':'openaire_cris_persons', 'orgs':'openaire_cris_orgunits','works':'openaire_cris_publications','products':'openaire_cris_products', 'patents':'openaire_cris_patents', 'datasets':'datasets:all', 'projects':'openaire_cris_projects', 'funding':'openaire_cris_funding'}
itemsets = {'orgs':'openaire_cris_orgunits'}
scheme = 'oai_cerif_openaire'
finalresults = {}

# The loop variable is `item_type`, not `type`: a module-level `type` would
# replace the builtin for every cell executed afterwards.
for item_type, itemset in itemsets.items():
    # Same URL as before, now assembled from base_url and verbs.
    url = f"{base_url}{verbs['records']}&metadataPrefix={scheme}&set={itemset}"
    processed, missing_keys = process_cerif(item_type, get_results(url))
    # The raw harvest is not kept; add a 'raw' entry here if it is needed.
    singleresult = {'processed': processed, 'missing_keys': missing_keys}
    finalresults[item_type] = singleresult
    collectionname = f'openaire_cris_{item_type}'
    collection = db[collectionname]
    print(f'Inserting {len(processed)} {item_type} records into {collectionname}. Possibly missing keys: {missing_keys}')
    # insert_many() rejects an empty list, so skip it when nothing was harvested.
    # NOTE(review): this only inserts; re-running the cell adds the same
    # records to the collection again.
    if processed:
        collection.insert_many(processed)



In [None]:
# Collect the organisations that have a parent organisation or whose name
# contains 'Twente' / starts with 'UT'.
all_ut_orgs = []
collection = db['openaire_cris_orgs']

for org in collection.find():
    # 'name' can be missing or None in stored documents; treat that as an
    # empty name instead of raising on the substring / prefix checks.
    org_name = org.get('name') or ''
    if org.get('part_of') or 'Twente' in org_name or org_name.startswith('UT'):
        all_ut_orgs.append(org)

print(f'Found {len(all_ut_orgs)} UT organisations')
print(all_ut_orgs)

In [None]:
from mus_wizard.models import Author, Topic, Organization, Group, Affiliation, Work
from collections import defaultdict, Counter
from rich import print, box
from rich.table import Table
from rich.console import Console
# Recording console, so the rendered tables can be exported later.
cons = Console(record=True)

authors_by_faculty = defaultdict(list)  # faculty -> list of Author instances
author_counts = {}                      # faculty -> number of authors (for printing)
total = 0                               # sum of the per-faculty counts

# Resolve authors per faculty: faculty -> groups -> affiliations -> authors.
for faculty in Group.Faculties.values:
    faculty_authors = Author.objects.filter(
        affiliation_details__in=Affiliation.objects.filter(
            groups__in=Group.objects.filter(faculty=faculty)
        )
    )
    # len() evaluates the queryset; the extend() below reuses the cached rows.
    n_faculty_authors = len(faculty_authors)
    author_counts[faculty] = n_faculty_authors
    total += n_faculty_authors
    authors_by_faculty[faculty].extend(faculty_authors)  # full list, needed for the topic lookup

# Store the 5 most common topics for each faculty.
# This does not need to be a separate loop, but it is easier to read this way.
topics_by_faculty = defaultdict(list)     # faculty -> [(Topic, occurrences), ...]
fields_per_faculty = defaultdict(list)    # faculty -> distinct topic fields
top_works_per_field = defaultdict(str)    # declared here, not filled in this cell
top_works_per_topic = defaultdict(str)
top_authors_per_topic = defaultdict(str)


for faculty, authors in authors_by_faculty.items():
    faculty_topics = Topic.objects.filter(authors__in=authors)
    # Count how often each topic row occurs in the query result.
    topic_counter = Counter(faculty_topics)
    topics_by_faculty[faculty].extend(topic_counter.most_common(5))
    # Extending a list with a Counter adds only its keys, so this stores the
    # distinct fields and not their counts.
    field_counter = Counter(t.field for t in faculty_topics)
    fields_per_faculty[faculty].extend(field_counter)


# Imported here because this cell builds Rich markup strings: titles and names
# are free text and may contain '[' / ']', which Rich would otherwise read as
# markup tags when the strings are rendered in a table.
from rich.markup import escape

# For each top topic, build the bullet lists of its 3 most cited works and its
# 3 authors with the most works.
for faculty, topics in topics_by_faculty.items():
    for topic, _linked_count in topics:
        topic_id = topic.openalex_id
        if topic_id in top_works_per_topic and topic_id in top_authors_per_topic:
            # The lists depend only on the topic, not on the faculty; a topic
            # shared by several faculties is queried once.
            continue
        works = Work.objects.filter(topics=topic).order_by('-cited_by_count')
        top_works_per_topic[topic_id] = "\n".join(
            f'- {escape(str(work.title))} ({work.cited_by_count})'
            for work in works[:3]
        )
        authors = Author.objects.filter(topics=topic).order_by('-works_count')
        top_authors_per_topic[topic_id] = "\n".join(
            f'- [link={author.openalex_id}]{escape(str(author.name))}[/link] ({author.works_count})'
            for author in authors[:3]
        )



# Overview table: one row per faculty with its author count.
result = Table(title='Authors per faculty', title_style='deep_pink2', show_header=True)
for header, colour in (('faculty', 'cyan'), ('# authors', 'orange1')):
    result.add_column(header, style=colour)
for faculty_name, n_authors in author_counts.items():
    result.add_row(faculty_name, str(n_authors))
cons.print(result)

# One table per faculty (the OTHER faculty is skipped) listing its top topics
# with the pre-computed top works and top authors of each topic.
topic_table_columns = (
    ('topics', 'orange1'),
    ('linked to # authors', 'pale_violet_red1'),
    ('top 3 works (#citations)', 'deep_pink2'),
    ('top 3 authors (#works)', 'deep_pink2'),
)
for faculty, topics in topics_by_faculty.items():
    if faculty == Group.Faculties.OTHER:
        continue
    result2 = Table(
        title=f'Top topics for [bold bright_magenta]{faculty}[/bold bright_magenta]',
        title_style='deep_pink2',
        show_header=True,
        show_lines=True,
    )
    for header, colour in topic_table_columns:
        result2.add_column(header, style=colour)
    for topic, linked_count in topics:
        topic_id = topic.openalex_id
        result2.add_row(
            topic.name,
            str(linked_count),
            str(top_works_per_topic[topic_id]),
            str(top_authors_per_topic[topic_id]),
        )

    cons.print(result2)

#from rich.terminal_theme import SVG_EXPORT_THEME

#cons.save_svg("topics_faculty.svg", title="Author & Topic count per faculty", theme=SVG_EXPORT_THEME)
#

# Find & remove duplicate model instances

In [None]:
from PureOpenAlex.models import UTData, Department
from django.db.models import Q, Count, Window, F, Min, Max
from django.db.models.functions import RowNumber

# transaction.atomic() is used below; it was not imported anywhere in the
# visible notebook, so the import lives with the code that needs it.
from django.db import transaction

# employee_ids that occur on more than one UTData row.
duplicates = (
    UTData.objects.values("employee_id")
    .annotate(count=Count("employee_id"))
    .filter(count__gt=1)
)
for duplicate in duplicates:
    # Read and delete inside one transaction so the kept row and the removed
    # rows are decided on a consistent view.
    with transaction.atomic():
        # Rank the rows of this employee by avatar (ascending); pk breaks ties
        # so the surviving row is deterministic. Ranking in Python avoids
        # filtering on a window-function annotation, which older Django
        # versions reject.
        ranked_pks = list(
            UTData.objects.filter(employee_id=duplicate["employee_id"])
            .order_by("avatar", "pk")
            .values_list("pk", flat=True)
        )
        # Keep the first-ranked row, delete every other row of this employee.
        UTData.objects.filter(pk__in=ranked_pks[1:]).delete()
