In [None]:
''' This sets up the Django environment '''
import os
import django
from django.db.models import Count, Q, Prefetch, Exists, OuterRef
from rich import print

# Placeholder for a project path; not referenced by any cell visible in this notebook.
PROJECTPATH = ""
# Use the project's settings module unless the environment already names one.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "mus.settings")
# Opt out of Django's async-safety check (see the linked documentation).
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"  # https://docs.djangoproject.com/en/4.1/topics/async/#async-safety
# Must run after the environment variables are set and before any model import below.
django.setup()

from django.conf import settings
from loguru import logger
import asyncio
from pymongo import MongoClient
from PureOpenAlex.models import DBUpdate

# The connection string is read from the Django settings; a missing setting raises AttributeError.
MONGOURL = settings.MONGOURL

# One shared client for the notebook, plus a handle on the "mus" database.
MONGODB = MongoClient(MONGOURL)
db = MONGODB["mus"]

In [13]:
from xclass_refactor.other_apis_import import OpenAIREAPI
from xclass_refactor.mus_mongo_client import MusMongoClient
import httpx
import rich
# Look up a single DOI in the OpenAIRE search API and inspect the raw HTTP response.
oa = OpenAIREAPI(MusMongoClient())
# Only report whether credentials are available: notebook outputs are saved and
# shared, so the token (and a refresh URL that may embed one) must not be echoed.
print('token set:', bool(oa.token), '| refresh url set:', bool(oa.refreshurl))
url = 'https://api.openaire.eu/search/researchProducts'
headers = {
    'Authorization': f'Bearer {oa.get_token()}'
}
params = {'doi':'10.1080/10438599.2012.656527'}
# The context manager releases the connection pool even when the request raises.
with httpx.Client() as client:
    r = client.get(url, headers=headers, params=params)
rich.inspect(r)

In [None]:
from collections import defaultdict
from rich.table import Table, Column
from rich.console import Console
# Compare, per collection, how many OpenAlex works exist for each publication year.
# Three tallies are shown side by side: raw document count, distinct work ids
# (dict keys) and listed work ids; a difference between them reveals duplicate ids.
db = MONGODB['metadata_unificiation_system']
colls = [MONGODB['metadata_unificiation_system']['works_openalex'],MONGODB['mus']['api_responses_works_openalex']]
cons = Console()

for i, coll in enumerate(colls):
    # Fresh tallies for every collection: sharing them made the second table
    # show the running total of both collections instead of its own numbers.
    yeardict = defaultdict(int)
    paperlist = defaultdict(list)
    paperdict = defaultdict(dict)

    # Only the two fields used below are fetched instead of whole documents.
    for work in coll.find({}, {'publication_year': 1, 'id': 1}):
        # .get(): a document missing either field is tallied under None rather than aborting the run.
        year = work.get('publication_year')
        work_id = work.get('id')
        yeardict[year] += 1
        paperdict[year][work_id] = True
        paperlist[year].append(work_id)

    table = Table(show_header=True, header_style="bold magenta", title=f'result {i+1}')
    table.add_column("year", style='cyan')
    table.add_column("yeardict", justify="left", style="green", no_wrap=True)
    table.add_column("paperdict", justify="left", style="red", no_wrap=True)
    table.add_column("paperlist", justify="left", style="yellow", no_wrap=True)

    for year in range(2016, 2025):
        table.add_row(str(year), str(yeardict[year]), str(len(paperdict[year])), str(len(paperlist[year])))
    cons.print(table)

In [None]:
from PureOpenAlex.models import Paper, Journal

# List the papers of two specific journals together with their UT authors.
# The queryset gets its own name; the original reused `journal` for both the
# queryset and the loop variable.
journals = Journal.objects.filter(id__in=[16930,4789])
for journal in journals:
    print(journal.name, journal.id)

    # Prefetch the authors so the inner loop does not issue one query per paper.
    papers = journal.papers.all().prefetch_related('authors')
    print(papers.count())
    for paper in papers:
        print(paper.title, paper.id)
        print([author.name for author in paper.authors.all() if author.is_ut])




In [None]:
from PureOpenAlex.models import Paper
# Calls the project-defined manager method; presumably this deletes or merges rows in the database — confirm before re-running.
Paper.objects.remove_duplicates()

In [None]:
from PureOpenAlex.data_export import export_paper_data
# Export requests keyed by output filename; each value carries the filter list for that file.
# The TCS request used to sit here as a bare dict literal that was evaluated and
# thrown away. It is kept as a comment; add it to `requests` to export it as well:
#   'new_tcs_papers.csv': {'filters': [['TCS',''],['start_date','2016-01-01'],['end_date','2024-12-31']]},
requests = {
    'new_ee_papers.csv': {
        'filters': [['EE',''],['start_date','2016-01-01'],['end_date','2024-12-31']],
    },
}

export_paper_data(requests)

In [None]:
'''

Between 1-1-2016 and 31-12-2023

Open access status

For all output:

% of gold, green, ..., open access per year
Number of open access datasets, as published per year and cumulative
Number of open access software/design repositories, as published per year and cumulative

'''

from PureOpenAlex.models import Paper, Author, UTData, PureEntry, PilotPureData
import pandas as pd
from collections import defaultdict
from PureOpenAlex.constants import EEGROUPSABBR, TCSGROUPSABBR
import csv
def _report_filters(group):
    """Return a fresh filter list for one faculty over the 2016-2024 reporting window."""
    return [[group, ''], ['start_date', '2016-01-01'], ['end_date', '2024-12-31']]


# Paper selections for the two faculties, with the table prefetches applied.
eepapers = Paper.objects.filter_by(_report_filters('EE')).get_table_prefetches()
tcspapers = Paper.objects.filter_by(_report_filters('TCS')).get_table_prefetches()
# The counts used to be computed and discarded (a cell only displays its last
# expression), so they are printed explicitly.
print('EE papers:', eepapers.count())
print('TCS papers:', tcspapers.count())

datalist = []
csvs = []  # filled by the row-building cell: [EE rows, TCS rows]
years = list(range(2016, 2025))
oa_types = ['gold', 'green', 'hybrid', 'bronze', 'closed']
mus_url = 'https://openalex.samuelmok.cc/'
mus_api_url = mus_url + 'api/'





In [None]:
# See how many papers have Pure entries and Pure pilot data.
for group in [eepapers, tcspapers]:
    papers = 0                   # papers inspected
    pure_entries = 0             # Pure entries found over all papers
    paper_dupe_pure_entries = 0  # papers linked to more than one Pure entry
    pilot_data = 0               # Pure entries that carry pilot data
    for paper in group:
        papers += 1
        # Evaluate the relation once per paper; the original ran first(), count()
        # and all() separately for the same information.
        entries = list(paper.pure_entries.all())
        pure_entries += len(entries)
        if len(entries) > 1:
            paper_dupe_pure_entries += 1
        pilot_data += sum(1 for pure_entry in entries if pure_entry.pilot_pure_data)

    # Message fixed: the original started with a stray quote character and
    # carried the source indentation into the printed text.
    print(
        f'checked {papers} papers, {paper_dupe_pure_entries} with multiple pure entries\n'
        f'found {pure_entries} pure entries in total, of which {pilot_data} have pilot data'
    )



In [None]:
# Build one list of CSV row dicts per faculty; `csvs` ends up as [EE rows, TCS rows].
csvs = []  # reset here so that re-running this cell does not append a second copy
group_specs = [(eepapers, EEGROUPSABBR), (tcspapers, TCSGROUPSABBR)]
for groupn, (group, group_abbrs) in enumerate(group_specs, start=1):
    csvoutput = []

    for paper in group:
        paperauthors = paper.authors.filter(utdata__isnull=False)
        groups = paperauthors.get_ut_groups(group_abbrs)
        # Looked up once per paper; the original repeated this query for every
        # Pure-derived column below.
        first_entry = paper.pure_entries.first()
        paper_journal = paper.journal
        mapping = {
            'title':paper.title,
            'doi':paper.doi,
            'year':paper.year,
            'itemtype':paper.itemtype,
            'isbn':first_entry.isbn if first_entry else '',
            'topics':' | '.join([topic.get('display_name') for topic in paper.topics]) if paper.topics else '',
            'Authorinfo ->':'',
            'ut_authors':' | '.join([author.name for author in paperauthors]) if paperauthors else '',
            'ut_groups': ' | '.join(groups) if groups else '',
            'Openaccessinfo ->':'',
            'is_openaccess':paper.is_oa,
            'openaccess_type':paper.openaccess,
            # NOTE(review): these two column names and their source attributes look
            # swapped; left exactly as they were, confirm which pairing is intended.
            'found_as_green':paper.is_in_pure,
            'present_in_pure':paper.has_pure_oai_match,
            'license':paper.license,
            'URLs ->':'',
            'primary_link':paper.primary_link,
            'pdf_link_primary':paper.pdf_link_primary,
            'openalex_url':paper.openalex_url,
            'pure_page_link':first_entry.researchutwente if first_entry else '',
            'pure_file_link':first_entry.risutwente if first_entry else '',
            'scopus_link':first_entry.scopus if first_entry else '',
            'Journalinfo ->':'',
            'journal':paper_journal.name if paper_journal else '',
            'journal_issn':paper_journal.issn if paper_journal else '',
            'journal_e_issn':paper_journal.e_issn if paper_journal else '',
            'journal_publisher':paper_journal.publisher if paper_journal else '',
            'volume':paper.volume,
            'issue':paper.issue,
            'pages':paper.pages,
            'pagescount':paper.pagescount,
            'MUS links ->':'',
            'mus_paper_details':mus_url+'paper/'+str(paper.id),
            'mus_api_url_paper':mus_api_url+'paper/'+str(paper.id),
        }

        # Collect the API links of every Pure entry (and its pilot data, when
        # present) and join them once at the end.
        pure_entry_links = []
        pilot_links = []
        for pure_entry in paper.pure_entries.all():
            pure_entry_links.append(mus_api_url+'pureentry/'+str(pure_entry.id))
            if pure_entry.pilot_pure_data:
                pilot_links.append(mus_api_url+'pilotpure/'+str(pure_entry.pilot_pure_data.id))

        mapping['mus_api_url_pure_entry'] = ' | '.join(pure_entry_links)
        mapping['mus_api_url_pure_report_details'] = ' | '.join(pilot_links)
        csvoutput.append(mapping)
        if len(csvoutput) % 300 == 0:
            print(f'{len(csvoutput)}/{len(group)} papers processed')

    csvs.append(csvoutput)



In [None]:
# Column order of the exported CSV files; the '... ->' entries are empty
# separator columns that label the group of columns to their right.
keys = ['title',
        'doi',
        'year',
        'itemtype',
        'isbn',
        'topics',
        'Authorinfo ->',
        'ut_authors',
        'ut_groups',
        'Openaccessinfo ->',
        'is_openaccess',
        'openaccess_type',
        'found_as_green',
        'present_in_pure',
        'license',
        'URLs ->',
        'primary_link',
        'pdf_link_primary',
        'openalex_url',
        'pure_page_link',
        'pure_file_link',
        'scopus_link',
        'Journalinfo ->',
        'journal',
        'journal_issn',
        'journal_e_issn',
        'journal_publisher',
        'volume',
        'issue',
        'pages',
        'pagescount',
        'MUS links ->',
        'mus_paper_details',
        'mus_api_url_paper',
        'mus_api_url_pure_entry',
        'mus_api_url_pure_report_details'
    ]
# csvs[0] holds the EE rows and csvs[1] the TCS rows, so the position in this
# list selects the data instead of a comparison on the filename.
for index, file in enumerate(['ee_data.csv', 'tcs_data.csv']):
    # `with` closes (and flushes) the file even when writing a row raises.
    with open(file, 'w', newline='', encoding='utf-8') as myFile:
        writer = csv.DictWriter(myFile, fieldnames=keys)
        writer.writeheader()
        writer.writerows(csvs[index])



In [None]:
# NOTE(review): `df_compact` is not created in any cell visible here — confirm where it comes from.
# Row counts per (group, open-access type) on the full frame.
oa_counts_by_group = df_compact.groupby(['group', 'openaccess_type']).count()
print(oa_counts_by_group)

# Without the group column, rows that differed only by group collapse into one.
df_dropped = df_compact.drop(columns='group').drop_duplicates()
for grouping in (['openaccess_type', 'itemtype'], ['year']):
    print(df_dropped.groupby(grouping).count())

In [None]:
from PureOpenAlex.models import Author, Paper, Authorship
# Flag authorships, and the papers they belong to, whose UT author has a
# "Twente" affiliation that covers the paper's publication year.
#
# The loop walks paper -> authorships -> author -> affiliations -> organization,
# so exactly that chain is prefetched. The earlier prefetch targeted
# `paper.authors`, which the loop never touches.
papers = Paper.objects.filter(year__gte=2019).prefetch_related(
    'authorships__author__affiliations__organization'
)

results = {
    'checked': 0,
    'matched': 0,
}
changed_authorships = 0
changed_papers = 0
print(papers.count())
for paper in papers:
    results['checked'] += 1
    match = False
    for authorship in paper.authorships.all():
        author = authorship.author
        if not author.is_ut:
            continue
        # any() stops at the first qualifying affiliation, so each authorship is
        # saved and counted once even when several affiliations qualify.
        year_matches = any(
            'twente' in affl.organization.name.lower() and int(paper.year) in affl.years
            for affl in author.affiliations.all()
        )
        if year_matches:
            match = True
            authorship.ut_author_year_match = True
            authorship.save()
            changed_authorships += 1
    if match:
        paper.has_any_ut_author_year_match = True
        paper.save()
        changed_papers += 1
        results['matched'] += 1
    if results['checked'] % 1000 == 0:
        print(changed_authorships, changed_papers, results)


print(changed_authorships, changed_papers)
print(results)

In [None]:
from PureOpenAlex.models import PureEntry, PilotPureData
from django.db import transaction

# Link every PilotPureData row to a PureEntry (2018 or later), matching on DOI,
# then on the Pure id inside the risutwente link, then on a case-insensitive title.
alltcsitems = PilotPureData.objects.all().only('doi','pureid','title')
pureentries = PureEntry.objects.filter(year__gte=2018).only('id','doi', 'title', 'risutwente')


def _find_pure_matches(item):
    """Return the queryset of PureEntry rows matching `item`, or None when nothing matches.

    Lookups are tried in order of reliability. A lookup is skipped when the item
    has no value for it: an empty DOI, id or title would otherwise match
    unrelated entries (for example every entry with an empty DOI).
    """
    lookups = (
        ('doi', item.doi),
        ('risutwente__contains', item.pureid),
        ('title__iexact', item.title),
    )
    for lookup, value in lookups:
        if not value:
            continue
        candidates = pureentries.filter(**{lookup: value})
        if candidates.exists():
            return candidates
    return None


k = 0  # items without any matching PureEntry
j = 0  # items linked to a PureEntry
z = 0  # items whose lookup returned more than one PureEntry
for processed, item in enumerate(alltcsitems, start=1):
    purematch = _find_pure_matches(item)

    if purematch is None:
        k += 1
    else:
        j += 1
        print(item.title)
        purematchitem = purematch.first()
        print(purematchitem)
        if purematch.count() > 1:
            z += 1
        with transaction.atomic():
            # Shows the link before and after the assignment.
            print(purematchitem.pilot_pure_data)
            purematchitem.pilot_pure_data = item
            purematchitem.save()
            print(purematchitem.pilot_pure_data)

    # Progress is tied to the number of items processed. Keyed on the match
    # counter, it fired on every unmatched item while that counter sat at 0, 100, ...
    if processed % 100 == 0:
        print(f"{k} entries not matched")
        print(f"{j} entries matched")
        print(f'{z} multiple matches')

print(f"{k} entries not matched")
print(f"{j} entries matched")
print(f'{z} multiple matches')







In [None]:
from PureOpenAlex.models import Author, Affiliation, Organization
from PureOpenAlex.data_repair import fixMissingAffils

# Runs the project's repair routine from PureOpenAlex.data_repair; what it changes is not visible here — presumably it writes to the database, confirm before re-running.
fixMissingAffils()


In [None]:
from pprint import pprint
from rich import print
from pymongo import MongoClient
import json
# Dump a small sample of OpenAlex works, each paired with its Crossref record
# (when one exists), to output_<n>.json for inspection.
#
# The settings-backed client replaces a connection string with inline
# credentials. The password that was embedded here should be rotated: it
# remains in the notebook's history.
client = MONGODB
db = client['mus']
datasets = []
SAMPLE_SIZE = 5
openalex_works = db['api_responses_works_openalex']
crossref_info = db['api_responses_crossref']
for i, document in enumerate(openalex_works.find().limit(SAMPLE_SIZE)):
    crossrefdoc = None
    raw_doi = document.get('doi')
    if raw_doi:
        # Crossref records are keyed on the bare DOI, without the resolver prefix.
        doi = raw_doi.replace('https://doi.org/', '')
        try:
            crossrefdoc = crossref_info.find_one({'DOI': doi})
        except Exception as e:
            # Best effort: a failed lookup leaves the Crossref part empty.
            print('error: ', e)

    # ObjectId values are not JSON serialisable, so store their string form.
    document['_id'] = str(document['_id'])
    if crossrefdoc is not None:
        crossrefdoc['_id'] = str(crossrefdoc['_id'])
    dataset = {
        'works_openalex': document,
        'crossref': crossrefdoc,
    }

    datasets.append(dataset)
    with open(f'output_{i}.json', 'w') as f:
        # default=str covers remaining non-JSON values such as datetimes.
        json.dump(dataset, f, default=str)
print(datasets)


In [None]:
from PureOpenAlex.models import PureAuthor, Author
from django.db import transaction
from nameparser import HumanName
from PureOpenAlex.namematcher import NameMatcher
from unidecode import unidecode
from pprint import pprint

def _index_names(people):
    """Index `people` (objects with `.id` and `.name`) by their normalised name.

    Names are ASCII-folded with unidecode and parsed with HumanName.

    Returns a tuple of three dicts:
      * id -> {'full': full name, 'initials': "<initials> <last name>"}
      * full name -> id
      * "<initials> <last name>" -> id
    When several people share a name, the last one processed wins in the two
    name-keyed dicts.
    """
    names_by_id = {}
    id_by_full_name = {}
    id_by_initials = {}
    for person in people:
        hname = HumanName(unidecode(person.name), initials_format="{first} {middle}")
        full_name = hname.full_name
        initials_name = hname.initials() + " " + hname.last
        names_by_id[person.id] = {'full': full_name, 'initials': initials_name}
        id_by_full_name[full_name] = person.id
        id_by_initials[initials_name] = person.id
    return names_by_id, id_by_full_name, id_by_initials


# The same indexing is applied to both author tables; it used to be two copy-pasted loops.
allpureauthors = PureAuthor.objects.all()
purenames, purefullnames, pureinitials = _index_names(allpureauthors)

allauthors = Author.objects.all()
authnames, authfullnames, authinitials = _index_names(allauthors)

purefullnameset = set(purefullnames.keys())
pureinitialsset = set(pureinitials.keys())

authfullnameset = set(authfullnames.keys())
authinitialsset = set(authinitials.keys())

print('# pure authors in db',allpureauthors.count())
print('# pure authors in set (unique names)',len(purefullnameset))
print('# authors in db',allauthors.count())
print('# authors in set (unique names)',len(authfullnameset))

# Full names that occur in both tables.
intersection = purefullnameset.intersection(authfullnameset)
print('# common names',len(intersection))

# For every full name present in both tables, attach the Author to all Pure
# entries of the PureAuthor with that name (as author and as creator).
from PureOpenAlex.models import PureEntry

listtosave = []  # entries that received a new author link
j = 0  # links added through pure_creators
h = 0  # links added through pure_entries


def _link_author(entries, author):
    """Add `author` to each entry in `entries` that lacks it; return the number of links added."""
    added = 0
    for entry in entries:
        # One EXISTS query instead of loading every author of the entry for an `in` test.
        if not entry.authors.filter(pk=author.pk).exists():
            entry.authors.add(author)
            listtosave.append(entry)
            added += 1
    return added


for i, name in enumerate(intersection):
    pureauthor = PureAuthor.objects.get(id=purefullnames[name])
    author = Author.objects.get(id=authfullnames[name])
    h += _link_author(pureauthor.pure_entries.all(), author)
    j += _link_author(pureauthor.pure_creators.all(), author)
    if i % 1000 == 0:
        print('# of entries that need updating:', len(listtosave))
        print('# of intersections checked:', i)
        print('pureentries:',h)
        print('purecreators:',j)



In [None]:

from PureOpenAlex.models import Author,PureEntry
from django.db.models import Q
# Pure entries without any linked Author.
noauths = PureEntry.objects.filter(authors__isnull=True).distinct()
print(noauths.count())

# Of those, the ones that still list at least one creator or contributor.
has_creator = Q(creators__isnull=False)
has_contributor = Q(contributors__isnull=False)
with_people = noauths.filter(has_creator | has_contributor).distinct()
print(with_people.count())

In [None]:
from PureOpenAlex.models import Identifier, PureEntry
from collections import defaultdict
from django.db import transaction

# Rebuild the identifier columns of every PureEntry from its Identifier rows.
# For each entry the columns are cleared and refilled: the first identifier of a
# known type fills the matching column, further ones go to `duplicate_ids`, and
# identifiers of other types are sorted into `other_links` by URL content.

# URL fragment -> bucket; the first fragment found in the URL wins (dict order).
MATCHURLCONTENT = {
    "itc.utwente.nl": "itc_content",
    "www.itc.nl": "itc_content",
    "arxiv": "arxiv",
    "zenodo": "zenodo",
    "github": "github",
    "https://10.": "doi",
    "http://10.": "doi",
}
# Identifier types that map one-to-one onto a PureEntry field of the same name.
MATCHIDTYPES = {
"doi": '',
"isbn": '',
"researchutwente": '',
"risutwente": '',
"scopus": ''
}
EZPROXY_PREFIX = 'https://ezproxy2.utwente.nl/login?url='
ISBN_URN_PREFIX = 'urn:ISBN:'
UPDATE_FIELDS = ['doi', 'isbn', 'researchutwente', 'risutwente', 'scopus', 'other_links', 'duplicate_ids']
BATCH_SIZE = 1000


def _normalise_doi_url(url):
    """Repair a DOI URL: restore a missing slash after doi.org and add the resolver host to bare 10.x links."""
    url = str(url).replace('doi.org1', 'doi.org/1')
    if 'doi.org' in url:
        return url
    if 'http://' in url:
        return url.replace('http://', 'https://doi.org/')
    if 'https://' in url:
        return url.replace('https://', 'https://doi.org/')
    return url


def _apply_identifier(entry, identifier):
    """Store one identifier on `entry` (in memory only; nothing is saved here)."""
    url = identifier.url.replace(EZPROXY_PREFIX, '')
    # Compare on the string form throughout: the original mixed str(idtype) and
    # raw idtype comparisons, which disagree when idtype is not a plain string.
    idtype = str(identifier.idtype)

    if idtype in MATCHIDTYPES:
        if idtype == 'doi':
            url = url.replace('doi.org1', 'doi.org/1')
        elif idtype == 'isbn' and url.lower().startswith(ISBN_URN_PREFIX.lower()):
            # Remove the prefix itself. str.strip() treats its argument as a set
            # of characters and removes them from both ends of the string.
            url = url[len(ISBN_URN_PREFIX):]
        # The field name equals the identifier type for all known types.
        if not getattr(entry, idtype):
            setattr(entry, idtype, url)
        else:
            entry.duplicate_ids[idtype].append(url)
        return

    for fragment, bucket in MATCHURLCONTENT.items():
        if fragment not in url:
            continue
        if bucket != "doi":
            entry.other_links[bucket].append(url)
        else:
            # A DOI stored under another identifier type, usually badly formatted.
            url = _normalise_doi_url(url)
            if not entry.doi:
                entry.doi = url
            elif url != entry.doi and url not in entry.duplicate_ids['doi']:
                entry.duplicate_ids['doi'].append(url)
        return

    if idtype == "other":
        entry.other_links['other'].append(url)
    else:
        print("idtype not found/not matched", identifier.idtype, url)


def _flush(batch):
    """Write the rebuilt fields of `batch` in a single transaction."""
    with transaction.atomic():
        PureEntry.objects.bulk_update(batch, UPDATE_FIELDS)


bulklist = []
i = 0  # entries written so far
j = 0  # identifiers processed so far
allentries=PureEntry.objects.all().filter(identifiers__isnull=False).only('doi', 'isbn', 'researchutwente', 'risutwente', 'scopus', 'other_links','id', 'duplicate_ids').prefetch_related("identifiers")
for entry in allentries:
    # Start from a clean slate so the result depends only on the Identifier rows.
    entry.doi = ""
    entry.isbn = ""
    entry.researchutwente = ""
    entry.risutwente = ""
    entry.scopus = ""
    entry.other_links = defaultdict(list)
    entry.duplicate_ids = defaultdict(list)
    for identifier in entry.identifiers.all():
        j += 1
        _apply_identifier(entry, identifier)
    bulklist.append(entry)
    if len(bulklist) == BATCH_SIZE:
        _flush(bulklist)
        i += len(bulklist)
        bulklist = []
        print(str(i) + " entries done")
        print(str(j) + " identifiers processed in total")

# Write the final partial batch; without this the last (up to 999) entries were
# rebuilt in memory but never saved.
if bulklist:
    _flush(bulklist)
    i += len(bulklist)
    bulklist = []
    print(str(i) + " entries done")
    print(str(j) + " identifiers processed in total")


In [None]:
from PureOpenAlex.data_repair import matchAFASwithAuthor
# Split the AFAS name matches into perfect scores (accepted outright) and the
# rest, then print the rest as an aligned, numbered list for manual review.
results = matchAFASwithAuthor()


def _gap(width):
    """Return width-1 spaces (none for width <= 1), the padding unit used for column alignment."""
    return " " * max(width - 1, 0)


accepted = []
rejected = []
for result in results:
    (accepted if result[1] == 1.0 else rejected).append(result)

# `space` is as wide as the longest reviewed name allows; it sets the score column.
longest_name = max((len(f"{result[2].first} {result[2].last}") for result in rejected), default=0)
space = _gap(longest_name)

i = 0
keep = [2,9,11,13,19]  # review-list indices accepted by hand
for result in rejected:
    hand_picked = i in keep
    if hand_picked:
        accepted.append(result)
    # The "[X]" marker takes three columns, so marked rows get three fewer padding columns.
    acceptedcheck = "[X]" if hand_picked else ""
    extranum = 2 if hand_picked else 5
    # Single-digit indices are one character shorter; compensate to keep the columns aligned.
    extraspace = " " if i < 10 else ""

    curspace = _gap(extranum + len(space) - len(f"{result[2].first} {result[2].last}"))

    print(f"[{i}]{acceptedcheck} {result[2].first} {result[2].last}{curspace}{extraspace}[{int(result[1]*100)}]   {result[3]}")
    i += 1




In [None]:
from PureOpenAlex.models import Author
# Resolve every accepted AFAS name to an Author row, applying the manual
# decisions in `final`, and sort the results into accept / reject / other.


def _pad(width):
    """Return width-1 spaces (none for width <= 1), the padding unit used for column alignment."""
    return " " * max(width - 1, 0)


space = _pad(max((len(f"{result[2].first} {result[2].last}") for result in accepted), default=0))
i = 0
accept = []
other = []
reject = []

# dict: the key is the i-index of the result, the value is 0 (no match), 1 (first match), 2 (second match), 3 (other)
# if there is only 1 match always accept, except when overruled by the dict below
# if there are more than 2 matches mark as other.

final={5:1, 6:2, 8:3, 15:2, 17:3, 18:3, 21:3, 26:3, 28:3, 29:3, 30:2, 31:2, 34:0, 38:1, 43:3, 47:3, 48:3, 50:0, 57:1, 59:0, 62:3, 63:3, 65:1, 67:0, 68:3, }

for result in accepted:
    print("---------------------")
    first_name, last_name = result[2].first, result[2].last
    # Progressively looser lookups: full name, then first + last name, then last name only.
    matchedauthors = Author.objects.filter(name__icontains=" ".join([first_name, last_name]))
    if not matchedauthors.exists():
        matchedauthors = Author.objects.filter(first_name__icontains=first_name, last_name__icontains=last_name)
        if not matchedauthors.exists():
            matchedauthors = Author.objects.filter(last_name__icontains=last_name)

    # Counted once; the original re-ran count() for every branch test.
    match_count = matchedauthors.count()

    curspace = _pad(2 + len(space) - len(f"{result[2].first} {result[2].last}"))
    extraspace = ""
    if result[1] != 1.0:
        extraspace = " "

    print(f"[{i}] {result[2].first} {result[2].last}{curspace}[{int(result[1]*100)}]{extraspace}   {result[3]}")
    if match_count == 2:
        first_match = matchedauthors.first()
        last_match = matchedauthors.last()
        print(f"          2 matches found: {first_match.name} and {last_match.name}")
        # A two-way tie needs a manual decision; a KeyError here means `final`
        # has no entry for this index yet.
        decision = final[i]
        if decision == 0:
            print("Discarded.")
            reject.append([result,None])
        elif decision == 1:
            print(f"Accepted {first_match.name}.")
            accept.append([result,first_match])
        else:
            print(f"Accepted {last_match.name}.")
            accept.append([result,last_match])
    elif match_count > 2:
        print(f"          {match_count} matches found.")
        print("To others.")
        other.append([result,matchedauthors])
    elif match_count == 0:
        print(f"          No matches found.")
        reject.append([result,None])
    else:
        only_match = matchedauthors.first()
        print(f"          Match: {only_match.name}")
        # A single match is accepted unless `final` overrules it. The default
        # replaces a bare `except:` that also hid unrelated errors.
        decision = final.get(i, 1)
        if decision == 0:
            print("!DISCARDED!")
            reject.append([result,None])
        elif decision == 1:
            print("Accepted.")
            accept.append([result,only_match])
        elif decision == 3:
            print('To others.')
            other.append([result,only_match])
        # NOTE(review): a decision of 2 on a single match files the result
        # nowhere (unchanged from the original) — confirm this is intended.
    i += 1

print(f"Accepted: {len(accept)}, Rejected: {len(reject)}, Other: {len(other)}")



In [None]:
# Promote hand-picked candidates from `other` into `accept`.
# `matching` maps the index of an item in `other` to the index of the candidate
# author to accept for it.
# NOTE: re-running this cell appends the same picks to `accept` again.
i = 0
matching = {0:42, 2:2, 3:28, 4:1, 7:0, 12:10, 15:10, 16:11}
extraaccepted = []
print(len(accept))
print(len(reject))
for entry in other:
    authorindex = matching.get(i)
    candidates = entry[1]
    # A lone Author (a single match routed to `other`) offers no list to pick from.
    if authorindex is not None and not isinstance(candidates, Author):
        for j, author in enumerate(candidates):
            if j == authorindex:
                # Store [result, author], the shape of every other item in
                # `accept`. The original stored the whole `other` item, so
                # accept[n][0][2] pointed past the end of a two-element list.
                accept.append([entry[0], author])
                extraaccepted.append([entry[0], author])
                break
    i += 1

print(len(accept))


In [None]:
# Describe the layout of each accepted [details, author] pair, then show the
# AFAS name next to the OpenAlex name it was matched with.
for i, entry in enumerate(accept):
    layout_lines = (
        f"accept[{i}] has a list with details (accept[{i}][0]) for author {entry[1].name} (accept[{i}][1]) ",
        f"[{i}][0][0]: openalex api response for author",
        f"[{i}][0][1]: matching score",
        f"[{i}][0][2]: initial matching name from AFASdata",
        f"[{i}][0][3]: matched name in openalex",
    )
    for line in layout_lines:
        print(line)
    print(entry[0][2]," -- ", entry[0][3])






In [None]:
from PureOpenAlex.models import AFASData
from django.db import transaction
# Attach the AFAS record with the same full name to every accepted author.
for entry in accept:
    details = entry[0]
    try:
        name = details[2].full_name
    except (IndexError, KeyError, AttributeError, TypeError):
        # Items promoted from the multi-match list may be nested one level
        # deeper: [[result, candidates], author]. Only shape errors are handled;
        # the previous bare `except:` hid every other failure too.
        name = details[0][2].full_name
    afas = AFASData.objects.filter(name=name).first()
    if afas:
        with transaction.atomic():
            entry[1].afas_data = afas
            entry[1].save()

In [None]:
from PureOpenAlex.models import UTData, Department
from django.db.models import Q, Count, Window, F, Min, Max
from django.db.models.functions import RowNumber

# Remove duplicate UTData rows: for every employee_id that occurs more than
# once, keep the row that sorts first on `avatar` and delete the others.
from django.db import transaction  # imported here so the cell runs on a fresh kernel

duplicates = (
    UTData.objects.values("employee_id")
    .annotate(count=Count("employee_id"))
    .filter(count__gt=1)
)
for duplicate in duplicates:
    rows = UTData.objects.filter(employee_id=duplicate["employee_id"])
    # Pick the survivor explicitly instead of filtering on a RowNumber() window
    # annotation, which Django only supports from 4.2 onwards. The pk tie-break
    # makes the choice deterministic when avatars are equal.
    keep_pk = rows.order_by("avatar", "pk").values_list("pk", flat=True).first()
    with transaction.atomic():
        rows.exclude(pk=keep_pk).delete()
