<a href="https://colab.research.google.com/github/zypchn/med-data-tr/blob/main/topic_mapping_dergipark_med.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install transformers -q

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
import copy
from transformers import pipeline
from concurrent.futures import ProcessPoolExecutor, as_completed, ThreadPoolExecutor
from typing import List, Dict
import re

In [3]:
f = open("journals.json")   # list of all journal names
journals = json.load(f)
len(journals)

2952

In [4]:
# Cleaning journal names to prevent mis-classification

records = []
for key, value in journals.items():
  set_name = key
  title = value

  pattern = r'\b\w*(?:Üniversite|Fakülte|Enstitü|Dergi|University|Faculty|Institution|Journal)\w*\b'
  cleaned_title = re.sub(pattern, '', title)
  cleaned_title = re.sub(r'\s+', ' ', cleaned_title).strip()
  rec = {
      "set_name": key,
      "title": cleaned_title if len(cleaned_title) != 0 else title
  }
  records.append(rec)

In [5]:
df = pd.DataFrame.from_records(records, index=None)
df.head()

Unnamed: 0,set_name,title
0,mulkiye,Mülkiye
1,yerblm,Cumhuriyet Yerbilimleri
2,marubsed,Para ve Sermaye Piyasaları
3,marusbd,Marmara Siyasal Bilimler
4,marustd,Sanat - Tasarım


In [6]:
df.shape      # matches the count of initial records

(2952, 2)

In [7]:
# topics generated by claude.AI

topics = [
    "Mühendislik, Teknoloji ve Fen Bilimleri",
    "Tıp ve Sağlık Bilimleri",
    "Sanat, Tasarım ve Mimarlık",
    "Ekonomi ve İşletme",
    "Spor Bilimleri",
    "Hukuk",
    "Tarih ve Coğrafya",
    "Dil ve Edebiyat",
    "Teoloji ve Din Araştırmaları",
    "Yaşam ve Çevre Bilimleri",
    "Siyaset ve Uluslararası İlişkiler",
    "Psikoloji ve Davranış Bilimleri",
    "İletişim ve Medya Çalışmaları",
    "Eğitim",
    "Sosyal Bilimler"
]

In [None]:
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli")

In [9]:
res = classifier(
      records[12]["title"],
      candidate_labels=topics)
records[12]["title"], res["labels"][0], res["scores"][0]

('Marmara İktisadi ve İdari Bilimler',
 'Ekonomi ve İşletme',
 0.9570614099502563)

In [10]:
# Topic mapping with parallel processing

class TopicMapper:
  def __init__(self, max_workers: int=3):
    self.max_workers = max_workers
    self.records_topics = []

  def _get_topic(self, record: Dict, classifier, topics: List) -> None:
    try:
      set_name = record["set_name"]
      title = record["title"]
      res = classifier(title, candidate_labels=topics)
      topic = res["labels"][0]
      topic_prob = res["scores"][0]
      rec = {
          "set_name": set_name,
          "title": title,
          "topic": topic,
          "topic_prob": topic_prob
      }
      self.records_topics.append(rec)
      print(f"topic extraction is successful for {title}")
    except Exception as e:
      print(e)

  def get_all_topics(self, records: List[Dict], classifier, topics: List) -> List[Dict]:
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [
                executor.submit(self._get_topic, record, classifier, topics)
                for record in records
            ]

            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    print(f"Thread error: {e}")

        return self.records_topics

In [None]:
mapper = TopicMapper(max_workers=10)
results = mapper.get_all_topics(records, classifier, topics)

In [12]:
len(results)    # matches the count of initial records

2952

In [13]:
df = pd.DataFrame.from_records(results, index=None)
df.head()

Unnamed: 0,set_name,title,topic,topic_prob
0,maruaebd,Marmara Atatürk Eğitim Eğitim Bilimleri,Eğitim,0.838757
1,maruaad,Marmara Avrupa Araştırmaları Avrupa Araştırmaları,Siyaset ve Uluslararası İlişkiler,0.208243
2,marusbd,Marmara Siyasal Bilimler,Sosyal Bilimler,0.863213
3,marumj,Marmara Medical,Tıp ve Sağlık Bilimleri,0.45438
4,marusad,Marmara Sosyal Araştırmalar,Sosyal Bilimler,0.712313


In [14]:
df.to_pickle("topic_mappings.pkl")

In [15]:
topic_df = pd.read_pickle("topic_mappings.pkl")
topic_df.head()

Unnamed: 0,set_name,title,topic,topic_prob
0,maruaebd,Marmara Atatürk Eğitim Eğitim Bilimleri,Eğitim,0.838757
1,maruaad,Marmara Avrupa Araştırmaları Avrupa Araştırmaları,Siyaset ve Uluslararası İlişkiler,0.208243
2,marusbd,Marmara Siyasal Bilimler,Sosyal Bilimler,0.863213
3,marumj,Marmara Medical,Tıp ve Sağlık Bilimleri,0.45438
4,marusad,Marmara Sosyal Araştırmalar,Sosyal Bilimler,0.712313


In [16]:
topic_df.shape

(2952, 4)

In [17]:
counts = topic_df["topic"].value_counts()
counts

Unnamed: 0_level_0,count
topic,Unnamed: 1_level_1
Ekonomi ve İşletme,414
Sosyal Bilimler,386
Eğitim,338
Tıp ve Sağlık Bilimleri,321
"Mühendislik, Teknoloji ve Fen Bilimleri",302
İletişim ve Medya Çalışmaları,230
Yaşam ve Çevre Bilimleri,185
Dil ve Edebiyat,134
Hukuk,131
Siyaset ve Uluslararası İlişkiler,122


In [18]:
# keeping these for another project
"""
included_topics = [
    "Tıp ve Sağlık Bilimleri",
    "Ekonomi ve İşletme",
    "Mühendislik, Teknoloji ve Fen Bilimleri"

    "Tarih ve Coğrafya",
    "Sanat, Tasarım ve Mimarlık",
    "Dil ve Edebiyat"
]
"""

# medical
med = ["Tıp ve Sağlık Bilimleri"]

In [19]:
selected_journals_df = topic_df[topic_df["topic"].isin(med)]

In [24]:
selected_journals_df.head()

Unnamed: 0,set_name,title,topic,topic_prob
3,marumj,Marmara Medical,Tıp ve Sağlık Bilimleri,0.45438
13,iucerrahpasa,Cerrahpaşa Tıp,Tıp ve Sağlık Bilimleri,0.690656
16,iujfp,of of Pharmacy of Istanbul,Tıp ve Sağlık Bilimleri,0.418258
28,iuonkder,Türk Onkoloji,Tıp ve Sağlık Bilimleri,0.795952
44,adiyamansaglik,Adıyaman Sağlık Bilimleri,Tıp ve Sağlık Bilimleri,0.778775


In [20]:
f = open("tur_records_filtered.json")     # previously filtered data (not by topic)
data = json.load(f)

records = []
def convert_to_dict(data):
  for d in data:
    rec = {
        "set_name": d[0],
        "year": d[1],
        "title": d[2],
        "dergipark_url": d[3],
        "pdf_url": d[4]
    }
    records.append(rec)

convert_to_dict(data)

In [21]:
rec_df = pd.DataFrame.from_records(records, index=None)
rec_df.head()

Unnamed: 0,set_name,year,title,dergipark_url,pdf_url
0,mulkiye,2023,Kapitalizmin Kurucu Dinamizmi Olarak Göç Olgus...,https://dergipark.org.tr/tr/pub/mulkiye/issue/...,https://dergipark.org.tr/tr/download/article-f...
1,mulkiye,2022,Prof. Dr. Tuncer Bulutay’ı Tanımak,https://dergipark.org.tr/tr/pub/mulkiye/issue/...,https://dergipark.org.tr/tr/download/article-f...
2,mulkiye,2022,"Eril Tahakküm, Toplumsal Cinsiyet ve Gazetecil...",https://dergipark.org.tr/tr/pub/mulkiye/issue/...,https://dergipark.org.tr/tr/download/article-f...
3,mulkiye,2022,NEOLİBERAL EMEK PİYASASINDA BİREYCİLİK VE BEYA...,https://dergipark.org.tr/tr/pub/mulkiye/issue/...,https://dergipark.org.tr/tr/download/article-f...
4,mulkiye,2023,‘YÜZYIL’LIK SERENCAM: BAŞKENT ANKARA’NIN DÖNÜŞ...,https://dergipark.org.tr/tr/pub/mulkiye/issue/...,https://dergipark.org.tr/tr/download/article-f...


In [22]:
rec_df.shape

(244868, 5)

In [29]:
set_names = list(selected_journals_df["set_name"])
topic_filtered_records = rec_df[rec_df["set_name"].isin(set_names)]

In [30]:
topic_filtered_records.shape

(24136, 5)

In [31]:
topic_filtered_records.head()

Unnamed: 0,set_name,year,title,dergipark_url,pdf_url
1242,iucerrahpasa,2019,A 16-Year Analysis of Antifungal Susceptibilit...,https://dergipark.org.tr/tr/pub/iucerrahpasa/i...,https://dergipark.org.tr/tr/download/article-f...
1243,iucerrahpasa,2019,Factors Affecting Mortality in Rapidly Progre...,https://dergipark.org.tr/tr/pub/iucerrahpasa/i...,https://dergipark.org.tr/tr/download/article-f...
1244,iucerrahpasa,2019,Tedavinin Sirolimus’tan Everolimus’a değiştiri...,https://dergipark.org.tr/tr/pub/iucerrahpasa/i...,https://dergipark.org.tr/tr/download/article-f...
1245,iucerrahpasa,2019,Case of Rhabdomyolysis in a Patient Undergoing...,https://dergipark.org.tr/tr/pub/iucerrahpasa/i...,https://dergipark.org.tr/tr/download/article-f...
1246,iucerrahpasa,2019,Assessing the Consistency of Clinical Diagnosi...,https://dergipark.org.tr/tr/pub/iucerrahpasa/i...,https://dergipark.org.tr/tr/download/article-f...


In [32]:
topic_filtered_records.to_pickle("topic_filtered_records.pkl")