In [1]:
import pandas as pd
import re
import ast

df = pd.read_csv("works.csv")

df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# keep only printable characters
df = df.applymap(lambda x: re.sub(r"[^\x20-\x7E]", " ", x) if isinstance(x, str) else x)

df = df.replace(r'^\s*$', None, regex=True)

# clean DOIs (remove URL prefixes and spaces)
def clean_doi(doi):
    if not isinstance(doi, str):
        return None
    doi = doi.strip().lower()
    doi = re.sub(r'https?://(dx\.)?doi\.org/', '', doi)
    doi = doi.split(' ')[0]
    return doi or None

df["doi"] = df["doi"].apply(clean_doi)

df = df.drop_duplicates(subset=["doi", "title"])

df.to_csv("works_clean.csv", index=False)


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  df = df.applymap(lambda x: re.sub(r"[^\x20-\x7E]", " ", x) if isinstance(x, str) else x)


In [2]:
df_c = pd.read_csv("works_clean.csv")

print("Shape:", df_c.shape)
print("\nColumns:", df_c.columns.tolist())
print("\nSample rows:")
print(df_c.head(2))

print("\nMissing DOIs:", df_c['doi'].isna().sum())
print("Missing abstracts (EN):", df_c['abstract_en'].isna().sum())
print("Missing titles:", df_c['title'].isna().sum())

#text check
print("\nExample title sample:")
for t in df_c['title'].dropna().sample(min(3, len(df_c)), random_state=1):
    print("-", t[:80])


Shape: (6536, 11)

Columns: ['orgunit', 'title', 'pub_type', 'date_issued', 'authors', 'doi', 'publisher', 'abstract_en', 'abstract_de', 'pdf_paths', 'pdf_count']

Sample rows:
          orgunit                                              title  \
0            E194  Unplugged Decision Tree Learning   A Learning ...   
1  E194;E235;E105  Opportunities and pitfalls of regression algor...   

        pub_type date_issued  \
0  Inproceedings        2025   
1        Article  2025-02-01   

                                             authors  \
0                     Lehner, Lukas,Landman, Martina   
1  Huymajer, Marco,Filzmoser, Peter,Mazak, Alexan...   

                              doi                    publisher  \
0     10.1007/978-3-031-73257-7_4  Springer Nature Switzerland   
1  10.1016/j.engappai.2024.109599                          NaN   

                                         abstract_en abstract_de pdf_paths  \
0  Artificial intelligence (AI) is now deeply ing...         Na

In [3]:
import requests
import time

def fetch_openalex_metadata(doi):
    if not isinstance(doi, str) or not doi:
        return None
    url = f"https://api.openalex.org/works/https://doi.org/{doi}"
    try:
        r = requests.get(url, timeout=10)
        if r.status_code != 200:
            return None
        data = r.json()
        return {
            "openalex_id": data.get("id"),
            "concepts": [c["display_name"] for c in data.get("concepts", [])],
            "cited_by_count": data.get("cited_by_count"),
            "referenced_works_count": len(data.get("referenced_works", []))
        }
    except requests.RequestException:
        return None

metadata_records = []
for i, doi in enumerate(df["doi"]):
    if pd.isna(doi):
        metadata_records.append({})
        continue
    meta = fetch_openalex_metadata(doi)
    metadata_records.append(meta or {})
    if i % 20 == 0:
        print(f"Processed {i} / {len(df)} DOIs")
    time.sleep(0.5)  

meta_df = pd.DataFrame(metadata_records)
df_enriched = pd.concat([df, meta_df], axis=1)

df_enriched.to_csv("works_enriched.csv", index=False)


Processed 0 / 6536 DOIs


KeyboardInterrupt: 

In [4]:
df = pd.read_csv("works_enriched.csv")

print("Shape:", df.shape)
print("\nNew columns:", [c for c in df.columns if c in ['openalex_id', 'concepts', 'cited_by_count', 'referenced_works_count']])

sample = df[df['openalex_id'].notna()].head(10)[
    ['title', 'doi', 'concepts', 'cited_by_count', 'referenced_works_count']
]
print("\nSample enriched rows:")
print(sample.to_string(index=False))

print("\nEnriched entries count:", df['openalex_id'].notna().sum())
print("Missing OpenAlex IDs:", df['openalex_id'].isna().sum())

Shape: (6683, 15)

New columns: ['openalex_id', 'concepts', 'cited_by_count', 'referenced_works_count']

Sample enriched rows:
                                                                                                                                   title                            doi                                                                                                                                                                                                                                                                                                                       concepts  cited_by_count  referenced_works_count
                                           Unplugged Decision Tree Learning   A Learning Activity for Machine Learning Education in K-12    10.1007/978-3-031-73257-7_4                                                                                                                                                    ['Computer science', 'Decision 

In [5]:
df = pd.read_csv("works_enriched.csv")

# normalize concepts
def normalize_concepts(val):
    if pd.isna(val):
        return ""
    try:
        items = ast.literal_eval(val) if isinstance(val, str) and val.startswith('[') else val.split(';')
        items = [x.strip().lower() for x in items if isinstance(x, str)]
        return "; ".join(sorted(set(items)))
    except Exception:
        return ""

df["concepts_norm"] = df["concepts"].apply(normalize_concepts)

# clean abstracts
df["abstract_en"] = df["abstract_en"].fillna("").str.replace(r"\s+", " ", regex=True).str.strip()
df["title"] = df["title"].fillna("").str.strip()

df_ready = df[(df["abstract_en"].str.len() > 50) | (df["concepts_norm"].str.len() > 0)]

df_ready.to_csv("works_ready.csv", index=False)


In [6]:
df = pd.read_csv("works_ready.csv")

print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())

print("\nMissing abstracts:", df['abstract_en'].isna().sum())
print("Missing concepts:", df['concepts'].isna().sum())
print("Missing DOIs:", df['doi'].isna().sum())

print("\nSample entries:")
print(df[['title', 'concepts', 'abstract_en']].head(3))

# check how many rows are actually usable for retrieval
usable = df[(df['abstract_en'].str.len() > 50) | (df['concepts'].notna())]
print(f"\nUsable entries: {len(usable)} / {len(df)} ({len(usable)/len(df)*100:.1f}%)")

sample_concepts = df['concepts'].dropna().sample(5, random_state=42).tolist()
print("\nSample concept lists:")
for c in sample_concepts:
    print("-", c)


print("\nAverage abstract length (chars):", df['abstract_en'].dropna().apply(len).mean())


Shape: (5814, 16)

Columns: ['orgunit', 'title', 'pub_type', 'date_issued', 'authors', 'doi', 'publisher', 'abstract_en', 'abstract_de', 'pdf_paths', 'pdf_count', 'openalex_id', 'concepts', 'cited_by_count', 'referenced_works_count', 'concepts_norm']

Missing abstracts: 1314
Missing concepts: 1583
Missing DOIs: 1699

Sample entries:
                                               title  \
0  Unplugged Decision Tree Learning   A Learning ...   
1  Opportunities and pitfalls of regression algor...   
2    SPiKE: 3D Human Pose from Point Cloud Sequences   

                                            concepts  \
0  ['Computer science', 'Decision tree', 'Artific...   
1  ['Computer science', 'Residual', 'Regression',...   
2  ['Computer science', 'Point cloud', 'Spike (so...   

                                         abstract_en  
0  Artificial intelligence (AI) is now deeply ing...  
1  The residual value of heavy equipment is essen...  
2                                                N

In [7]:
df = pd.read_csv("works_ready.csv")

print(df['concepts'].head(3))
print(type(df['concepts'].iloc[0]))  

# parse stringified lists
def safe_parse_list(x):
    if isinstance(x, str) and x.startswith('['):
        try:
            return ast.literal_eval(x)
        except:
            return [x]
    return x if isinstance(x, list) else [x] if pd.notna(x) else []

df['concepts'] = df['concepts'].apply(safe_parse_list)
df['concepts_norm'] = df['concepts_norm'].apply(safe_parse_list)

print(df['concepts'].iloc[0])
print(type(df['concepts'].iloc[0]))


0    ['Computer science', 'Decision tree', 'Artific...
1    ['Computer science', 'Residual', 'Regression',...
2    ['Computer science', 'Point cloud', 'Spike (so...
Name: concepts, dtype: object
<class 'str'>
['Computer science', 'Decision tree', 'Artificial intelligence', 'Machine learning', 'Decision tree learning', 'Tree (set theory)', 'Mathematics', 'Mathematical analysis']
<class 'list'>


In [8]:
def clean_text(s):
    if pd.isna(s): return ""
    s = str(s).replace("\\n", " ").replace("\\r", " ")
    s = s.replace("  ", " ").strip()
    return s

for col in ['title', 'abstract_en', 'abstract_de']:
    df[col] = df[col].apply(clean_text)

df = df.drop(columns=["orgunit"])


In [9]:
df.to_csv("works_final.csv", index=False)


In [10]:
df = pd.read_csv("works_final.csv")

print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

print("\nMissing abstracts:", df['abstract_en'].isna().sum())
print("Missing concepts:", df['concepts'].isna().sum())
print("Missing DOIs:", df['doi'].isna().sum())

# verify that concepts are stored as lists, not strings
def check_and_parse(x):
    if isinstance(x, str) and x.startswith('['):
        try: return ast.literal_eval(x)
        except: return [x]
    return x if isinstance(x, list) else [x] if pd.notna(x) else []

df['concepts'] = df['concepts'].apply(check_and_parse)

print("\nExample row:")
print(df[['title','concepts','abstract_en']].iloc[0])

print("\nAverage abstract length (chars):", df['abstract_en'].dropna().apply(len).mean())
print("Average concept count:", df['concepts'].apply(len).mean())


Shape: (5814, 15)
Columns: ['title', 'pub_type', 'date_issued', 'authors', 'doi', 'publisher', 'abstract_en', 'abstract_de', 'pdf_paths', 'pdf_count', 'openalex_id', 'concepts', 'cited_by_count', 'referenced_works_count', 'concepts_norm']

Missing abstracts: 1314
Missing concepts: 0
Missing DOIs: 1699

Example row:
title          Unplugged Decision Tree Learning  A Learning A...
concepts       [Computer science, Decision tree, Artificial i...
abstract_en    Artificial intelligence (AI) is now deeply ing...
Name: 0, dtype: object

Average abstract length (chars): 1137.6671111111111
Average concept count: 10.438080495356036
