In [4]:
# server_flask_threaded.py
"""
Notebook-friendly Flask server that wraps scopus_app_core.process_author and
adds four Excel sheets per author file: Articles (all rows), BAU Articles
(rows matching the affiliation ID), and APP Details / APP Summary
(Participation Score; years default to 2022, 2023 and 2024).

Usage (Jupyter):
    import server_flask_threaded as server  # module will start the server thread
    # Server object is 'server' at module level; to stop:
    server.server.shutdown()

Or run as script:
    python server_flask_threaded.py
"""
from __future__ import annotations

import os
import io
import json
import socket
import traceback
import threading
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import pandas as pd
from flask import Flask, request, jsonify, send_file, send_from_directory, abort
from werkzeug.serving import make_server
from werkzeug.utils import secure_filename

# -------------------- Config / paths --------------------
# All working directories hang off the process's current working directory,
# so the server must be started from the project root.
ROOT = Path.cwd()
STATIC_DIR = ROOT / "static"    # front-end assets (index.html)
AUTHORS_DIR = ROOT / "authors"  # per-author Excel workbooks
UPLOADS = ROOT / "uploads"      # uploaded CiteScore tables

# Create the directories at import time so the endpoints can assume they exist.
for d in (STATIC_DIR, AUTHORS_DIR, UPLOADS):
    d.mkdir(parents=True, exist_ok=True)

# default years for APP calculation
APP_YEARS = [2022, 2023, 2024]

# File extensions accepted by the CiteScore upload endpoint (compared lowercase).
ALLOWED_EXTS = {".csv", ".xlsx", ".xls"}
# Listening port; overridable through the PORT environment variable.
PORT = int(os.environ.get("PORT", 5000))

# -------------------- Try to import user's core --------------------
# The import is deliberately non-fatal: when it fails, `core` is None, the
# traceback is printed once, and the endpoints that need it answer HTTP 500.
try:
    import scopus_app_core as core  # type: ignore
except Exception:
    core = None
    _import_err = traceback.format_exc()
    print("Warning: failed to import scopus_app_core.py. Endpoints using it will error.")
    print(_import_err)

# -------------------- App & util funcs --------------------
# STATIC_DIR doubles as both the static and the template folder.
app = Flask(__name__, static_folder=str(STATIC_DIR), template_folder=str(STATIC_DIR))
# In-memory state filled by /api/upload_citescore and read by the run endpoints.
STATE: Dict[str, Optional[object]] = {"citescore_path": None, "cs_table": None, "cs_by_source": None}


def get_lan_ip() -> str:
    """Return the local IP address used for outbound traffic.

    A UDP socket is "connected" to a public address purely so the OS picks
    the outgoing interface; the local end of that socket is then read back.

    Returns:
        The detected IPv4 address, or ``"127.0.0.1"`` when the lookup fails
        (for example when there is no network route).
    """
    try:
        # The context manager closes the socket even when connect() or
        # getsockname() raises, which the manual close() did not.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as probe:
            probe.connect(("8.8.8.8", 80))
            return probe.getsockname()[0]
    except OSError:
        return "127.0.0.1"


def allowed_file(filename: str) -> bool:
    """Tell whether *filename* ends in one of the extensions in ALLOWED_EXTS.

    The comparison is case-insensitive because the suffix is lowercased first.
    """
    extension = Path(filename).suffix
    return extension.lower() in ALLOWED_EXTS


# -------------------- Small helpers for BAU detection & APP calc --------------------
def _candidate_bau_filters() -> List[str]:
    """Possible column names that might carry BAU info in author Excels."""
    return [
        "BAU Author IDs", "BAU_Author_IDs", "bau_author_ids", "bau_auids",
        "BAU Author Names", "bau_author_names", "BAU Author Organizations",
        "BAU_Author_Organizations",
        "affiliation_ids", "affiliation id", "affiliations", "institutions",
        "Author Org Map JSON", "AuthorOrgMapJSON", "author_org_map_json"
    ]


def _detect_bau_rows(df: pd.DataFrame, aff_id: str) -> pd.DataFrame:
    """Return the rows of *df* that mention the affiliation ID *aff_id*.

    The search first looks only at columns whose name matches one of
    ``_candidate_bau_filters()`` (case-insensitive, underscores and spaces
    treated alike). If no row matches there, every column is searched.

    Args:
        df: Articles table to filter.
        aff_id: Affiliation ID to look for. It is matched as a literal
            substring of each cell's string form.

    Returns:
        A copy of the matching rows. When *aff_id* is empty or ``None`` the
        result has the same columns as *df* and no rows.
    """
    needle = "" if aff_id is None else str(aff_id).strip()
    if not needle:
        return df.iloc[0:0].copy()

    def _normalize(label) -> str:
        # str() keeps non-string column labels (e.g. integers) from crashing.
        return str(label).lower().replace("_", " ").strip()

    # Normalise BOTH sides: previously only the column name was normalised,
    # so underscore-only entries such as "affiliation_ids" could never match.
    known = {_normalize(name) for name in _candidate_bau_filters()}

    def _rows_containing(positions) -> pd.Series:
        hits = pd.Series(False, index=df.index)
        for pos in positions:
            # Positional access also works when column labels are duplicated.
            text = df.iloc[:, pos].astype(str)
            # regex=False: the ID is data, not a pattern, so characters such
            # as "." or "(" must be matched literally.
            hits = hits | text.str.contains(needle, regex=False, na=False).astype(bool)
        return hits

    all_positions = range(df.shape[1])
    candidate_positions = [
        pos for pos in all_positions if _normalize(df.columns[pos]) in known
    ]

    mask = _rows_containing(candidate_positions)
    if not mask.any():
        # Last resort: the ID may live in a column we do not know by name.
        mask = _rows_containing(all_positions)

    return df[mask].copy()


def _qc_from_percentile(pct: Optional[float]) -> float:
    """Map a CiteScore percentile to its quality coefficient (QC).

    Bands (upper bound inclusive):
      0–10    -> 1.4
      >10–25  -> 1.0
      >25–50  -> 0.8
      >50–75  -> 0.6
      >75–100 -> 0.4

    The bands are contiguous, so fractional percentiles such as 10.5 or 25.3
    fall into the next band instead of slipping through a gap and scoring
    0.0. Whole-number inputs get the same result as the integer table.

    Args:
        pct: Percentile value; anything ``float()`` accepts, or ``None``.

    Returns:
        The QC for the band, or 0.0 when *pct* is missing, not numeric, NaN
        or outside 0–100.
    """
    # NOTE(review): this assumes a LOW percentile value is the best band, as
    # the table above states — confirm against how the source data encodes it.
    if pct is None:
        return 0.0
    try:
        p = float(pct)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    # The chained comparison is False for NaN, so NaN is rejected here too.
    if not (0 <= p <= 100):
        return 0.0
    for upper_bound, coefficient in ((10, 1.4), (25, 1.0), (50, 0.8), (75, 0.6), (100, 0.4)):
        if p <= upper_bound:
            return coefficient
    return 0.0


def _author_coefficient(num_authors: Optional[int]) -> float:
    """Return the author coefficient (AC) for a paper with *num_authors* authors.

    A single author scores 1.2; several authors share 1.2 equally, rounded to
    four decimals. Values that cannot be converted to an integer, and counts
    below one, score 0.0.
    """
    try:
        count = int(num_authors)
    except Exception:
        return 0.0
    if count < 1:
        return 0.0
    return 1.2 if count == 1 else round(1.2 / count, 4)


def compute_app_sheet(
    df_articles: pd.DataFrame,
    aff_id: Optional[str],
    years: Optional[List[int]] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Compute the APP (Participation Score) details and per-year summary.

    A row is eligible when its publication year is in *years* and, if a
    subtype column exists, its subtype is an article label. Each eligible
    row gets ``AC`` (author coefficient), ``QC`` (quality coefficient),
    ``Contribution`` (``round(AC * QC, 2)``) and ``CiteScore Percentile``.

    Args:
        df_articles: Articles table; column names are detected from a list of
            known variants.
        aff_id: Accepted for interface compatibility; not used here.
        years: Years to score. ``None`` means the module default ``APP_YEARS``.

    Returns:
        ``(details, summary)``:
        - *details*: the eligible rows (including the helper columns
          ``_pub_year`` and ``_is_article``) followed by the four computed
          columns. Input columns with those four names are replaced.
        - *summary*: one ``{"Year", "APP"}`` row per requested year plus a
          final ``"Total"`` row.
        Empty input returns two empty frames. Non-empty input with no
        eligible rows returns an empty *details* and an all-zero *summary*.
    """
    if df_articles is None or df_articles.empty:
        return pd.DataFrame(), pd.DataFrame()

    # Build a fresh list so the shared default can never be mutated.
    years = [int(y) for y in (APP_YEARS if years is None else years)]

    def _first_present(frame: pd.DataFrame, names) -> Optional[str]:
        """Return the first of *names* that is a column of *frame*."""
        return next((name for name in names if name in frame.columns), None)

    def _to_year(value):
        """Extract a 4-digit year from an int, a float or a date-like string."""
        try:
            if pd.isna(value):
                return None
            text = str(value).strip()
            if len(text) >= 4 and text[:4].isdigit():
                return int(text[:4])
        except (TypeError, ValueError):
            pass
        return None

    def _count_authors(value) -> int:
        """Count the non-empty entries of a semicolon-separated author list."""
        # A missing cell must count as 0 authors; stringifying it first
        # would turn NaN into "nan" and count it as one author.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return 0
        return sum(1 for part in str(value).split(";") if part.strip())

    def _to_pct(value) -> Optional[float]:
        """Convert a percentile cell to float, or None when unusable."""
        if value is None:
            return None
        if isinstance(value, str) and value.strip().lower() in ("", "nan"):
            return None
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        return None if number != number else number  # NaN -> None

    df = df_articles.copy()

    # --- publication year -------------------------------------------------
    year_col = _first_present(df, ("year", "Year", "publication_year", "coverDate", "Full Date"))
    if year_col is None:
        # No year information: nothing can be eligible.
        df["_pub_year"] = None
    else:
        df["_pub_year"] = df[year_col].apply(_to_year)

    # --- article filter ---------------------------------------------------
    subtype_col = _first_present(
        df, ("subtype", "Publication type", "PublicationType", "subtypedescription")
    )
    if subtype_col is None:
        # Without a subtype column we cannot tell, so every row is kept.
        df["_is_article"] = True
    else:
        article_labels = {"ar", "article", "research article", "research-article", "research"}
        df["_is_article"] = (
            df[subtype_col].astype(str).str.strip().str.lower().isin(article_labels)
        )

    df_elig = df[df["_is_article"] & df["_pub_year"].isin(years)].copy()
    if df_elig.empty:
        summary = pd.DataFrame(
            [{"Year": y, "APP": 0.0} for y in years] + [{"Year": "Total", "APP": 0.0}]
        )
        return pd.DataFrame(), summary
    # Every remaining year is valid, so store them as plain integers.
    df_elig["_pub_year"] = df_elig["_pub_year"].astype(int)

    # --- author count -----------------------------------------------------
    auth_col = _first_present(df_elig, ("authors_count", "Number of Authors"))
    if auth_col is None:
        auth_col = "authors_count_infer"
        if "Authors" in df_elig.columns:
            df_elig[auth_col] = df_elig["Authors"].apply(_count_authors)
        else:
            df_elig[auth_col] = 1

    # --- coefficients -----------------------------------------------------
    pct_col = _first_present(df_elig, ("cs_percentile", "CiteScore Percentile"))
    if pct_col is None:
        percentiles = [None] * len(df_elig)
    else:
        percentiles = [_to_pct(v) for v in df_elig[pct_col]]
    author_coeffs = [_author_coefficient(v) for v in df_elig[auth_col]]
    quality_coeffs = [_qc_from_percentile(p) for p in percentiles]
    contributions = [round(a * q, 2) for a, q in zip(author_coeffs, quality_coeffs)]

    # Drop input columns that share a computed column's name; otherwise the
    # output has duplicate labels and the per-year sum below breaks.
    computed = ("AC", "QC", "Contribution", "CiteScore Percentile")
    df_out = df_elig.drop(
        columns=[c for c in computed if c in df_elig.columns]
    ).reset_index(drop=True)
    df_out["AC"] = author_coeffs
    df_out["QC"] = quality_coeffs
    df_out["Contribution"] = contributions
    df_out["CiteScore Percentile"] = percentiles

    # --- per-year summary -------------------------------------------------
    per_year = df_out.groupby("_pub_year")["Contribution"].sum()
    totals = {int(year): float(value) for year, value in per_year.items()}
    summary_rows = [{"Year": y, "APP": round(totals.get(y, 0.0), 2)} for y in years]
    grand_total = round(sum(row["APP"] for row in summary_rows), 2)
    summary_rows.append({"Year": "Total", "APP": grand_total})

    return df_out, pd.DataFrame(summary_rows)


# -------------------- Endpoints --------------------
@app.get("/")
def index():
    """Serve static/index.html, or a JSON hint when that file does not exist."""
    if not (STATIC_DIR / "index.html").exists():
        return jsonify({"ok": True, "message": "Place static/index.html in a 'static' folder."})
    return send_from_directory(str(STATIC_DIR), "index.html")


@app.post("/api/upload_citescore")
def api_upload_citescore():
    """
    Accept multipart file upload (field 'file'). Loads CiteScore via core.load_citescore_table
    and builds cs_by_source.

    Responses:
      200 {"ok": True, "path": ...}   table loaded and stored in STATE
      400 {"ok": False, "error": ...} no file, empty filename or bad extension
      500 {"ok": False, "error": ...} core missing, or saving/loading failed
    """
    if core is None:
        return jsonify({"ok": False, "error": "scopus_app_core.py not importable."}), 500
    if "file" not in request.files:
        return jsonify({"ok": False, "error": "No file uploaded"}), 400
    f = request.files["file"]
    if not f.filename:
        return jsonify({"ok": False, "error": "Empty filename"}), 400
    if not allowed_file(f.filename):
        return jsonify({"ok": False, "error": f"Unsupported extension {f.filename}"}), 400

    # Sanitise only the stem and re-attach the already validated extension.
    # Sanitising the whole name can drop the extension (e.g. a non-ASCII
    # stem) or return an empty string, which would point `dest` at the
    # uploads directory itself.
    original = Path(f.filename)
    stem = secure_filename(original.stem) or "citescore"
    dest = UPLOADS / f"{stem}{original.suffix}"

    try:
        # Saving is inside the handler so a disk error yields the same JSON
        # error shape as a parsing error.
        f.save(dest)
        cs_table = core.load_citescore_table(dest)
        cs_by_source = core.build_cs_by_source(cs_table, serial_sleep=0.0)
    except Exception as e:
        traceback.print_exc()
        return jsonify({"ok": False, "error": f"Failed to load CiteScore: {e}"}), 500

    # Publish only after everything succeeded, so a failed upload never
    # leaves STATE half-updated.
    STATE["citescore_path"] = str(dest)
    STATE["cs_table"] = cs_table
    STATE["cs_by_source"] = cs_by_source
    return jsonify({"ok": True, "path": str(dest)})


def _json_body() -> Dict[str, object]:
    """Return the request's JSON object, or {} when it is absent or not an object."""
    data = request.get_json(force=True, silent=True)
    return data if isinstance(data, dict) else {}


def _clean_text(value) -> str:
    """Stringify and strip a JSON value; None (JSON null) becomes ''."""
    return "" if value is None else str(value).strip()


def _parse_years(raw) -> List[int]:
    """Return the requested years as ints, or a copy of APP_YEARS when unusable."""
    if isinstance(raw, (list, tuple)):
        try:
            return [int(y) for y in raw]
        except (TypeError, ValueError):
            pass
    return list(APP_YEARS)


def _split_auids(raw) -> List[str]:
    """Split AU-IDs given as a JSON list or a ','/';'-separated string."""
    if isinstance(raw, (list, tuple)):
        parts = [str(item) for item in raw]
    else:
        parts = _clean_text(raw).replace(";", ",").split(",")
    return [part.strip() for part in parts if part.strip()]


def _call_process_author(auid: str):
    """Run core.process_author, retrying positionally for the older signature."""
    try:
        return core.process_author(
            author_id=auid,
            cs_table=STATE["cs_table"],
            out_dir=AUTHORS_DIR,
            sleep=0.05,
            serial_sleep=0.0,
        )
    except TypeError:
        # fallback if core.process_author has different signature
        return core.process_author(auid, STATE["cs_table"], AUTHORS_DIR, 0.05, 0.0)  # older signature


def _pick_articles_frame(sheets: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """Pick the articles sheet: exact name, then any name containing 'article', then the first sheet."""
    for sname, frame in sheets.items():
        if str(sname).lower() == "articles":
            return frame
    for sname, frame in sheets.items():
        if "article" in str(sname).lower():
            return frame
    return sheets[next(iter(sheets))]


def _compose_author_sheets(
    xl: Dict[str, pd.DataFrame], aff_id: str, years: List[int]
) -> Dict[str, pd.DataFrame]:
    """Return the workbook's sheets plus Articles, BAU Articles, APP Details and APP Summary."""
    if not xl:
        raise ValueError("Workbook contains no sheets")

    articles_df = _pick_articles_frame(xl).copy()
    app_details_df, app_summary_df = compute_app_sheet(articles_df, aff_id, years=years)
    generated = {
        "Articles": articles_df,
        "BAU Articles": _detect_bau_rows(articles_df, aff_id),
        "APP Details": app_details_df,
        "APP Summary": app_summary_df,
    }

    # Excel sheet names are case-insensitive: writing "Articles" next to an
    # existing "articles" makes openpyxl rename the new sheet "Articles1".
    # A generated sheet therefore reuses the existing name when one matches.
    existing = {str(sname).lower(): sname for sname in xl}
    sheets: Dict[str, pd.DataFrame] = dict(xl)
    for title, frame in generated.items():
        sheets[existing.get(title.lower(), title)] = frame
    return sheets


def _write_author_workbook(out_fp: Path, sheets: Dict[str, pd.DataFrame]) -> None:
    """Overwrite *out_fp* with *sheets*, writing a placeholder for empty or failing sheets."""
    with pd.ExcelWriter(out_fp, engine="openpyxl") as writer:
        for sname, df in sheets.items():
            try:
                # Some dataframes might be empty — write an informative placeholder
                if df is None or (isinstance(df, pd.DataFrame) and df.empty):
                    pd.DataFrame({"Info": [f"Sheet {sname} is empty."]}).to_excel(writer, sheet_name=sname, index=False)
                else:
                    df.to_excel(writer, sheet_name=sname, index=False)
            except Exception:
                # best-effort: log the cause, then write minimal info for this sheet
                traceback.print_exc()
                pd.DataFrame({"Info": [f"Failed to write sheet {sname}"]}).to_excel(writer, sheet_name=sname, index=False)


@app.post("/api/run_author")
def api_run_author():
    """
    JSON body:
      { "auid": "55537877400", "aff_id": "60021379", "years": [2022, 2023, 2024] }
    ("years" is optional and defaults to APP_YEARS.)
    Runs core.process_author, then post-processes the returned Excel to add:
      - Articles (all rows)
      - BAU Articles (rows matching aff_id heuristics)
      - APP Details / APP Summary (Participation Score) for the given years
    A generated sheet replaces an existing sheet whose name differs only by case.
    """
    if core is None:
        return jsonify({"ok": False, "error": "scopus_app_core.py not importable."}), 500
    data = _json_body()
    auid = _clean_text(data.get("auid"))
    aff_id = _clean_text(data.get("aff_id"))
    years = _parse_years(data.get("years", APP_YEARS))

    if not auid or not auid.isdigit():
        return jsonify({"ok": False, "error": "Provide numeric auid"}), 400
    if STATE.get("cs_table") is None or STATE.get("cs_by_source") is None:
        return jsonify({"ok": False, "error": "Please upload CiteScore first (POST /api/upload_citescore)"}), 400

    try:
        # call your core processing (this should write the author file to authors/)
        out_path = _call_process_author(auid)
    except Exception as e:
        traceback.print_exc()
        return jsonify({"ok": False, "error": f"process_author failed: {e}"}), 500

    if not out_path:
        return jsonify({"ok": False, "error": "process_author returned no file path"}), 500

    out_fp = Path(out_path)
    if not out_fp.exists():
        return jsonify({"ok": False, "error": f"Author file not found after processing: {out_path}"}), 500

    try:
        xl = pd.read_excel(out_fp, sheet_name=None)
    except Exception as e:
        traceback.print_exc()
        return jsonify({"ok": False, "error": f"Failed to read generated excel: {e}"}), 500

    try:
        # Computing the sheets is inside the handler too, so a failure there
        # also yields a JSON error instead of an unhandled exception.
        _write_author_workbook(out_fp, _compose_author_sheets(xl, aff_id, years))
    except Exception as e:
        traceback.print_exc()
        return jsonify({"ok": False, "error": f"Failed to append new sheets: {e}"}), 500

    return jsonify({"ok": True, "path": str(out_fp), "filename": out_fp.name})


@app.post("/api/run_authors_batch")
def api_run_authors_batch():
    """
    JSON: { "auids": "123,456,789", "aff_id": "60021379", "years": [2022, 2023, 2024] }
    "auids" may be a ','/';'-separated string or a JSON list; "years" is optional.
    Always answers 200 with one result dict per AU-ID once the request itself is valid.
    """
    if core is None:
        return jsonify({"ok": False, "error": "scopus_app_core.py not importable."}), 500
    data = _json_body()
    aff_id = _clean_text(data.get("aff_id"))
    toks = _split_auids(data.get("auids", ""))
    if not toks:
        return jsonify({"ok": False, "error": "No AU-IDs provided"}), 400
    # Same precondition as /api/run_author, checked once instead of failing per author.
    if STATE.get("cs_table") is None or STATE.get("cs_by_source") is None:
        return jsonify({"ok": False, "error": "Please upload CiteScore first (POST /api/upload_citescore)"}), 400
    years = _parse_years(data.get("years", APP_YEARS))

    results = []
    for auid in toks:
        if not auid.isdigit():
            # Same validation as the single-author endpoint.
            results.append({"auid": auid, "ok": False, "error": "Provide numeric auid"})
            continue
        try:
            results.append(api_run_author_inner(auid, aff_id, years))
        except Exception as e:
            results.append({"auid": auid, "ok": False, "error": str(e)})
    return jsonify({"ok": True, "results": results})


def api_run_author_inner(auid: str, aff_id: str, years: Optional[List[int]] = None):
    """Process one author for the batch runner and return a result dict.

    Args:
        auid: Scopus author ID.
        aff_id: Affiliation ID used to build the BAU Articles sheet.
        years: Years for the APP sheets; ``None`` means APP_YEARS.

    Returns:
        ``{"auid", "ok": True, "path", "filename"}`` on success, otherwise
        ``{"auid", "ok": False, "error"}``. Failures are reported in the
        dict rather than raised.
    """
    app_years = list(APP_YEARS) if years is None else years
    try:
        out_path = _call_process_author(auid)
    except Exception as e:
        traceback.print_exc()
        return {"auid": auid, "ok": False, "error": f"process_author failed: {e}"}
    if not out_path:
        return {"auid": auid, "ok": False, "error": "No output produced"}

    out_fp = Path(out_path)
    if not out_fp.exists():
        return {"auid": auid, "ok": False, "error": "Output file missing after processing"}

    # same post-processing as /api/run_author
    try:
        xl = pd.read_excel(out_fp, sheet_name=None)
        _write_author_workbook(out_fp, _compose_author_sheets(xl, aff_id, app_years))
    except Exception as e:
        traceback.print_exc()
        return {"auid": auid, "ok": False, "error": f"Post-processing failed: {e}"}

    return {"auid": auid, "ok": True, "path": str(out_fp), "filename": out_fp.name}


@app.get("/api/authors")
def api_list_authors():
    """List the .xlsx workbooks in AUTHORS_DIR with their size and article count.

    Each item is ``{"filename", "path", "size", "articles"}``. ``articles``
    is the row count of the sheet named "articles" (any letter case), 0 when
    there is no such sheet, and None when the workbook cannot be read.
    """
    items = []
    for p in sorted(AUTHORS_DIR.glob("*.xlsx")):
        try:
            # The context manager closes the file handle; an open handle can
            # block a later overwrite of the same workbook on Windows.
            with pd.ExcelFile(p) as x:
                # Read by the sheet's ACTUAL name: the detection is
                # case-insensitive, but reading by name is not.
                sheet = next((s for s in x.sheet_names if str(s).lower() == "articles"), None)
                n = len(pd.read_excel(x, sheet_name=sheet)) if sheet is not None else 0
        except Exception:
            # Unreadable workbook: still list it, with an unknown count.
            n = None
        try:
            size = p.stat().st_size
        except OSError:
            # The file may have disappeared since the directory listing.
            size = None
        items.append({"filename": p.name, "path": str(p), "size": size, "articles": n})
    return jsonify({"ok": True, "items": items, "count": len(items)})


@app.get("/api/download")
def api_download():
    """Send a file from AUTHORS_DIR as an attachment.

    Query parameter ``path`` names the file (absolute, or relative to the
    working directory).

    Responses:
      400 missing ``path``, or the resolved path is not inside AUTHORS_DIR
      404 the path is inside AUTHORS_DIR but is not an existing regular file
      200 the file itself
    """
    p = request.args.get("path") or ""
    if not p:
        return jsonify({"error": "Provide path query param"}), 400

    # Resolve both sides so ".." segments and symlinks cannot escape the
    # directory, then fail CLOSED: any error while checking rejects the
    # request (the previous `except: pass` fell through to send_file).
    try:
        fp = Path(p).resolve()
        fp.relative_to(AUTHORS_DIR.resolve())
    except (OSError, RuntimeError, ValueError):
        return jsonify({"error": "Invalid path (must be in authors/)"}), 400

    # Existence is checked only after containment, so the endpoint cannot be
    # used to probe which files exist elsewhere on disk.
    if not fp.is_file():
        return jsonify({"error": "File not found"}), 404
    return send_file(str(fp), as_attachment=True)


@app.get("/api/health")
def api_health():
    """Liveness probe: always answers with a fixed JSON payload."""
    payload = {"ok": True, "status": "up"}
    return jsonify(payload)


# -------------------- Notebook-friendly server class --------------------
class ServerThread(threading.Thread):
    """Daemon thread that serves a WSGI app through a werkzeug server.

    Attributes:
        srv: Server object returned by ``make_server``.
        ctx: Application context pushed at construction time.
        host: Interface the server is bound to.
        port: Port the server is bound to.
    """

    def __init__(self, app, host: str = "127.0.0.1", port: int = PORT):
        super().__init__(daemon=True)
        # make_server binds the listening socket immediately, so a busy port
        # raises here rather than in run().
        self.srv = make_server(host, port, app)
        # NOTE(review): this context is pushed on the constructing thread and
        # never popped; presumably so notebook code can use the app context
        # directly — confirm before changing.
        self.ctx = app.app_context()
        self.ctx.push()
        self.host = host
        self.port = port

    def run(self):
        """Serve requests until shutdown() is called."""
        self.srv.serve_forever()

    def shutdown(self):
        """Stop serving, release the listening socket and wait for the thread.

        Safe to call more than once and on a thread that never started.
        Errors are logged, not raised, so a notebook reload can always
        continue.
        """
        try:
            # shutdown() waits for serve_forever() to exit; it would block
            # forever if the serving thread is not running.
            if self.is_alive():
                self.srv.shutdown()
        except Exception:
            traceback.print_exc()
        finally:
            try:
                # Close the socket so the port can be re-bound right away;
                # shutdown() alone only stops the serving loop.
                self.srv.server_close()
            except Exception:
                traceback.print_exc()
        if self.is_alive() and threading.current_thread() is not self:
            self.join(timeout=5)


# Stop previous server object if reloading in notebook
try:
    server  # type: ignore
    try:
        server.shutdown()
    except Exception:
        pass
except NameError:
    pass

server = ServerThread(app, host="0.0.0.0", port=PORT)
server.start()

LAN_IP = get_lan_ip()
print(f"Local: http://127.0.0.1:{PORT}")
print(f"LAN:   http://{LAN_IP}:{PORT}")

# The server thread is a daemon, so a plain `python server_flask_threaded.py`
# would exit right after the prints above. Keep the main thread alive in that
# case only: an IPython/Jupyter cell also has __name__ == "__main__" but must
# return immediately, and a plain `import` must not block either.
try:
    get_ipython  # type: ignore[name-defined]  # noqa: F821
except NameError:
    _in_ipython = False
else:
    _in_ipython = True

if __name__ == "__main__" and not _in_ipython:
    try:
        # Join with a timeout so Ctrl+C is noticed on every platform.
        while server.is_alive():
            server.join(timeout=0.5)
    except KeyboardInterrupt:
        server.shutdown()


Traceback (most recent call last):
  File "C:\Users\yusef.atteyih\AppData\Local\Temp\ipykernel_25692\3878367549.py", line 48, in <module>
    import scopus_app_core as core  # type: ignore
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ModuleNotFoundError: No module named 'scopus_app_core'

Local: http://127.0.0.1:5000
LAN:   http://10.0.66.249:5000


127.0.0.1 - - [02/Sep/2025 17:22:16] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [02/Sep/2025 17:22:16] "GET /api/authors HTTP/1.1" 200 -
127.0.0.1 - - [02/Sep/2025 17:22:27] "POST /api/run_author HTTP/1.1" 500 -
