In [1]:
import csv, glob, os, zipfile
from collections import Counter

# Folder that holds the input files; point this elsewhere if they live in
# another location.
BASE = "/content"
# Every merged artefact is written into a "merged" subfolder of BASE.
OUT_DIR = BASE + "/merged"
os.makedirs(OUT_DIR, exist_ok=True)  # no error if the folder already exists

In [2]:
# 1) Merge TSVs
# Sum the per-term counts from every medical_vocab_*.tsv file under BASE and
# write one "term<TAB>count" file, sorted by term, into OUT_DIR.
# A row that has only one column is counted as a single occurrence.
tsv_files = glob.glob(f"{BASE}/medical_vocab_*.tsv")
counts = Counter()
for path in tsv_files:
    # newline="" leaves line-ending handling to the csv module, as the csv
    # documentation recommends for file objects handed to csv.reader.
    with open(path, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f, delimiter="\t")
        for row in reader:
            if not row:
                continue
            # Trim surrounding whitespace so padded duplicates of a term are
            # merged into one entry instead of being counted separately.
            term = row[0].strip()
            if not term:
                # A blank first column is not a vocabulary entry; counting it
                # would add an empty term to the merged file.
                continue
            if len(row) > 1:
                try:
                    cnt = int(row[1])
                except ValueError as exc:
                    # Still a ValueError, but now it says where the bad value
                    # is, so a stray header or corrupt row can be located.
                    raise ValueError(
                        f"{path}:{reader.line_num}: invalid count {row[1]!r} "
                        f"for term {term!r}"
                    ) from exc
            else:
                cnt = 1
            counts[term] += cnt

merged_tsv = os.path.join(OUT_DIR, "medical_vocab_all.tsv")
with open(merged_tsv, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    # Sorting by term makes the output independent of the order in which
    # glob returned the input files.
    for term, cnt in sorted(counts.items()):
        writer.writerow([term, cnt])

print(f"✔ Merged TSV written: {merged_tsv} ({len(counts)} unique terms)")

✔ Merged TSV written: /content/merged/medical_vocab_all.tsv (371461 unique terms)


In [4]:
# 2) Merge word lists
# Collect every distinct word from the pubmed_words_*.txt files under BASE.
# Words are lower-cased so that differently-cased duplicates collapse into one
# entry, then written one per line, sorted, into OUT_DIR.
word_files = glob.glob(f"{BASE}/pubmed_words_*.txt")
all_words = set()
for path in word_files:
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            word = line.strip().lower()
            # Blank or whitespace-only lines would otherwise add "" to the
            # set, inflating the unique count and writing an empty first line
            # to the merged file.
            if word:
                all_words.add(word)

merged_words = os.path.join(OUT_DIR, "pubmed_words_all.txt")
with open(merged_words, "w", encoding="utf-8") as f:
    for w in sorted(all_words):
        f.write(w + "\n")

print(f"✔ Merged words written: {merged_words} ({len(all_words)} unique words)")


✔ Merged words written: /content/merged/pubmed_words_all.txt (371461 unique words)


In [5]:
# 3) Merge phrases
# Collect every distinct phrase from the mesh_phrases_* files under BASE and
# write them one per line, sorted, into OUT_DIR. Case is preserved; only
# surrounding whitespace is trimmed.
# NOTE(review): unlike the other input patterns this glob has no extension, so
# it matches anything in BASE whose name starts with "mesh_phrases_" — confirm
# that only the intended text files are present there.
phrase_files = glob.glob(f"{BASE}/mesh_phrases_*")
all_phrases = set()
for path in phrase_files:
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            phrase = line.strip()
            # Blank or whitespace-only lines would otherwise add "" to the
            # set, inflating the unique count and writing an empty first line
            # to the merged file.
            if phrase:
                all_phrases.add(phrase)

merged_phrases = os.path.join(OUT_DIR, "mesh_phrases_all.txt")
with open(merged_phrases, "w", encoding="utf-8") as f:
    for p in sorted(all_phrases):
        f.write(p + "\n")

print(f"✔ Merged phrases written: {merged_phrases} ({len(all_phrases)} unique phrases)")


✔ Merged phrases written: /content/merged/mesh_phrases_all.txt (994605 unique phrases)


In [6]:
# 4) Zip outputs
# Bundle the three merged files into one deflate-compressed archive in
# OUT_DIR. Each member is stored under its bare file name, with no folders.
zip_path = f"{OUT_DIR}/mesh_all.zip"
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
    for source_path, member_name in (
        (merged_tsv, "medical_vocab_all.tsv"),
        (merged_words, "pubmed_words_all.txt"),
        (merged_phrases, "mesh_phrases_all.txt"),
    ):
        archive.write(source_path, arcname=member_name)

print("✔ Final ZIP ready:", zip_path)

✔ Final ZIP ready: /content/merged/mesh_all.zip


In [7]:
# 5) Download
# NOTE(review): this import sits in the last cell rather than with the other
# imports at the top; presumably because google.colab is only importable when
# the notebook runs inside Colab — confirm before moving it.
from google.colab import files
# zip_path is the archive path assigned in the "Zip outputs" step.
files.download(zip_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>