# 1. Create working folders for tutorial

In [14]:
!mkdir -p /content/dep_collocation_tutorial
!mkdir -p /content/dep_collocation_tutorial/OANC
!mkdir -p /content/dep_collocation_tutorial/NICER
!mkdir -p /content/dep_collocation_tutorial/NICER/nicer_plain
!mkdir -p /content/dep_collocation_tutorial/collocation

# /content
# 　　└── dep_collocation_tutorial
#     　　　　　├── OANC <-- reference corpus
#     　　　　　├── NICER
#     　　　　　│  　　 └── nicer_plain <-- learner corpus
#     　　　　　└── collocation <-- extracted collocation


# 2. Create a Python library (mylib.py)

In [None]:
# Create a Python library

%%writefile /content/dep_collocation_tutorial/mylib.py

# Surface forms (lower-cased) that is_verb_obj() refuses as the object word.
EXCLUDE_OBJ_WORD = {"my", "your", "her", "its", "our", "their", "a", "an", "the"}
# Dependency labels treated as "subject" by has_subject_between().
SUBJECT_DEPS = {"nsubj", "nsubjpass", "expl"}
# Adverbs (lower-cased) that is_adv_verb() never accepts as the adverb.
EXCLUDE_ADVERBS = {
    "however", "therefore", "thus", "furthermore", "moreover", "nevertheless",
    "nonetheless", "instead", "consequently", "accordingly", "meanwhile",
    "besides", "otherwise", "hence", "so", "how", "why", "when"
}

def has_subject_between(left_tok, verb_tok, sent):
    """Return True if a subject of ``verb_tok`` sits strictly between the two tokens.

    A token counts when its index lies between ``left_tok.i`` and
    ``verb_tok.i`` (both exclusive), its ``dep_`` is in SUBJECT_DEPS and its
    head is ``verb_tok``.  ``sent`` is the iterable of tokens to scan.
    """
    lower, upper = left_tok.i, verb_tok.i
    return any(
        lower < cand.i < upper
        and cand.dep_ in SUBJECT_DEPS
        and cand.head == verb_tok
        for cand in sent
    )

def is_followed_by_comma(tok):
    """Tell whether the token right after ``tok`` in its doc is a "," punctuation token.

    Returns False when ``tok`` is the last token of the doc.
    """
    doc = tok.doc
    following = tok.i + 1
    if following < len(doc):
        nxt = doc[following]
        return nxt.is_punct and nxt.text == ","
    return False

def is_how_plus_adv(tok):
    """Return True when ``tok`` is tagged ADV and the token just before it is "how".

    The comparison with "how" is case-insensitive.  A token at index 0 has
    no predecessor and yields False.
    """
    if tok.pos_ == "ADV" and tok.i >= 1:
        return tok.doc[tok.i - 1].text.lower() == "how"
    return False

def is_hyphenated_token(tok):
    """
    Return True if ``tok`` is part of a hyphen-joined word.
    - "part" "-" "time" -> True for "part" and for "time"
    The hyphen token itself also yields True.
    """
    hyphen_chars = {"-", "‐", "‒", "–", "−"}  # hyphen, en dash, minus, etc.

    # The token itself is a hyphen-like symbol
    if tok.text in hyphen_chars:
        return True

    # Otherwise look at the immediate neighbours that exist in the doc
    doc, pos = tok.doc, tok.i
    neighbours = []
    if pos >= 1:
        neighbours.append(doc[pos - 1])
    if pos + 1 < len(doc):
        neighbours.append(doc[pos + 1])
    return any(nb.text in hyphen_chars for nb in neighbours)

def is_excluded_ing(token) -> bool:
    """
    Return True when ``token`` is an -ing word that is not on the exception
    list and the token immediately before it is tagged VERB.
    """
    keep_anyway = {"thing", "something", "anything", "nothing"}
    lowered = token.lower_

    if not lowered.endswith("ing") or lowered in keep_anyway:
        return False
    if token.i <= 0:
        return False
    return token.doc[token.i - 1].pos_ == "VERB"

# Predicate helpers
def is_adj_noun(token):
    """Return True for a modifier + noun pair candidate.

    ``token`` must carry the ``amod`` or ``compound`` relation, precede its
    head, have a head tagged NOUN, and neither token may be part of a
    hyphenated word.
    """
    head = token.head
    if token.dep_ not in ("amod", "compound"):
        return False
    if head.pos_ != "NOUN" or token.i >= head.i:
        return False
    return not (is_hyphenated_token(token) or is_hyphenated_token(head))

def is_verb_obj(token):
    """Return True for a verb + direct-object pair candidate.

    ``token`` is the object: its relation must be ``dobj``, its lower-cased
    text must not be in EXCLUDE_OBJ_WORD, and is_excluded_ing() must reject
    it.  The head must be a VERB other than "be" that precedes the object,
    and neither token may be part of a hyphenated word.
    """
    head = token.head
    if token.dep_ != "dobj":
        return False
    if token.text.lower() in EXCLUDE_OBJ_WORD or is_excluded_ing(token):
        return False
    if head.pos_ != "VERB" or head.lemma_ == "be":
        return False
    if head.i >= token.i:
        return False
    return not (is_hyphenated_token(token) or is_hyphenated_token(head))

def is_verb_adv(token):
    """Return True for a verb followed by its adverbial modifier or particle.

    ``token`` must carry ``advmod``, ``npadvmod`` or ``prt``; its head must
    be a VERB other than "be" located before ``token``; neither token may
    be part of a hyphenated word.
    """
    head = token.head
    if token.dep_ not in ("advmod", "npadvmod", "prt"):
        return False
    if head.pos_ != "VERB" or head.lemma_ == "be":
        return False
    if head.i >= token.i:
        return False
    return not (is_hyphenated_token(token) or is_hyphenated_token(head))

def is_adv_verb(token, sent):
    """Return True for an adverb that precedes the verb it modifies.

    Besides the relation (``advmod``/``npadvmod``), head (VERB, not "be")
    and ordering checks, the adverb is rejected when it is listed in
    EXCLUDE_ADVERBS, is directly followed by a comma, has a subject of the
    verb between itself and the verb, directly follows "how", or when
    either token is part of a hyphenated word.
    """
    head = token.head
    if token.dep_ not in ("advmod", "npadvmod"):
        return False
    if head.pos_ != "VERB" or head.lemma_ == "be":
        return False
    if token.i >= head.i:
        return False
    if token.text.lower() in EXCLUDE_ADVERBS:
        return False
    if is_followed_by_comma(token):
        return False
    if has_subject_between(token, head, sent):
        return False
    if is_how_plus_adv(token):
        return False
    return not (is_hyphenated_token(token) or is_hyphenated_token(head))

def is_adv_adj(token):
    """Return True for an adverbial modifier that precedes an adjective head.

    ``token`` must carry ``advmod`` or ``npadvmod``, its head must be tagged
    ADJ and come after it, and neither token may be part of a hyphenated
    word.
    """
    head = token.head
    if token.dep_ not in ("advmod", "npadvmod"):
        return False
    if head.pos_ != "ADJ" or token.i >= head.i:
        return False
    return not (is_hyphenated_token(token) or is_hyphenated_token(head))

#3. Download learner corpus (NICER)
✅**Replace 'xxx'** in the code below with the appropriate username and password<br>
Refer to https://sugiura-ken.org/sgr/nicer/nicer-1-3-2/

In [None]:
!wget --user="xxx" --password="xxx" "http://venus.hum.nagoya-u.ac.jp/nice-download/NICER1_3_2.zip" -O /content/dep_collocation_tutorial/NICER/NICER1_3_2.zip

In [None]:
# Unzip the file
# A folder named "2020-11-24NICER1_3_2" will be created.

!unzip -d /content/dep_collocation_tutorial/NICER /content/dep_collocation_tutorial/NICER/NICER1_3_2.zip

In [None]:
# Extract only learner essays from the NICER corpus
# Convert CHAT format to plain text
# Output the texts to the 'nicer_plain' folder

import os
import re
from pathlib import Path
from glob import glob

# === Input / Output Folders ===
INPUT_DIR = "/content/dep_collocation_tutorial/NICER/2020-11-24NICER1_3_2/NICER_NNS"
OUTPUT_DIR = "/content/dep_collocation_tutorial/NICER/nicer_plain"

# Make sure the destination exists before anything is written
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Speaker prefix such as "*JPN001:" together with the whitespace after it
pattern = re.compile(r'^\*JPN[0-9]+:\s*')

# Process the input files in a stable (sorted) order
txt_files = sorted(glob(str(Path(INPUT_DIR) / "*.txt")))

print(f"{len(txt_files)} text files found in {INPUT_DIR}")

processed_count = 0

for i, fp in enumerate(txt_files, start=1):
    p = Path(fp)

    # Keep only the lines carrying the speaker prefix, with the prefix removed
    # and a trailing newline guaranteed.
    with open(fp, "r", encoding="utf-8", errors="ignore") as src:
        stripped = [pattern.sub("", row) for row in src if pattern.match(row)]
    selected = [row if row.endswith("\n") else row + "\n" for row in stripped]

    # Drop the first matching line of every file
    # NOTE(review): presumably a title/prompt line rather than essay text - confirm.
    selected = selected[1:]

    out_path = Path(OUTPUT_DIR) / f"{p.stem}_plain.txt"
    with open(out_path, "w", encoding="utf-8", errors="ignore") as wf:
        wf.writelines(selected)

    processed_count += 1
    print(f"[{i}/{len(txt_files)}] Saved: {out_path.name} (lines: {len(selected)})")

print("✅ Done.")
print(f"Total processed files: {processed_count}")


#4. Extract dependency pairs from NICER learner corpus

In [None]:
# pair_extractor.py
import os, sys, time, threading, re, shutil
import pandas as pd
import spacy

# ===== Environment Check =====
# A successful import of google.colab is taken to mean the cell runs on Colab;
# IN_COLAB later selects between the fixed-path run and the local tkinter GUI.
try:
    import google.colab  # type: ignore
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# ===== Path Settings =====
if IN_COLAB:
    # colab
    # Uncomment the two lines below (and the Drive WORKING_DIR_PATH) to work from Google Drive.
    #from google.colab import drive
    #drive.mount("/content/drive", force_remount=False)

    #WORKING_DIR_PATH = "/content/drive/MyDrive/dep_collocation_tutorial"
    WORKING_DIR_PATH = "/content/dep_collocation_tutorial"
    CORPUS_DIR = "NICER/nicer_plain"
    INPUT_ROOT = os.path.join(WORKING_DIR_PATH, CORPUS_DIR) # /content/dep_collocation_tutorial/NICER/nicer_plain
    OUTPUT_DIR = os.path.join(WORKING_DIR_PATH, "collocation") # /content/dep_collocation_tutorial/collocation
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    sys.path.append(WORKING_DIR_PATH) # Specify the directory to load the library 'mylib'
else:
    # Local environment
    # INPUT_ROOT / OUTPUT_DIR are not defined on this branch; the GUI at the end of the cell asks for them.
    # NOTE(review): __file__ exists only when this code runs as a script, not inside a notebook kernel.
    sys.path.append(os.path.abspath(os.path.dirname(__file__)))# Look for the library in the directory where the current script is located

# Must come after the sys.path.append calls above so that mylib.py can be found.
import mylib  # is_adj_noun / is_verb_obj / is_verb_adv / is_adv_verb / is_adv_adj

# ===== Common Utilities =====
def get_all_txt_files(root_folder: str):
    """Collect every ``.txt`` file below ``root_folder`` (case-insensitive extension).

    Returns the full paths ordered by file name (basename), not by folder.
    """
    found = [
        os.path.join(folder, name)
        for folder, _, names in os.walk(root_folder)
        for name in names
        if name.lower().endswith(".txt")
    ]
    found.sort(key=os.path.basename)
    return found

def add_result(results, file_path, pair_type, sentence_text, i1, i2, token1, token2, span):
    """Append one extracted pair to ``results`` as a flat dict (one future CSV row).

    Lemmas are lower-cased and stripped of zero-width characters
    (ZWSP/ZWNJ/ZWJ/BOM) before being stored; the surface forms, POS, tag and
    dependency labels of both tokens are copied unchanged.  Only the base
    name of ``file_path`` is kept.
    """
    drop_zero_width = str.maketrans("", "", "\u200b\u200c\u200d\ufeff")  # ZWSP/ZWNJ/ZWJ/BOM
    first_lemma = token1.lemma_.lower().translate(drop_zero_width)
    second_lemma = token2.lemma_.lower().translate(drop_zero_width)

    row = {
        "file": os.path.basename(file_path),
        "pair_type": pair_type,
        "sentence": sentence_text,
        "i1": i1,
        "i2": i2,
        "lemma1": first_lemma,
        "lemma2": second_lemma,
        "word1": token1.text,
        "word2": token2.text,
        "pos1": token1.pos_,
        "pos2": token2.pos_,
        "tag1": token1.tag_,
        "tag2": token2.tag_,
        "dep1": token1.dep_,
        "dep2": token2.dep_,
        "pair": f"{token1.text} - {token2.text}",
        "lemma_pair": f"{first_lemma} - {second_lemma}",
        "span": span,
    }
    results.append(row)

def build_nlp():
    """Load ``en_core_web_sm`` with "ner" and "textcat" disabled.

    A "sentencizer" component is inserted at the front of the pipeline when
    the loaded pipeline does not already contain one.
    """
    pipeline = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
    if "sentencizer" in pipeline.pipe_names:
        return pipeline
    pipeline.add_pipe("sentencizer", first=True)
    return pipeline

def process_files(INPUT_ROOT: str, OUTPUT_DIR: str, log_func=print):
    """Extract dependency pairs from every .txt file under ``INPUT_ROOT``.

    Each non-blank line of each file is parsed separately; every token of
    every sentence is tested against the five mylib predicates and each hit
    becomes one row (see add_result).  All rows are written to
    ``pairs_all.csv`` in ``OUTPUT_DIR`` and, per pair type that occurred, a
    ``frequencies_<TYPE>.csv`` with lemma-pair counts.

    ``log_func`` receives progress and result messages; when it is the
    built-in ``print`` the per-file progress is redrawn on one line.
    Files that cannot be read are reported and skipped.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    corpus_paths = get_all_txt_files(INPUT_ROOT)
    if not corpus_paths:
        log_func(f"ERROR: No .txt files in {INPUT_ROOT}")
        return

    parser = build_nlp()
    collected = []
    started = time.time()

    for n, path in enumerate(corpus_paths, 1):
        # Path shown in logs / stored in the rows; relpath can fail (ValueError), then fall back
        try:
            rel = os.path.relpath(path, INPUT_ROOT)
        except ValueError:
            rel = os.path.basename(path)

        try:
            with open(path, "r", encoding="utf-8", errors="replace") as fh:
                raw_lines = fh.read().splitlines()
        except Exception as e:
            log_func(f"[WARN] Skip (read error): {rel} -> {e}")
            continue

        progress = f"files {n}/{len(corpus_paths)} : {rel}"
        if log_func is print:
            print("\r" + progress, end="", flush=True)
        else:
            log_func(progress)

        rows = []
        for raw in raw_lines:
            if not raw.strip():
                continue
            for sent in parser(raw).sents:
                offset = sent.start  # converts doc-level token indices to sentence-level ones

                def record(kind, first, second, reorder=False):
                    # first/second are stored in that order; lo/hi delimit the covered span
                    lo, hi = first.i - offset, second.i - offset
                    if reorder and lo > hi:
                        lo, hi = hi, lo
                    add_result(rows, rel, kind, sent.text, lo, hi, first, second, sent[lo:hi + 1].text)

                for tok in sent:
                    head = tok.head
                    if mylib.is_adj_noun(tok):
                        record("ADJ_NOUN", tok, head, reorder=True)
                    if mylib.is_verb_obj(tok):
                        record("VERB_OBJ", head, tok)
                    if mylib.is_verb_adv(tok):
                        record("VERB_ADV", head, tok)
                    if mylib.is_adv_verb(tok, sent):
                        record("ADV_VERB", tok, head)
                    if mylib.is_adv_adj(tok):
                        record("ADV_ADJ", tok, head)

        collected.extend(rows)

    table = pd.DataFrame(collected)
    if table.empty:
        log_func("\nNo dependency pairs were extracted.")
    else:
        table.to_csv(os.path.join(OUTPUT_DIR, "pairs_all.csv"), index=False, encoding="utf-8-sig")
        for kind in ["ADJ_NOUN", "VERB_OBJ", "VERB_ADV", "ADV_ADJ", "ADV_VERB"]:
            subset = table[table["pair_type"] == kind]
            if subset.empty:
                continue
            counts = subset["lemma_pair"].value_counts().reset_index()
            counts.columns = ["lemma_pair", "frequency"]
            counts.to_csv(os.path.join(OUTPUT_DIR, f"frequencies_{kind}.csv"), index=False, encoding="utf-8-sig")
        log_func(f"\n✓ Results saved to: {OUTPUT_DIR}")
    log_func(f"✓ Total time: {time.time()-started:.2f} sec")

# ===== Entry Point =====
if IN_COLAB:
    # Colab run: fixed paths from the settings above, progress printed to the cell output.
    print("[Colab] INPUT_DIR:", INPUT_ROOT)

    if not os.path.isdir(INPUT_ROOT):
        raise FileNotFoundError(f"Not found: {INPUT_ROOT}")

    if "/content/drive" not in INPUT_ROOT:
        # ===== Direct Processing (virtual / local path) =====
        INPUT_CACHE = INPUT_ROOT
        print("[Colab] Using virtual drive path directly ...")
    else:
        # ===== Cache Settings (Google Drive path) =====
        # Work on a copy under /content instead of reading from the mounted Drive.
        shutil.copytree(INPUT_ROOT, "/content/text_cache", dirs_exist_ok=True)
        INPUT_CACHE = "/content/text_cache"
        print("[Colab] Copied to :", INPUT_CACHE)
        print("[Colab] Using cache ...")

    print("[Colab] OUTPUT_DIR:", OUTPUT_DIR)
    process_files(INPUT_CACHE, OUTPUT_DIR, log_func=print)

else:
    # --- Local GUI: Select input/output via GUI and start ---
    import tkinter as tk
    from tkinter import filedialog, messagebox, ttk

    class App(tk.Tk):
        """Small tkinter front end for process_files().

        Lets the user pick an input and an output folder, runs the
        extraction in a background thread and shows the log messages in a
        text box.
        """

        def __init__(self):
            """Build the window: folder pickers, start button, status label and log box."""
            super().__init__()
            self.title("PairExtractor")
            self.geometry("720x520")
            # Selected folders; shown in the read-only entry fields below
            self.input_dir = tk.StringVar()
            self.output_dir = tk.StringVar()
            # Specify the PNG file
            #icon = tk.PhotoImage(file="icon.png")
            #self.iconphoto(True, icon)

            # Common outer padding for the top-level frames
            pad = {"padx": 10, "pady": 8}

            # --- Input folder picker ---
            frm_in = ttk.LabelFrame(self, text="Input")
            frm_in.pack(fill="x", **pad)
            ttk.Button(frm_in, text="Input folder", command=self.select_input).pack(anchor="w", padx=10, pady=(10,4))
            ttk.Entry(frm_in, textvariable=self.input_dir, state="readonly").pack(fill="x", padx=10, pady=(0,10))

            # --- Output folder picker ---
            frm_out = ttk.LabelFrame(self, text="Output")
            frm_out.pack(fill="x", **pad)
            ttk.Button(frm_out, text="Output folder", command=self.select_output).pack(anchor="w", padx=10, pady=(10,4))
            ttk.Entry(frm_out, textvariable=self.output_dir, state="readonly").pack(fill="x", padx=10, pady=(0,10))

            # --- Start button (disabled until both folders are chosen) and status label ---
            frm_start = ttk.Frame(self); frm_start.pack(fill="x", **pad)
            self.btn_start = ttk.Button(frm_start, text="Extract start", command=self.on_start, state="disabled")
            self.btn_start.pack(side="left")
            self.lbl_status = ttk.Label(frm_start, text="Select input/output folders.")
            self.lbl_status.pack(side="left", padx=10)

            # --- Log box with a vertical scrollbar ---
            frm_log = ttk.LabelFrame(self, text="Log")
            frm_log.pack(fill="both", expand=True, padx=10, pady=10)
            cnt = ttk.Frame(frm_log)
            cnt.pack(fill="both", expand=True)
            self.txt_log = tk.Text(cnt, height=14, wrap="none")
            self.txt_log.pack(side="left", fill="both", expand=True)
            self.scroll_y = ttk.Scrollbar(cnt, orient="vertical", command=self.txt_log.yview)
            self.scroll_y.pack(side="right", fill="y")
            self.txt_log.configure(yscrollcommand=self.scroll_y.set)

        def select_input(self):
            """Ask for the input folder; store it and refresh the start button if one was chosen."""
            p = filedialog.askdirectory(title="Select input folder (e.g., JPN_text)")
            if p:
                self.input_dir.set(p); self.update_state()

        def select_output(self):
            """Ask for the output folder; store it and refresh the start button if one was chosen."""
            p = filedialog.askdirectory(title="Select output folder (empty or existing)")
            if p:
                self.output_dir.set(p); self.update_state()

        def update_state(self):
            """Enable the start button only when both folders are set."""
            self.btn_start.config(state=(tk.NORMAL if self.input_dir.get() and self.output_dir.get() else tk.DISABLED))

        def log(self, msg):
            """Append ``msg`` plus a newline to the log box and scroll to the end."""
            self.txt_log.insert("end", msg + "\n"); self.txt_log.see("end"); self.update_idletasks()

        def on_start(self):
            """Validate the selection, lock the start button and launch the worker thread."""
            in_p, out_p = self.input_dir.get().strip(), self.output_dir.get().strip()
            if not in_p or not out_p:
                messagebox.showwarning("Warning", "Both input and output folders must be selected."); return
            self.btn_start.config(state="disabled"); self.lbl_status.config(text="Running...")
            # Daemon thread: it does not keep the process alive once the window is closed
            threading.Thread(target=self._run, args=(in_p, out_p), daemon=True).start()

        def _run(self, in_p, out_p):
            """Worker body: run process_files() and report the outcome in the GUI.

            NOTE(review): this method and self.log are invoked from the worker
            thread and touch tkinter widgets from there - confirm this is safe
            on the target platform.
            """
            try:
                process_files(in_p, out_p, log_func=self.log)
                self.lbl_status.config(text="Done.")
            except Exception as e:
                self.log(f"[ERROR] {e}"); messagebox.showerror("Error", str(e)); self.lbl_status.config(text="Error.")
            finally:
                # Re-enable the start button whether the run succeeded or failed
                self.btn_start.config(state="normal")

    if __name__ == "__main__":
        App().mainloop()

#5. Download OANC
See below for details on the OANC corpus<br>
https://anc.org/

In [None]:
# Download OANC (use --no-check-certificate to avoid certificate errors)
!wget --no-check-certificate -O /content/dep_collocation_tutorial/OANC/OANC.zip \
  https://www.anc.org/OANC/OANC-1.0.1-UTF8.zip

# Unzip to /content/dep_collocation_tutorial/OANC
!unzip -o /content/dep_collocation_tutorial/OANC/OANC.zip -d /content/dep_collocation_tutorial/OANC

For the tutorial, copy only:
- written_1/journal/verbatim
- written_2/non-fiction/OUP

In [None]:
from pathlib import Path
import shutil

# Root of the unzipped corpus and the two sub-corpora used in the tutorial
base_dir = Path("/content/dep_collocation_tutorial/OANC/OANC")
input_dirs = [
    base_dir.joinpath("data", "written_1", "journal", "verbatim"),
    base_dir.joinpath("data", "written_2", "non-fiction", "OUP"),
]

# Copy destination
dest_root = Path("/content/dep_collocation_tutorial/OANC/text_selected")
dest_root.mkdir(parents=True, exist_ok=True)

# Note: `skipped` is never incremented in this cell, so the summary always reports 0.
copied, skipped = 0, 0

for src_root in input_dirs:
    if src_root.is_dir():
        for src in src_root.rglob("*.txt"):
            # Rebuild the path relative to base_dir below dest_root
            target = dest_root / src.relative_to(base_dir)
            target.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(src, target)
            copied += 1
    else:
        print(f"[SKIP] Not found: {src_root}")


print(f"Copied {copied} files, skipped {skipped}, into {dest_root} (structure preserved)")



Fix sentence-internal line breaks in OANC texts

In [None]:
import os
import re
from glob import glob

SRC_ROOT = "/content/dep_collocation_tutorial/OANC/text_selected"
DST_ROOT = "/content/dep_collocation_tutorial/OANC/text_selected_norm"

import re

# Legacy placeholder strings ("kaigyo" = line break, "danraku" = paragraph).
# normalize_with_tokens() no longer substitutes them into the text; the names
# are kept so that code referring to them keeps working.
KAIGYO = "###KAIGYO###"
DANRAKU = "###DANRAKU###"

def normalize_with_tokens(text: str) -> str:
    """Join hard-wrapped lines while keeping paragraph breaks.

    - CRLF / CR line endings are converted to LF first.
    - A single newline becomes a space (sentence-internal line break).
    - A run of two or more newlines becomes exactly one newline
      (paragraph boundary).
    - Runs of spaces/tabs collapse to one space, spaces around a newline
      are removed, and the result is stripped.

    The newline runs are rewritten directly with a regex instead of being
    routed through placeholder strings, so input that happens to contain
    the literal placeholder text is left untouched.
    """
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # One newline -> space; two or more -> a single paragraph newline
    text = re.sub(r'\n+', lambda m: "\n" if len(m.group()) > 1 else " ", text)
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r' *\n *', '\n', text)
    return text.strip()


def read_text(path: str) -> str:
    """Read ``path`` as text, trying UTF-8 first and Latin-1 as a fallback.

    "utf-8-sig" is tried first: it decodes plain UTF-8 as well and drops a
    leading byte-order mark, which plain "utf-8" would leave in the text as
    U+FEFF.  Only decoding failures trigger the next encoding; other errors
    (e.g. a missing file) propagate to the caller.
    """
    for enc in ("utf-8-sig", "latin-1"):
        try:
            with open(path, "r", encoding=enc, errors="strict") as f:
                return f.read()
        except UnicodeDecodeError:
            continue
    # Last resort, kept for safety: latin-1 maps every byte, so this is not
    # expected to be reached.
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        return f.read()

def write_text(path: str, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8 with LF line endings.

    Missing parent folders are created.  A bare file name (no folder part)
    is written to the current directory; ``os.makedirs("")`` would raise,
    so the folder creation is skipped in that case.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8", newline="\n") as f:
        f.write(text)

def main():
    """Normalise every .txt file under SRC_ROOT into the same relative path under DST_ROOT.

    Each file is read with read_text(), passed through
    normalize_with_tokens() and written with write_text().  A progress line
    is printed every 200 files and a summary at the end; per-file errors
    are reported and counted without stopping the run.
    """
    sources = glob(os.path.join(SRC_ROOT, "**", "*.txt"), recursive=True)
    if not sources:
        print(f"No .txt files found under: {SRC_ROOT}")
        return

    processed = skipped = failed = 0
    total = len(sources)

    for i, src_path in enumerate(sources, 1):
        # Mirror the source layout below DST_ROOT
        dst_path = os.path.join(DST_ROOT, os.path.relpath(src_path, SRC_ROOT))

        try:
            write_text(dst_path, normalize_with_tokens(read_text(src_path)))
            processed += 1
            if i % 200 == 0:
                print(f"[{i}/{total}] Processed so far: {processed}, Skipped: {skipped}, Failed: {failed}")
        except UnicodeDecodeError:
            skipped += 1
            print(f"SKIP (decode): {src_path}")
        except Exception as e:
            failed += 1
            print(f"ERROR ({type(e).__name__}): {src_path} -> {e}")

    print(f"Done. Processed: {processed}, Skipped(decode): {skipped}, Failed: {failed}")
    print(f"Output root: {DST_ROOT}")

if __name__ == "__main__":
    main()


Read texts from text_selected_norm, analyze with spaCy, and save as .spacy files at OANC/spacy<br>
This may take a couple of minutes.

In [None]:
# Install the following for the local environment
# pip install -U spacy
# python -m spacy download en_core_web_sm

import os
import sys
import time
import gc
import spacy
from spacy.tokens import DocBin
import multiprocessing as mp

# ===== Check Environment =====
# A successful import of google.colab is taken to mean the cell runs on Colab.
try:
    import google.colab  # noqa: F401
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# ===== Common Settings =====
# spaCy model name passed to spacy.load() in build_nlp()
MODEL = "en_core_web_sm"
# NOTE(review): not referenced in the visible part of this cell; main() passes its own n_process
N_PROCESS = 4
# batch_size handed to save_parsed_docs() by run_pipeline()
BATCH_SIZE = 20
RELOAD_MB_THRESHOLD = 100            # Threshold in MB (e.g., reload every 100MB)
# run_pipeline() also rebuilds the pipeline after every RELOAD_EVERY-th file
RELOAD_EVERY = 50
# Sub-folder inserted below the output folder by to_spacy_out_path() when not on Colab
OUTPUT_SUBDIR = "spacy"

# ===== Common Utilities =====
def build_nlp():
    """Load the MODEL pipeline without "ner" and "textcat".

    A "sentencizer" component is inserted at the front of the pipeline when
    the loaded pipeline does not already contain one.
    """
    pipeline = spacy.load(MODEL, exclude=["ner", "textcat"])
    if "sentencizer" in pipeline.pipe_names:
        return pipeline
    pipeline.add_pipe("sentencizer", first=True)
    return pipeline

def iter_nonempty_lines(path):
    """Yield each line of ``path`` stripped of surrounding whitespace, skipping blank ones.

    The file is read as UTF-8; undecodable bytes are ignored.
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        yield from (text for text in map(str.strip, handle) if text)

def save_parsed_docs(input_path, output_path, nlp, n_process=2, batch_size=20):
    """Parse ``input_path`` line by line and store the docs in a DocBin at ``output_path``.

    Every non-empty line (see iter_nonempty_lines) is parsed as its own doc
    through ``nlp.pipe``.  The output path is printed before the DocBin is
    written.

    Returns a dict with:
      - "lines": number of non-empty lines consumed (one doc per line)
      - "docs": number of docs stored
      - "words": total count of alphabetic tokens
      - "per_doc_words": alphabetic-token count of each doc, in order
    """
    stats = {"lines": 0, "docs": 0, "words": 0, "per_doc_words": []}
    doc_bin = DocBin(store_user_data=False)
    # Create a doc for each line break
    for doc in nlp.pipe(iter_nonempty_lines(input_path), n_process=n_process, batch_size=batch_size):
        doc_bin.add(doc)
        # One doc is produced per input line, so both counters advance together
        stats["lines"] += 1
        stats["docs"] += 1
        w = sum(1 for t in doc if t.is_alpha)
        stats["words"] += w
        stats["per_doc_words"].append(w)
    # A bare file name has no folder part; os.makedirs("") would raise
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    print(output_path)
    doc_bin.to_disk(output_path)
    return stats

def list_txt_recursive(root_dir):
    """Return the sorted full paths of all ``.txt`` files (any letter case) below ``root_dir``."""
    return sorted(
        os.path.join(folder, name)
        for folder, _, names in os.walk(root_dir)
        for name in names
        if name.lower().endswith(".txt")
    )

# Length of the message most recently drawn by status_line()
_prev_len = 0
def status_line(msg: str):
    """Redraw a one-line status on stdout.

    Writes a carriage return followed by ``msg``, padded with spaces up to
    the length of the previous message so leftovers of a longer line are
    erased, then flushes.
    """
    global _prev_len
    sys.stdout.write("\r" + msg.ljust(_prev_len))
    sys.stdout.flush()
    _prev_len = len(msg)

def to_spacy_out_path(txt_abs_path, input_root, output_root, in_colab, base_dir=None):
    """Map an input .txt path to the path of its .spacy output file.

    The path of the text file relative to ``input_root`` is kept and its
    extension replaced by ``.spacy``.  On Colab the result sits directly
    under ``output_root`` (normalised); otherwise an OUTPUT_SUBDIR folder is
    inserted below ``output_root``.  ``base_dir`` is accepted but not used.
    """
    stem, _ext = os.path.splitext(os.path.relpath(txt_abs_path, input_root))
    target_name = stem + ".spacy"
    if not in_colab:
        return os.path.join(output_root, OUTPUT_SUBDIR, target_name)
    return os.path.normpath(os.path.join(output_root, target_name))

def run_pipeline(input_dirs, output_dir, in_colab, base_dir, n_process, log_func=print, cancel_checker=None):
    """Parse every .txt file found under ``input_dirs`` and save one .spacy file each.

    Parameters:
      input_dirs     -- folders searched recursively for .txt files
      output_dir     -- root folder for the .spacy output
      in_colab       -- selects the output layout and the root used for display paths
      base_dir       -- root used for display paths when ``in_colab`` is true
      n_process      -- forwarded to save_parsed_docs()
      log_func       -- callable receiving progress messages
      cancel_checker -- optional callable; a true result stops before the next file

    The spaCy pipeline is rebuilt after every RELOAD_EVERY files or once
    the input processed since the last rebuild reaches RELOAD_MB_THRESHOLD
    megabytes.
    """
    processed_file_n = 0
    sources = []
    root_of = {}  # text file -> the input folder it was found under
    for folder in input_dirs:
        if not os.path.isdir(folder):
            log_func(f"WARNING: Not found: {folder}")
            continue
        found = list_txt_recursive(folder)
        sources.extend(found)
        root_of.update(dict.fromkeys(found, folder))

    if not sources:
        log_func(f"ERROR: No .txt files found under: {input_dirs}")
        return

    if in_colab:
        origin = f"{len(input_dirs)} target folders"
    else:
        origin = f"'{input_dirs[0]}'"
    log_func(f"Processing {len(sources)} files from {origin} -> {output_dir}")
    os.makedirs(output_dir, exist_ok=True)

    nlp = build_nlp()
    run_started = time.time()
    display_root = base_dir if in_colab else input_dirs[0]
    pending_bytes = 0
    reload_limit = RELOAD_MB_THRESHOLD * 1024 * 1024

    for idx, src in enumerate(sources, 1):
        if cancel_checker and cancel_checker():
            log_func("\nProcessing cancelled by user.")
            break
        file_started = time.time()
        target = to_spacy_out_path(src, root_of[src], output_dir, in_colab, base_dir)

        # Track how much input was handled since the last pipeline rebuild;
        # a file whose size cannot be read counts as 0 bytes.
        try:
            pending_bytes += os.path.getsize(src)
        except OSError:
            pass

        stats = save_parsed_docs(
            input_path=src,
            output_path=target,
            nlp=nlp,
            n_process=n_process,
            batch_size=BATCH_SIZE
        )

        # Progress display
        took = time.time() - file_started
        so_far = time.time() - run_started
        shown = os.path.relpath(src, display_root)
        doc_n = stats["docs"]
        mean_words = (stats["words"] / doc_n) if doc_n else 0
        log_func(
            f"[{idx}/{len(sources)}] {shown} "
            f"last: {took:.2f}s | total: {so_far:.2f}s | "
            f"lines: {stats['docs']:,} | words: {stats['words']:,} | words/line: {mean_words:.1f}"
        )

        # === Reload Conditions ====
        hit_file_quota = bool(RELOAD_EVERY) and idx % RELOAD_EVERY == 0
        if hit_file_quota or pending_bytes >= reload_limit:
            nlp = build_nlp()
            pending_bytes = 0
            gc.collect()
            log_func("\nReloaded!")
        processed_file_n = idx

    elapsed = time.time() - run_started
    log_func(f"\n{processed_file_n} files processed in {elapsed:.2f} seconds.")

# ===== Entry Point =====
def main():
    """Entry point: fixed-path run on Colab, tkinter GUI otherwise.

    On Colab the texts under ``<base_dir>/text_selected_norm`` are parsed
    with a single process and written below ``<base_dir>/spacy``.  Locally a
    window lets the user choose input folder, output folder and the number
    of processes, then runs run_pipeline() in a background thread that can
    be cancelled between files.
    """
    if IN_COLAB:
        base_dir = "/content/dep_collocation_tutorial/OANC"
        #base_dir = "/content/OANC"
        input_dirs = [
            os.path.join(base_dir, "text_selected_norm"),
        ]
        output_dir = os.path.join(base_dir, "spacy")
        n_proc = 1
        run_pipeline(input_dirs, output_dir, in_colab=True, base_dir=base_dir, n_process=n_proc)
    else:
        # --- Local GUI: Set input, output, and processes ---
        import tkinter as tk
        from tkinter import filedialog, messagebox, ttk
        import threading

        class App(tk.Tk):
            """Window for choosing folders / process count and running the pipeline."""

            def __init__(self):
                """Build all widgets; the start button stays disabled until both folders are set."""
                super().__init__()
                self.title("spaCyDataCreator")
                self.geometry("780x580")

                self.input_dir = tk.StringVar()
                self.output_dir = tk.StringVar()

                cores = os.cpu_count() or 1
                upper = max(1, cores - 2)      # Maximum = CPU cores minus 2 (at least 1)
                self.upper = upper
                self.n_proc = tk.IntVar(value=1)  # the spinbox starts at 1

                pad = {"padx": 10, "pady": 8}

                # Input
                frm_in = ttk.LabelFrame(self, text="Input")
                frm_in.pack(fill="x", **pad)
                ttk.Button(frm_in, text="Input folder", command=self.select_input).pack(anchor="w", padx=10, pady=(10, 4))
                ttk.Entry(frm_in, textvariable=self.input_dir, state="readonly").pack(fill="x", padx=10, pady=(0, 10))

                # Output
                frm_out = ttk.LabelFrame(self, text="Output")
                frm_out.pack(fill="x", **pad)
                ttk.Button(frm_out, text="Output folder", command=self.select_output).pack(anchor="w", padx=10, pady=(10, 4))
                ttk.Entry(frm_out, textvariable=self.output_dir, state="readonly").pack(fill="x", padx=10, pady=(0, 10))

                # Process
                frm_np = ttk.LabelFrame(self, text="Processes")
                frm_np.pack(fill="x", **pad)
                row = ttk.Frame(frm_np); row.pack(fill="x", padx=10, pady=(8, 10))
                ttk.Label(row, text=f"n_process (1–{upper}):").pack(side="left")
                self.spin = ttk.Spinbox(row, from_=1, to=upper, textvariable=self.n_proc, width=6, justify="center")
                self.spin.pack(side="left", padx=8)
                ttk.Label(frm_np, text="Try smaller values first.").pack(anchor="w", padx=12, pady=(0, 8))

                # Start
                frm_start = ttk.Frame(self)
                frm_start.pack(fill="x", **pad)
                self.btn_start = ttk.Button(frm_start, text="Start", command=self.on_start, state="disabled")
                self.btn_start.pack(side="left")

                # Cancel (the Event is polled by run_pipeline through check_cancel)
                self.cancel_flag = threading.Event()
                self.btn_cancel = ttk.Button(frm_start, text="Cancel", command=self.on_cancel, state="disabled")
                self.btn_cancel.pack(side="left", padx=(8, 0))

                self.lbl_status = ttk.Label(frm_start, text="Select input/output and processes.")
                self.lbl_status.pack(side="left", padx=10)

                # Log + Scrollbar
                frm_log = ttk.LabelFrame(self, text="Log")
                frm_log.pack(fill="both", expand=True, **pad)
                cnt = ttk.Frame(frm_log); cnt.pack(fill="both", expand=True, padx=10, pady=10)
                cnt.grid_rowconfigure(0, weight=1)
                cnt.grid_columnconfigure(0, weight=1)
                self.txt_log = tk.Text(cnt, height=18, wrap="none")
                self.txt_log.grid(row=0, column=0, sticky="nsew")
                scroll_y = ttk.Scrollbar(cnt, orient="vertical", command=self.txt_log.yview)
                scroll_y.grid(row=0, column=1, sticky="ns")
                self.txt_log.configure(yscrollcommand=scroll_y.set)

            def select_input(self):
                """Ask for the input folder; reject folders without any .txt file."""
                p = filedialog.askdirectory(title="Select input folder (recursive .txt)")
                if p:
                    if not list_txt_recursive(p):
                        messagebox.showwarning("Warning", f"No .txt files found under:\n{p}\nSelect another folder.")
                        return
                    self.input_dir.set(p)
                    self.update_start_state()

            def select_output(self):
                """Ask for the output folder and refresh the start button."""
                p = filedialog.askdirectory(title="Select output folder (same as input is OK)")
                if p:
                    self.output_dir.set(p)
                    self.update_start_state()

            def update_start_state(self):
                """Enable the start button only when both folders are set."""
                in_p, out_p = self.input_dir.get(), self.output_dir.get()
                ok = bool(in_p and out_p)
                self.btn_start.config(state=(tk.NORMAL if ok else tk.DISABLED))

            def log(self, msg: str):
                """Append ``msg`` plus a newline to the log box and scroll to the end."""
                self.txt_log.insert("end", msg + "\n")
                self.txt_log.see("end")
                self.update_idletasks()

            def on_start(self):
                """Validate the inputs, then run the pipeline in a background thread."""
                in_p, out_p = self.input_dir.get().strip(), self.output_dir.get().strip()
                if not in_p or not out_p:
                    messagebox.showwarning("Warning", "Both input and output folders must be selected.")
                    return
                txts = list_txt_recursive(in_p)
                if not txts:
                    messagebox.showwarning("Warning", f"No .txt files under:\n{in_p}")
                    return

                # The spinbox content is free text, so reading it can fail
                try:
                    n_proc = int(self.n_proc.get())
                except Exception:
                    messagebox.showwarning("Warning", "n_process must be an integer.")
                    return
                if not (1 <= n_proc <= self.upper):
                    messagebox.showwarning("Warning", f"n_process must be between 1 and {self.upper}.")
                    return

                self.btn_start.config(state="disabled")
                self.lbl_status.config(text=f"Running... (n_process={n_proc})")
                self.cancel_flag.clear()
                self.btn_cancel.config(state="normal")

                def _runner():
                    # Worker body: run the pipeline, report the outcome, restore the buttons
                    try:
                        run_pipeline(
                            [in_p], out_p, in_colab=False, base_dir=None, n_process=n_proc,
                            log_func=self.log, cancel_checker=self.check_cancel
                        )
                        if self.check_cancel():
                            self.lbl_status.config(text="Cancelled.")
                        else:
                            self.lbl_status.config(text="Done.")
                    except Exception as e:
                        self.log(f"[ERROR] {e}")
                        messagebox.showerror("Error", str(e))
                        self.lbl_status.config(text="Error.")
                    finally:
                        self.btn_start.config(state="normal")
                        self.btn_cancel.config(state="disabled")
                        self.cancel_flag.clear()

                threading.Thread(target=_runner, daemon=True).start()

            def on_cancel(self):
                """Request cancellation; run_pipeline checks the flag before each file."""
                self.cancel_flag.set()
                self.lbl_status.config(text="Cancelling...may take a while")

            def check_cancel(self):
                """Return True once cancellation has been requested."""
                return self.cancel_flag.is_set()

        App().mainloop()

if __name__ == "__main__":
    mp.freeze_support() # Safety measure for multiprocessing with Windows/PyInstaller

    main()

#6. Calculate MI and t-score
Read pairs_all.csv and calculate frequencies and association strength scores based on OANC.

In [None]:
import os, re, math, time
import pandas as pd
import spacy
from spacy.tokens import DocBin
try:
    # colab environment
    # Comment out if not using with Google Drive
    #from google.colab import drive
    #drive.mount("/content/drive", force_remount=False)
    _IN_COLAB = True
except Exception:
    drive = None
    _IN_COLAB = False

import shutil, os

import sys
# Make the tutorial folder (where mylib.py was written) importable in Colab.
if _IN_COLAB:
    #sys.path.append('/content/drive/MyDrive/dep_collocation_tutorial')
    sys.path.append('/content/dep_collocation_tutorial')
try:
    import mylib
except Exception as e:
    # Local fallback: retry once with the current working directory on sys.path.
    if not _IN_COLAB:
        sys.path.append(os.getcwd())
        try:
            import mylib
        except Exception as e2:
            # Replace the raw failure with a hint about where mylib.py must live.
            raise ImportError(
                "Failed to import mylib. On local environments, place mylib.py in the same folder as the script"
            ) from e2
    else:
        # No fallback location in Colab: propagate the original import error.
        raise

# ------------------------------------------------------------
# Configuration (Specify the paths)
# ------------------------------------------------------------
# Input CSV path (the alternative below points at Google Drive)
#CSV_PATH = "/content/drive/MyDrive/dep_collocation_tutorial/collocation/pairs_all.csv"
CSV_PATH = "/content/dep_collocation_tutorial/collocation/pairs_all.csv"

# Reference corpus folder path (with .spacy files)
#REF_FOLDER = "/content/drive/MyDrive/dep_collocation_tutorial/OANC/spacy"
REF_FOLDER = "/content/dep_collocation_tutorial/OANC/spacy"

# Output CSV file name (if left empty, defaults to the input CSV name + '_scores.csv');
# the file is written next to the input CSV.
OUTPUT_CSV_NAME = ""  # e.g., "pairs_all_scores.csv"
# ------------------------------------------------------------

# Copy temporarily to /content and process from there (recommended for Google Drive)
USE_CACHE  = False
# Destination folder for the temporary copy made when USE_CACHE is True
CACHE_ROOT = "/content/ref_spacy_cache"

# Optional caching step: only active when USE_CACHE is switched on.
if USE_CACHE:
    # Fail early when the configured reference folder does not exist.
    if not os.path.isdir(REF_FOLDER):
        raise FileNotFoundError(f"Not found: {REF_FOLDER}")

    if "/content/drive" not in REF_FOLDER:
        # Not a Google Drive path: read the reference corpus where it is.
        print("Using virtual drive path directly:", REF_FOLDER)
    else:
        # Google Drive path: mirror the folder into the cache and read from there.
        shutil.copytree(REF_FOLDER, CACHE_ROOT, dirs_exist_ok=True)
        REF_FOLDER = CACHE_ROOT
        print("Using cached input at:", REF_FOLDER)

def norm_pt(s: str) -> str:
    """Return the pair-type label with surrounding whitespace removed, in upper case."""
    label = str(s)
    trimmed = label.strip()
    return trimmed.upper()

def norm_lemma(s: str) -> str:
    """Return a lemma lower-cased and stripped of whitespace and edge punctuation.

    Only non-word characters at the very start or end are removed; characters
    inside the lemma (e.g. the hyphen in "well-known") are kept.
    """
    lowered = str(s).strip().lower()
    # Drop any run of non-word characters anchored at either end of the string.
    return re.sub(r'^[^\w]+|[^\w]+$', '', lowered)

def iter_spacy_files(root):
    """Return the sorted paths of all ``.spacy`` files under ``root``, subfolders included.

    The extension check is case-insensitive.
    """
    found = [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(root)
        for filename in filenames
        if filename.lower().endswith(".spacy")
    ]
    found.sort()
    return found

def collect_counts_from_ref_spacy(ref_folder, nlp, *, needed_rows, needed_left, needed_right, log_func):
    """Count dependency-pair frequencies in the reference corpus (.spacy files).

    Every token of every document is tested against the five pair predicates
    in ``mylib``. The extraction conditions must stay identical to the ones
    used by the pair extractor, otherwise learner and reference counts are
    not comparable.

    Args:
        ref_folder: Folder searched recursively for ``.spacy`` files.
        nlp: Object whose ``vocab`` is used to deserialize the DocBin docs.
        needed_rows: Set of ``(pair_type, lemma1, lemma2)`` keys to count.
        needed_left: Mapping ``pair_type -> set of lemma1`` values to count.
        needed_right: Mapping ``pair_type -> set of lemma2`` values to count.
        log_func: Callable taking one message string. When it is the builtin
            ``print``, per-file progress is drawn on a single console line.

    Returns:
        Tuple ``(pair_ref_freq, lemma1_ref_freq, lemma2_ref_freq,
        total_ref_freq_by_type)``: dicts keyed by ``(pair_type, l1, l2)``,
        ``(pair_type, l1)``, ``(pair_type, l2)`` and ``pair_type``.

    Raises:
        FileNotFoundError: If no ``.spacy`` file exists under ``ref_folder``.
    """
    pair_ref_freq, lemma1_ref_freq, lemma2_ref_freq = {}, {}, {}
    total_ref_freq_by_type = {"ADJ_NOUN": 0, "VERB_OBJ": 0, "VERB_ADV": 0, "ADV_VERB": 0, "ADV_ADJ": 0}

    def tally(pair_type, left_tok, right_tok):
        """Record one hit of ``pair_type`` formed by the two tokens' lemmas."""
        l1, l2 = norm_lemma(left_tok.lemma_), norm_lemma(right_tok.lemma_)
        # The per-type total counts every extracted pair, needed or not.
        total_ref_freq_by_type[pair_type] += 1
        pair_key = (pair_type, l1, l2)
        if pair_key in needed_rows:
            pair_ref_freq[pair_key] = pair_ref_freq.get(pair_key, 0) + 1
        if l1 in needed_left.get(pair_type, set()):
            lemma1_ref_freq[(pair_type, l1)] = lemma1_ref_freq.get((pair_type, l1), 0) + 1
        if l2 in needed_right.get(pair_type, set()):
            lemma2_ref_freq[(pair_type, l2)] = lemma2_ref_freq.get((pair_type, l2), 0) + 1

    spacy_files = iter_spacy_files(ref_folder)
    if not spacy_files:
        raise FileNotFoundError("No '.spacy' files were found in the reference corpus folder.")

    total_files = len(spacy_files)
    log_func(f"Start loading reference corpus: {total_files} .spacy files")

    progress_line_open = False
    for file_idx, path in enumerate(spacy_files, start=1):
        base = os.path.basename(path)
        try:
            db = DocBin().from_disk(path)
        except Exception as e:
            # Best effort: an unreadable file is reported and skipped.
            log_func(f"⚠ Skipped (Failed loading): {base} :: {e}")
            continue

        doc_counter = 0
        for doc in db.get_docs(nlp.vocab):
            doc_counter += 1

            for token in doc:
                # Needed to keep the conditions consistent with the pair extractor
                sent = token.sent

                # A token is the dependent; its head supplies the other lemma.
                # The (left, right) order follows the pair-type name.
                if mylib.is_adj_noun(token):
                    tally("ADJ_NOUN", token, token.head)
                if mylib.is_verb_obj(token):
                    tally("VERB_OBJ", token.head, token)
                if mylib.is_verb_adv(token):
                    tally("VERB_ADV", token.head, token)
                if mylib.is_adv_verb(token, sent):
                    tally("ADV_VERB", token, token.head)
                if mylib.is_adv_adj(token):
                    tally("ADV_ADJ", token, token.head)

        msg = f"[{file_idx}/{total_files}] {base}: {doc_counter} docs"
        if log_func is print:
            # Redraw the same console line; the carriage return is console-only.
            print("\r" + msg, end="", flush=True)
            progress_line_open = True
        else:
            log_func(msg)

    if progress_line_open:
        # Terminate the progress line so later output starts on a fresh line.
        print()

    return pair_ref_freq, lemma1_ref_freq, lemma2_ref_freq, total_ref_freq_by_type

import numpy as np

def add_assoc_scores(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``MI`` and ``t_score`` columns computed from the reference frequencies.

    Reads ``pair_ref_freq``, ``lemma1_ref_freq``, ``lemma2_ref_freq`` and
    ``total_ref_type_freq``. Rows where any of the four values is not
    positive get NaN for both scores. ``df`` is modified in place and returned.

        MI = log2((f_xy * N_t) / (f_x * f_y))
        t  = (f_xy - (f_x * f_y) / N_t) / sqrt(f_xy)
    """
    pair, left, right, total = (
        df[column].astype(float).to_numpy()
        for column in ("pair_ref_freq", "lemma1_ref_freq", "lemma2_ref_freq", "total_ref_type_freq")
    )

    # Both scores are defined on exactly the same rows.
    usable = (pair > 0) & (left > 0) & (right > 0) & (total > 0)
    observed = pair[usable]
    marginal_product = left[usable] * right[usable]
    type_total = total[usable]

    mutual_info = np.full(len(df), np.nan, dtype=float)
    mutual_info[usable] = np.log2((observed * type_total) / marginal_product)

    t_values = np.full(len(df), np.nan, dtype=float)
    expected = marginal_product / type_total
    t_values[usable] = (observed - expected) / np.sqrt(observed)

    df["MI"] = mutual_info
    df["t_score"] = t_values

    return df


def run_pipeline(csv_path, ref_folder, log_func=print):
    """Score the learner pairs in ``csv_path`` against the reference corpus.

    Reads the pair CSV (extracted from learner data), counts the matching
    frequencies in the reference ``.spacy`` files, adds the frequency columns
    plus ``MI`` and ``t_score``, and writes the result as a new CSV.

    Args:
        csv_path: Input CSV with at least ``pair_type``, ``lemma1``, ``lemma2``.
        ref_folder: Folder containing the reference corpus ``.spacy`` files.
        log_func: Callable taking one message string; used for all progress
            and result messages.

    Returns:
        Path of the written CSV: ``OUTPUT_CSV_NAME`` next to the input file,
        or ``<input name>_scores.csv`` when ``OUTPUT_CSV_NAME`` is empty.

    Raises:
        FileNotFoundError: If the CSV or the reference folder does not exist.
        ValueError: If a required column is missing from the CSV.
    """
    # The module-level defaults are overwritten with the values actually used.
    global CSV_PATH, REF_FOLDER
    CSV_PATH = csv_path
    REF_FOLDER = ref_folder

    if not os.path.isfile(CSV_PATH):
        raise FileNotFoundError(f"Input CSV not found.: {CSV_PATH}")
    if not os.path.isdir(REF_FOLDER):
        raise FileNotFoundError(f"Reference corpus folder not found.: {REF_FOLDER}")

    # Read pairs_all.csv (extracted from learner data) into a DataFrame.
    # Everything is kept as str and empty cells stay "" instead of NaN.
    df = pd.read_csv(CSV_PATH, encoding="utf-8-sig", dtype=str, na_filter=False)

    # Check required columns
    for col in ["pair_type", "lemma1", "lemma2"]:
        if col not in df.columns:
            raise ValueError(f"Missing required column(s).: {col}")

    # Normalization (same functions as applied to the reference corpus lemmas)
    df["pair_type"] = df["pair_type"].map(norm_pt)
    df["lemma1"]    = df["lemma1"].map(norm_lemma)
    df["lemma2"]    = df["lemma2"].map(norm_lemma)

    # Deduplicated lookup keys: only these pairs / lemmas are counted
    row_keys = list(zip(df["pair_type"], df["lemma1"], df["lemma2"]))
    needed_rows = set(row_keys)
    needed_left, needed_right = {}, {}
    for pt, l1, l2 in needed_rows:
        needed_left.setdefault(pt, set()).add(l1)
        needed_right.setdefault(pt, set()).add(l2)

    # Since DocBin is already parsed, load only the minimal model
    nlp = spacy.blank("en")

    start_time = time.perf_counter()

    # Get frequencies from the reference corpus
    # pair_counts   = frequency of the pair, e.g. {("ADJ_NOUN", "big", "problem"): 25}
    # marg1_counts  = frequency of the left element, e.g. {("ADJ_NOUN", "big"): 50}
    # marg2_counts  = frequency of the right element, e.g. {("ADJ_NOUN", "problem"): 120}
    # total_by_type = frequency of the pair type, e.g. {"ADJ_NOUN": 10000}
    pair_counts, marg1_counts, marg2_counts, total_by_type = collect_counts_from_ref_spacy(
        REF_FOLDER, nlp,
        needed_rows=needed_rows,
        needed_left=needed_left,
        needed_right=needed_right,
        log_func=log_func,
    )

    # Example of DataFrame df (after loading the CSV)
    # ------------------------------------------------------
    # file     pair_type   lemma1   lemma2   sentence ...
    # 001.txt  ADJ_NOUN    big      problem  "It is a big problem."
    # 002.txt  VERB_OBJ    eat      apple    "I eat an apple."
    # 003.txt  ADV_VERB    quickly  run      "He quickly runs."
    # ------------------------------------------------------

    # Row-wise lookup; anything absent from the reference corpus counts as 0.
    # The columns were normalized above, so the keys match the count tables.
    df["lemma1_ref_freq"]     = [marg1_counts.get((pt, l1), 0) for pt, l1, _ in row_keys]      # f_x
    df["lemma2_ref_freq"]     = [marg2_counts.get((pt, l2), 0) for pt, _, l2 in row_keys]      # f_y
    df["pair_ref_freq"]       = [pair_counts.get(key, 0) for key in row_keys]                  # f_xy
    df["total_ref_type_freq"] = [total_by_type.get(pt, 0) for pt, _, _ in row_keys]            # N_t
    df = add_assoc_scores(df)  # add association strength scores

    log_func(f"✓ Finished: {len(df)} rows, time={time.perf_counter()-start_time:.2f}s")

    if OUTPUT_CSV_NAME:
        out_path = os.path.join(os.path.dirname(CSV_PATH), OUTPUT_CSV_NAME)
    else:
        out_path = f"{os.path.splitext(CSV_PATH)[0]}_scores.csv"

    df.to_csv(out_path, index=False, encoding="utf-8-sig")
    # Report through log_func so the message also reaches a GUI log.
    log_func(f"✓ Created output file: {out_path}")
    return out_path

def main():
    """Run the score calculation: directly in Colab, through a Tk GUI locally.

    In Colab the pipeline runs with the module-level ``CSV_PATH`` and
    ``REF_FOLDER``. Otherwise a window lets the user pick the input CSV and
    the reference "spacy" folder, then runs the pipeline in a background
    thread while progress is shown in a log box.
    """
    if _IN_COLAB:
        #if not os.path.exists("/content/drive"):
            #drive.mount("/content/drive")
        #  In Colab, run with the default path as is
        run_pipeline(CSV_PATH, REF_FOLDER)
    else:
        # --- Local uses GUI ---
        import threading
        import tkinter as tk
        from tkinter import ttk, filedialog, messagebox

        class App(tk.Tk):
            """Main window: input selection, start button, status label and log."""

            def __init__(self):
                super().__init__()
                self.title("ScoreCalculator")
                self.geometry("720x520")
                # The selected paths are held in StringVars bound to the entries
                self.csv_path = tk.StringVar()
                self.ref_dir  = tk.StringVar()

                pad = {"padx": 10, "pady": 8}

                frm_in = ttk.LabelFrame(self, text="Input")
                frm_in.pack(fill="x", **pad)

                # Input file (.csv)
                ttk.Button(frm_in, text="Input file (.csv)", command=self.select_csv)\
                    .pack(anchor="w", padx=10, pady=(10,4))
                ttk.Entry(frm_in, textvariable=self.csv_path, state="readonly")\
                    .pack(fill="x", padx=10, pady=(0,10))

                # Reference folder (spacy)
                ttk.Button(frm_in, text="Reference folder (spacy)", command=self.select_ref)\
                    .pack(anchor="w", padx=10, pady=(10,4))
                ttk.Label(frm_in, text="Select \"spacy\" folder").pack(anchor="w", padx=10, pady=(0, 10))
                ttk.Entry(frm_in, textvariable=self.ref_dir, state="readonly")\
                    .pack(fill="x", padx=10, pady=(0,10))

                # Start button stays disabled until both inputs are selected
                frm_start = ttk.Frame(self)
                frm_start.pack(fill="x", **pad)
                self.btn_start = ttk.Button(frm_start, text="Start", command=self.on_start, state="disabled")
                self.btn_start.pack(side="left")
                self.lbl_status = ttk.Label(frm_start, text="Select CSV and reference folder.")
                self.lbl_status.pack(side="left", padx=10)

                # Log box with a vertical scrollbar
                frm_log = ttk.LabelFrame(self, text="Log")
                frm_log.pack(fill="both", expand=True, padx=10, pady=10)
                cnt = ttk.Frame(frm_log)
                cnt.pack(fill="both", expand=True)
                self.txt_log = tk.Text(cnt, height=14, wrap="none")
                self.txt_log.pack(side="left", fill="both", expand=True)
                self.scroll_y = ttk.Scrollbar(cnt, orient="vertical", command=self.txt_log.yview)
                self.scroll_y.pack(side="right", fill="y")
                self.txt_log.configure(yscrollcommand=self.scroll_y.set)

            def select_csv(self):
                """Ask for the input CSV and store the chosen path."""
                path = filedialog.askopenfilename(
                    title="Select CSV",
                    filetypes=[("CSV files","*.csv"), ("All files","*.*")]
                )
                if path:
                    self.csv_path.set(path)
                    self._update_start_state()

            def select_ref(self):
                """Ask for the reference folder until a valid one is chosen or the dialog is cancelled."""
                while True:
                    path = filedialog.askdirectory(
                        title="Select the reference corpus folder (.spacy files)"
                    )
                    if not path:
                        # Dialog cancelled: keep the previous selection.
                        return

                    # Check if the folder name is "spacy"
                    if os.path.basename(path).lower() != "spacy":
                        messagebox.showwarning(
                            "Invalid folder",
                            "The selected folder is not named 'spacy'.\nPlease select the correct 'spacy' folder."
                        )
                        continue

                    # Recursively search for .spacy files (including subfolders)
                    if not iter_spacy_files(path):
                        messagebox.showwarning(
                            "No .spacy files",
                            "The selected folder does not contain any .spacy files.\nPlease select the correct folder."
                        )
                        continue

                    self.ref_dir.set(path)
                    self._update_start_state()
                    return

            def _update_start_state(self):
                """Enable the start button only when both inputs are set."""
                if self.csv_path.get() and self.ref_dir.get():
                    self.btn_start.configure(state="normal")
                    self.lbl_status.configure(text="Ready to start.")
                else:
                    self.btn_start.configure(state="disabled")
                    self.lbl_status.configure(text="Select CSV and reference folder.")

            def on_start(self):
                """Run the pipeline in a background thread so the window stays responsive."""
                self.btn_start.configure(state="disabled")
                self.lbl_status.configure(text="Processing...")
                self.log("Processing started...\n")

                def worker():
                    final_status = "Finished."
                    try:
                        out_path = run_pipeline(self.csv_path.get(), self.ref_dir.get(), log_func=self.log)
                        self.log(f"Done: {out_path}\n")
                        messagebox.showinfo("Done", f"Finished:\n{out_path}")
                    except Exception as e:
                        # Report the failure in the log and in a dialog.
                        final_status = "Error."
                        self.log(f"Error: {e}\n")
                        messagebox.showerror("Error", str(e))
                    finally:
                        # Allow another run whether this one succeeded or failed.
                        self.btn_start.configure(state="normal")
                        self.lbl_status.configure(text=final_status)

                threading.Thread(target=worker, daemon=True).start()

            def log(self, msg):
                """Append ``msg`` to the log box and scroll to the end."""
                self.txt_log.insert("end", msg + "\n")
                self.txt_log.see("end")
                self.update_idletasks()


        App().mainloop()

# Script entry point: only runs when the file is executed directly, not when imported.
if __name__ == "__main__":
    main()
