In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Change into the project folder on the mounted Drive; later cells read files
# through the relative path "data", which is resolved against this directory.
%cd /content/drive/MyDrive/RMAL_spacy

/content/drive/MyDrive/RMAL_spacy


In [None]:
import os
import spacy
from spacy.matcher import Matcher



# Load the spaCy language model (English here; change as needed,
# e.g. ja_core_news_sm for Japanese).
nlp = spacy.load("en_core_web_sm")

# Token-based matcher that shares the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# Dependency labels accepted for the optional modifier tokens that precede the
# nominal head. Each label must be its own list element: the original list had
# 'predet''nmod' without a comma, which Python silently concatenates into the
# single (never matching) label 'predetnmod'.
# NOTE(review): 'adjmod' does not look like a label the English models emit —
# confirm against nlp.get_pipe("parser").labels or drop it.
modifier_deps = [
    'poss', 'case', 'det', 'amod', 'compound', 'adjmod',
    'advmod', 'nummod', 'predet', 'nmod',
]

# Noun-phrase pattern: zero or more modifier tokens followed by one head token.
pattern = [
    # Modifiers: any of the dependency labels above, excluding tokens whose
    # part of speech is SCONJ or ADP.
    {'DEP': {'IN': modifier_deps}, 'POS': {'NOT_IN': ['SCONJ', 'ADP']}, 'OP': '*'},
    # Head: noun, pronoun or proper noun, excluding tokens tagged PRP$.
    {'POS': {'IN': ['NOUN', 'PRON', 'PROPN']}, 'TAG': {'NOT_IN': ['PRP$']}},
]

# greedy="FIRST" asks the matcher to filter overlapping matches instead of
# returning every candidate span. The rule key is only a label.
matcher.add("THINK_THAT_NP_V", [pattern], greedy="FIRST")

# Folder (relative to the current working directory) holding the input texts.
data_dir = "data"

# === Process every .txt file line by line ===
# sorted() makes the output order reproducible; os.listdir() returns entries
# in arbitrary order.
for fname in sorted(os.listdir(data_dir)):
    if not fname.endswith(".txt"):
        continue
    file_path = os.path.join(data_dir, fname)
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            text = line.strip()
            if not text:
                # Nothing to match on a blank line; skip the parse entirely.
                continue
            doc = nlp(text)
            for _match_id, start, end in matcher(doc):
                print(f"{fname}：{doc[start:end].text}")

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
JPN527_plain.txt：interviewer
JPN527_plain.txt：these experiences
JPN527_plain.txt：I
JPN527_plain.txt：good job
JPN527_plain.txt：the story
JPN527_plain.txt：It
JPN527_plain.txt：this time
JPN527_plain.txt：money
JPN527_plain.txt：I
JPN527_plain.txt：time
JPN527_plain.txt：money
JPN510_plain.txt：what
JPN510_plain.txt：children
JPN510_plain.txt：they
JPN510_plain.txt：video games
JPN510_plain.txt：sports
JPN510_plain.txt：it
JPN510_plain.txt：they
JPN510_plain.txt：the education
JPN510_plain.txt：They
JPN510_plain.txt：it
JPN510_plain.txt：a competition
JPN510_plain.txt：such good scores
JPN510_plain.txt：100 point
JPN510_plain.txt：They
JPN510_plain.txt：a lot
JPN510_plain.txt：things
JPN510_plain.txt：textbooks
JPN510_plain.txt：what
JPN510_plain.txt：their teachers
JPN510_plain.txt：this work
JPN510_plain.txt：they
JPN510_plain.txt：various knowledge
JPN510_plain.txt：they
JPN510_plain.txt：the University
JPN510_plain.txt：their jobs
JPN510_plain.txt：the situation
JPN510_

In [None]:
from spacy.matcher import DependencyMatcher
import spacy

# NOTE: this cell uses `os` and `data_dir`, which it does not define itself;
# they must already exist in the session (an earlier cell creates them).
nlp = spacy.load("en_core_web_sm")
dep_matcher = DependencyMatcher(nlp.vocab)

# In the patterns below, REL_OP ">" means "LEFT_ID is the immediate head of
# RIGHT_ID" and ">>" means "LEFT_ID is an ancestor of RIGHT_ID".

# Pattern 1: reporting verb + complement clause + subject of that clause.
think_pattern = [
    # Anchor: think / say / guess used as a verb.
    {
        "RIGHT_ID": "anchor_verb",
        "RIGHT_ATTRS": {
            "LEMMA": {"IN": ["think", "say", "guess"]},
            "POS": "VERB"
        }
    },
    # Head of the complement clause (ccomp) directly under the anchor.
    # AUX is accepted alongside VERB so that clauses headed by a form of "be"
    # (the "is" example this pattern was written for) are not excluded —
    # presumably the model tags copular "be" as AUX; confirm on sample data.
    {
        "LEFT_ID": "anchor_verb",
        "REL_OP": ">",
        "RIGHT_ID": "embedded_verb",
        "RIGHT_ATTRS": {
            "DEP": "ccomp",
            "POS": {"IN": ["VERB", "AUX"]}
        }
    },
    # Nominal subject of the embedded clause (e.g. "he", "the boy", "she").
    {
        "LEFT_ID": "embedded_verb",
        "REL_OP": ">",
        "RIGHT_ID": "subject_of_embedded",
        "RIGHT_ATTRS": {
            "DEP": "nsubj",
            "POS": {"IN": ["NOUN", "PROPN", "PRON"]}
        }
    }
]

# Pattern 2: "know" + wh-word + verb + subject.
# NOTE(review): this requires the wh-word to be a direct dependent of "know"
# and the clause verb to be a descendant of the wh-word. If the parser instead
# attaches the wh-word underneath the clause verb, this pattern will not fire —
# verify on a few sample sentences before relying on its counts.
know_wh_pattern = [
    # Anchor: "know" used as a verb.
    {
        "RIGHT_ID": "know_verb",
        "RIGHT_ATTRS": {
            "LEMMA": "know",
            "POS": "VERB"
        }
    },
    # Wh-word directly under the anchor.
    {
        "LEFT_ID": "know_verb",
        "REL_OP": ">",
        "RIGHT_ID": "wh_word",
        "RIGHT_ATTRS": {
            "TAG": {"IN": ["WP", "WRB"]}  # WP: who/what, WRB: where/when/why/how
        }
    },
    # A verb anywhere below the wh-word.
    {
        "LEFT_ID": "wh_word",
        "REL_OP": ">>",
        "RIGHT_ID": "wh_clause_verb",
        "RIGHT_ATTRS": {
            "POS": "VERB"
        }
    },
    # Nominal subject of that verb.
    {
        "LEFT_ID": "wh_clause_verb",
        "REL_OP": ">",
        "RIGHT_ID": "wh_clause_subject",
        "RIGHT_ATTRS": {
            "DEP": "nsubj",
            "POS": {"IN": ["NOUN", "PROPN", "PRON"]}
        }
    }
]

# The original cell assigned both patterns to the same name `pattern`, so the
# first one was overwritten before registration and never used. Keep `pattern`
# bound to the value it ended up with, for any code that still reads it.
pattern = know_wh_pattern

# Register each pattern under its own key.
dep_matcher.add("THINK_S_V", [think_pattern])
dep_matcher.add("KNOW_WH_S_V", [know_wh_pattern])

# === Process every line of every .txt file ===
# sorted() makes the output order reproducible; os.listdir() returns entries
# in arbitrary order.
for filename in sorted(os.listdir(data_dir)):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(data_dir, filename)
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            doc = nlp(line)
            for _match_id, token_ids in dep_matcher(doc):
                # token_ids holds one token index per pattern node; print the
                # contiguous stretch of text from the leftmost to the
                # rightmost matched token.
                span = doc[min(token_ids):max(token_ids) + 1]
                print(f"{filename}：{span.text}")
