In [None]:
import os
import re
from collections import Counter

from nltk import ngrams
from transformers import AutoTokenizer



## STEP 1

### experiments

In [None]:
import re
from transformers import AutoTokenizer
from nltk import ngrams
from collections import Counter

# Load the tokenizer (BERT in this example). In its output, every piece that
# continues a split word carries a leading "##".
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
text = ("The patient was diagnosed with myocarditis and cardiomyopathy. "
        "Also, pheeiong simenowon. The patient had chronic kidney disease and myocarditis symptoms "
        "along with the severe symptoms of the pheeiong simenowon.")
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)


def merge_wordpieces(pieces):
    """Rebuild whole words from a list of WordPiece tokens.

    Args:
        pieces: token strings in text order; continuation pieces start with "##".

    Returns:
        A list of ``(word, was_split)`` tuples in text order. ``was_split`` is
        True when the word was assembled from more than one piece.
    """
    words = []
    for piece in pieces:
        if piece.startswith("##") and words:
            # Continuation piece: glue it (without the "##") onto the word in progress.
            word, _ = words[-1]
            words[-1] = (word + piece[2:], True)
        else:
            words.append((piece, False))
    return words


# A single merge pass feeds Steps 1-3 (previously the same loop was written twice).
merged_words = merge_wordpieces(tokens)

# --- Step 1: Merge subword tokens to extract candidate unknown words ---
candidate_terms = [word for word, was_split in merged_words if was_split]
print("Candidate Terms from Subwords:", candidate_terms)

# --- Step 2: Remove duplicates to get unique new words ---
# dict.fromkeys keeps first-occurrence order, so the printed list is the same
# on every run (list(set(...)) gave an arbitrary order).
unique_new_words = list(dict.fromkeys(candidate_terms))
print("Unique New Words:", unique_new_words)

# --- Step 3: Build a sequence of new words in the order of occurrence ---
# Split words always qualify; an unsplit token qualifies only if it equals one
# of the new words.
new_word_lookup = set(unique_new_words)
unknown_sequence = [
    word
    for word, was_split in merged_words
    if was_split or word in new_word_lookup
]
print("Sequence of New Words in Order:", unknown_sequence)

# --- Step 4: N-gram analysis on the sequence of new words ---
# We check for bigrams and trigrams (adjust as needed)
# Kept as a list in first-seen order so the output is deterministic.
final_phrases = []
for n in [2, 3]:
    ngram_counts = Counter(ngrams(unknown_sequence, n))
    # Consider phrases that appear at least twice
    for gram, count in ngram_counts.items():
        if count >= 2:
            final_phrases.append(" ".join(gram))

print("Frequent 2/3-grams among new words:", final_phrases)

# --- Step 5: Combine the phrases and remaining unique new words ---
# Remove individual words that are part of any detected phrase
words_in_phrases = {word for phrase in final_phrases for word in phrase.split()}
remaining_words = [word for word in unique_new_words if word not in words_in_phrases]

# The final result is the detected phrases followed by the remaining words
final_detected = final_phrases + remaining_words
print("Final detected new terms:", final_detected)


Tokens: ['the', 'patient', 'was', 'diagnosed', 'with', 'my', '##oca', '##rdi', '##tis', 'and', 'card', '##iom', '##yo', '##pathy', '.', 'also', ',', 'ph', '##ee', '##ion', '##g', 'sim', '##eno', '##won', '.', 'the', 'patient', 'had', 'chronic', 'kidney', 'disease', 'and', 'my', '##oca', '##rdi', '##tis', 'symptoms', 'along', 'with', 'the', 'severe', 'symptoms', 'of', 'the', 'ph', '##ee', '##ion', '##g', 'sim', '##eno', '##won', '.']
Candidate Terms from Subwords: ['myocarditis', 'cardiomyopathy', 'pheeiong', 'simenowon', 'myocarditis', 'pheeiong', 'simenowon']
Unique New Words: ['pheeiong', 'simenowon', 'cardiomyopathy', 'myocarditis']
Sequence of New Words in Order: ['myocarditis', 'cardiomyopathy', 'pheeiong', 'simenowon', 'myocarditis', 'pheeiong', 'simenowon']
Frequent 2/3-grams among new words: {'pheeiong simenowon'}
Final detected words & phrases: ['pheeiong simenowon', 'cardiomyopathy', 'myocarditis']


In [None]:
# Define your model name. The Hugging Face access token is read from the
# HF_TOKEN environment variable so that no credential is stored in the notebook.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
auth_token = os.environ.get("HF_TOKEN")  # None if unset; presumably the cached CLI login is then used - TODO confirm

# Load the tokenizer. trust_remote_code is not passed: the same tokenizer is
# loaded without it in the model-loading cell further down.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=auth_token)

text = (
    "To assess the efficacy of our gene name extraction tool, we will analyze a text chunk containing a variety of gene symbols and full names. This includes well-known genes like TP53, BRCA1, and EGFR, alongside developmental genes such as SHH, WNT3A, and HOXD13. We will also test its ability to identify less common genes like FOXP2, STAT3, and VEGF, as well as genes involved in metabolism such as APOE, LDLR, and INS. Furthermore, we will include genes with numerical suffixes like CDK4 and ERBB2, and genes with hyphenated names such as HLA-DRB1 and TNF-alpha. Finally, we will incorporate gene families like the KRAS family and the MYC family to see if the function can handle these broader references."
)

# Keep only letters, digits, whitespace and hyphens.
text = re.sub(r"[^A-Za-z0-9\s-]", "", text)


# 1) Tokenize
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

# 2) Group subtokens into words: a piece starting with '▁' opens a new word,
#    any other piece continues the current one.
words_subtokens = []
for tok in tokens:
    if tok.startswith("▁") or not words_subtokens:
        words_subtokens.append([tok])
    else:
        words_subtokens[-1].append(tok)

# 3) Reconstruct the actual words (strip '▁', concat, drop trailing punctuation)
reconstructed = [
    re.sub(r"[^\w]+$", "", subtoks[0].lstrip("▁") + "".join(subtoks[1:]))
    for subtoks in words_subtokens
]

# 4) Build the sequence of "unknown" words (those split into >1 subtoken).
#    Words that are empty after punctuation stripping are skipped so that ""
#    is never reported as a term.
unknown_sequence = [
    word
    for word, subtoks in zip(reconstructed, words_subtokens)
    if len(subtoks) > 1 and word
]
print("Candidate Terms from Subwords:", unknown_sequence)

# 5) Find frequent 2- and 3-grams in that sequence (threshold >= 2).
#    The n-grams are kept as tuples for matching; the joined strings are only
#    built for printing.
frequent_by_size = {}
for n in (2, 3):
    counts = Counter(ngrams(unknown_sequence, n))
    frequent_by_size[n] = [gram for gram, cnt in counts.items() if cnt >= 2]
freq_2grams = [" ".join(gram) for gram in frequent_by_size[2]]
freq_3grams = [" ".join(gram) for gram in frequent_by_size[3]]

print("Frequent 2-grams among new words:", freq_2grams)
print("Frequent 3-grams among new words:", freq_3grams)

# 6) Assemble final_unknown_terms, collapsing any repeated phrases
freq_phrases = set(frequent_by_size[2]) | set(frequent_by_size[3])
final_unknown_terms = []
seen_terms = set()  # O(1) duplicate check; the list keeps first-appearance order
i = 0
while i < len(unknown_sequence):
    term, step = unknown_sequence[i], 1
    # try longer phrases first
    for n in (3, 2):
        window = tuple(unknown_sequence[i:i + n])
        if len(window) == n and window in freq_phrases:
            term, step = " ".join(window), n
            break
    if term not in seen_terms:
        seen_terms.add(term)
        final_unknown_terms.append(term)
    i += step

print("final_unknown_terms:", final_unknown_terms)


Tokens: ['▁To', '▁assess', '▁the', '▁eff', 'ic', 'acy', '▁of', '▁our', '▁gene', '▁name', '▁extr', 'action', '▁tool', '▁we', '▁will', '▁analyze', '▁a', '▁text', '▁chunk', '▁containing', '▁a', '▁variety', '▁of', '▁gene', '▁symbols', '▁and', '▁full', '▁names', '▁This', '▁includes', '▁well', '-', 'known', '▁genes', '▁like', '▁T', 'P', '5', '3', '▁BR', 'CA', '1', '▁and', '▁E', 'G', 'FR', '▁alongside', '▁development', 'al', '▁genes', '▁such', '▁as', '▁SH', 'H', '▁W', 'NT', '3', 'A', '▁and', '▁HO', 'X', 'D', '1', '3', '▁We', '▁will', '▁also', '▁test', '▁its', '▁ability', '▁to', '▁identify', '▁less', '▁common', '▁genes', '▁like', '▁FO', 'X', 'P', '2', '▁STAT', '3', '▁and', '▁V', 'E', 'GF', '▁as', '▁well', '▁as', '▁genes', '▁involved', '▁in', '▁met', 'abol', 'ism', '▁such', '▁as', '▁A', 'PO', 'E', '▁L', 'DL', 'R', '▁and', '▁IN', 'S', '▁Furthermore', '▁we', '▁will', '▁include', '▁genes', '▁with', '▁numerical', '▁suffix', 'es', '▁like', '▁CD', 'K', '4', '▁and', '▁ER', 'BB', '2', '▁and', '▁genes',

### New Terms DETECTION Algorithm [step 1]

In [None]:
# Define your model name. The Hugging Face access token is read from the
# HF_TOKEN environment variable so that no credential is stored in the notebook.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
auth_token = os.environ.get("HF_TOKEN")  # None if unset; presumably the cached CLI login is then used - TODO confirm
# trust_remote_code is not passed: the same tokenizer is loaded without it in
# the model-loading cell further down.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=auth_token)

In [None]:
def extract_unknown_terms(text, tokenizer, min_count=2, ngram_sizes=(2, 3)):
    """
    Find 'unknown' words in `text` and collapse repeated runs of them into phrases.

    A word is treated as unknown when the tokenizer splits it into more than
    one piece. Runs of consecutive unknown words that repeat often enough are
    reported as a single phrase instead of as separate words.

    Args:
        text: Input string. Everything except ASCII letters, digits,
            whitespace and hyphens is removed before tokenizing. ``None`` or
            an empty string yields ``[]``.
        tokenizer: Object with a ``tokenize(str) -> list[str]`` method whose
            pieces mark the start of a word with a leading '▁'
            (SentencePiece-style).
        min_count: Minimum number of occurrences for an n-gram of unknown
            words to count as a phrase (default 2).
        ngram_sizes: Phrase lengths, in words, to look for (default 2 and 3).
            Longer phrases take precedence when matching.

    Returns:
        list[str]: unknown words and space-joined phrases, each listed once,
        in order of first appearance.

    NOTE(review): n-grams are counted over the sequence of unknown words only,
    so two unknown words separated by ordinary words in the text still count
    as neighbours. Confirm this is the intended notion of a "phrase".
    """
    if not text:
        return []

    text = re.sub(r"[^A-Za-z0-9\s-]", "", text)
    # Collapse newlines, tabs and repeated blanks into single spaces. Byte-fallback
    # tokenizers may emit such characters as pieces without a '▁' marker (e.g.
    # '<0x0A>' - presumably the case for the tokenizer used in this notebook),
    # and step 2 would glue those onto the preceding word.
    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return []

    # 1) Tokenize
    tokens = tokenizer.tokenize(text)

    # 2) Group subtokens into words: a piece starting with '▁' opens a new
    #    word, any other piece continues the current one.
    words_subtokens = []
    for tok in tokens:
        if tok.startswith("▁") or not words_subtokens:
            words_subtokens.append([tok])
        else:
            words_subtokens[-1].append(tok)

    # 3) + 4) Rebuild each multi-piece word (strip '▁', concat, drop trailing
    #         punctuation) and collect them in text order.
    unknown_sequence = []
    for subtoks in words_subtokens:
        if len(subtoks) < 2:
            continue
        word = subtoks[0].lstrip("▁") + "".join(subtoks[1:])
        word = re.sub(r"[^\w]+$", "", word)
        if word:  # a punctuation-only word strips down to "", which is not a term
            unknown_sequence.append(word)

    # 5) Find repeated n-grams (as tuples), longest size first for step 6.
    sizes = sorted({n for n in ngram_sizes if n > 0}, reverse=True)
    frequent = set()
    for n in sizes:
        # zip over n shifted views of the sequence yields every window of n words.
        windows = zip(*(unknown_sequence[offset:] for offset in range(n)))
        frequent.update(
            gram for gram, cnt in Counter(windows).items() if cnt >= min_count
        )

    # 6) Assemble final list, collapsing repeated phrases
    final_unknown_terms = []
    seen_terms = set()  # O(1) duplicate check; the list keeps first-appearance order
    i = 0
    while i < len(unknown_sequence):
        term, step = unknown_sequence[i], 1
        for n in sizes:  # try longer first
            window = tuple(unknown_sequence[i:i + n])
            if len(window) == n and window in frequent:
                term, step = " ".join(window), n
                break
        if term not in seen_terms:
            seen_terms.add(term)
            final_unknown_terms.append(term)
        i += step

    return final_unknown_terms


In [None]:
# Sample passage mixing ordinary prose with gene symbols, hyphenated names and
# gene-family mentions; it is fed to extract_unknown_terms below.
text = (
    "To assess the efficacy of our gene name extraction tool, we will analyze a text chunk containing a variety of gene symbols and full names. "
    "This includes well known genes like TP53, BRCA1, and EGFR, alongside developmental genes such as SHH, WNT3A, and HOXD13. "
    "We will also test its ability to identify less common genes like FOXP2, STAT3, and VEGF, as well as genes involved in metabolism such as APOE, LDLR, and INS. "
    "Furthermore, we will include genes with numerical suffixes like CDK4 and ERBB2, and genes with hyphenated names such as HLA-DRB1 and TNF-alpha. "
    "Finally, we will incorporate gene families like the KRAS family and the MYC family to see if the function can handle these broader references."
)

# Run the detector with the tokenizer loaded earlier in the notebook and show the result.
detected_terms = extract_unknown_terms(text, tokenizer)
print("final_unknown_terms:", detected_terms)


final_unknown_terms: ['efficacy', 'extraction', 'TP53', 'BRCA1', 'EGFR', 'developmental', 'SHH', 'WNT3A', 'HOXD13', 'FOXP2', 'STAT3', 'VEGF', 'metabolism', 'APOE', 'LDLR', 'INS', 'suffixes', 'CDK4', 'ERBB2', 'hyphenated', 'HLA-DRB1', 'TNF-alpha', 'KRAS', 'MYC']


## STEP 2

In [None]:
!huggingface-cli login
!pip install -U bitsandbytes
!pip install flash-attn --no-build-isolation




In [None]:
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
# Read the access token from the environment instead of embedding it in the notebook.
hf_token = os.environ.get("HF_TOKEN")  # None if unset; presumably the cached CLI login is then used - TODO confirm

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=hf_token,
    # 8-bit loading requires bitsandbytes. It is passed through a
    # BitsAndBytesConfig because the bare `load_in_8bit` argument is deprecated
    # (see the warning this cell used to print).
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    # Replaces the deprecated `use_flash_attention_2=True`.
    attn_implementation="flash_attention_2",
    torch_dtype=torch.float16,
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]