In [514]:
from openai import OpenAI
import requests
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

True

In [515]:
from pymongo import MongoClient

# The connection string is read from the environment (for example from the
# .env file picked up by load_dotenv()) so that database credentials are not
# stored in the notebook itself.
# Expected form: mongodb://<user>:<password>@<host>:<port>
MONGODB_URI = os.environ.get("MONGODB_URI")
if not MONGODB_URI:
    raise RuntimeError(
        "MONGODB_URI is not set; define it in the environment or in a .env file"
    )

client = MongoClient(MONGODB_URI)

# Database and collection that hold one document per input text.
db = client["speech2IPA"]
collection = db["phonemes"]

def insert_db(text, phonemes, word_alignments = "", char_alignments = ""):
    """Upsert one record into the module-level ``collection``, keyed by ``text``.

    All four fields are written with ``$set``; when no document with the same
    ``text`` exists, one is created. A confirmation line is printed afterwards.
    """
    fields = dict(
        text=text,
        phonemes=phonemes,
        word_alignments=word_alignments,
        char_alignments=char_alignments,
    )
    # Match on the text itself so repeated calls overwrite the same document.
    collection.update_one({"text": text}, {"$set": fields}, upsert=True)

    print("✅ Upsert done")

In [516]:
def call_ollama_api(
    prompt: str,
    model: str = "gpt-oss:120b-cloud",
    timeout: float = 300.0,
) -> "str | None":
    """Send a single-turn chat request to the Ollama server and return the reply text.

    Args:
        prompt: Content of the single user message.
        model: Ollama model tag to query. Other tags previously tried in this
            notebook: "deepseek-v3.1:671b-cloud", "qwen3-vl:235b-cloud".
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests.post`` can block indefinitely on an unresponsive host.

    Returns:
        The ``["message"]["content"]`` string of the JSON response, or ``None``
        when the request fails or the response does not have that shape
        (the error is printed, not raised).
    """
    # Ollama chat API endpoint
    url = "http://192.168.50.217:11434/api/chat"

    payload = {
        "model": model,
        "messages": [{
            "role": "user",
            "content": prompt
        }],
        "think": "medium",
        "stream": False  # False: receive the whole response in one piece
    }

    try:
        response = requests.post(url, json=payload, timeout=timeout)
        response.raise_for_status()
        body = response.json()  # decode once and reuse
        # Echo the full response dict (not just the reply text).
        print(body)
        return body["message"]["content"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # Network/HTTP errors, an undecodable body, or an unexpected JSON shape.
        print(f"API call failed: {e}")
        return None

In [517]:
# Prompt template for phoneme-to-letter alignment.
# Fill with str.format(text=..., phonemes=...); literal JSON braces are doubled.
PROMPT_ALIGNMENT_LETTER = """
You are a linguistic alignment assistant.  
Your task is to align each **phoneme** in the IPA transcription of a word or multi-word phrase with its corresponding **grapheme(s)** (letters) in the English spelling.

Mapping rules:
- If multiple letters map to one phoneme (e.g., "ch" → "tʃ"), group them together.  
- There are DIPHTHONGS - ["tʃ", "dʒ", "oʊ", "oʊ̃", "əʊ", "aʊ", "əɹ", "ɜɹ", "ɔɹ"], which are single phonemes made of two sounds, so they should be treated as one unit.
- ["w", "j"] (the y-sound) are **semivowels**, so the next phoneme usually has same letter as them.
- If one letter contributes to multiple phonemes, repeat it as needed.  
- If the phrase has multiple words, insert an empty phoneme entry between words to mark the space.  
- Also, make sure **no letter from the original input is missing** from the output — every non-space character (including punctuation such as apostrophes ’, hyphens -, diacritics, etc.) must appear in at least one "letter" field; attach punctuation to the letter that follows it (e.g., "'s").
- DO NOT invent or drop any phonemes or letters. If there are some phonemes or letter that do not map to anything, just group to the previous phoneme or letter.

Input:
Text: "{text}"  
Phonemes: "{phonemes}"

Output format:
A JSON list of objects, each with fields:
- "phoneme": string
- "letter": string (or letters that correspond to it)

Example input:
Text: "it's been a while since we've all seen each other"
Phonemes: "ɪts bɪn ʌ waɪl sɪns wiv ɔl sin itʃ ʌðɝ"

Example output:
[
  {{"phoneme": "ɪ", "letter": "i"}},
  {{"phoneme": "t", "letter": "t"}},
  {{"phoneme": "s", "letter": "'s"}},
  {{"phoneme": "", "letter": ""}},
  {{"phoneme": "b", "letter": "b"}},
  {{"phoneme": "ɪ", "letter": "ee"}},
  {{"phoneme": "n", "letter": "n"}},
  {{"phoneme": "", "letter": ""}},
  {{"phoneme": "ʌ", "letter": "a"}},
  {{"phoneme": "", "letter": ""}},
  {{"phoneme": "w", "letter": "wh"}},
  {{"phoneme": "aɪ", "letter": "i"}},
  {{"phoneme": "l", "letter": "le"}},
  {{"phoneme": "", "letter": ""}},
  {{"phoneme": "s", "letter": "s"}},
  {{"phoneme": "ɪ", "letter": "i"}},
  {{"phoneme": "n", "letter": "n"}},
  {{"phoneme": "s", "letter": "ce"}},
  {{"phoneme": "", "letter": ""}},
  {{"phoneme": "w", "letter": "w"}},
  {{"phoneme": "i", "letter": "e"}},
  {{"phoneme": "v", "letter": "'ve"}},
  {{"phoneme": "", "letter": ""}},
  {{"phoneme": "ɔ", "letter": "a"}},
  {{"phoneme": "l", "letter": "ll"}},
  {{"phoneme": "", "letter": ""}},
  {{"phoneme": "s", "letter": "s"}},
  {{"phoneme": "i", "letter": "ee"}},
  {{"phoneme": "n", "letter": "n"}},
  {{"phoneme": "", "letter": ""}},
  {{"phoneme": "i", "letter": "ea"}},
  {{"phoneme": "tʃ", "letter": "ch"}},
  {{"phoneme": "", "letter": ""}},
  {{"phoneme": "ʌ", "letter": "o"}},
  {{"phoneme": "ð", "letter": "th"}},
  {{"phoneme": "ɝ", "letter": "er"}}
]

Rules:
- Use phoneme order to guide alignment, left to right.
- Preserve duplicates if a letter contributes to multiple phonemes.
- Do not add extra commentary — return only valid JSON. 
- Every "letter" value MUST be a contiguous substring of Text (case preserved).
"""

# Prompt template for splitting a continuous phoneme string into per-word segments.
# Fill with str.format(text=..., phonemes=...); literal JSON braces are doubled.
# The example output is kept strictly valid JSON (no trailing comma) because the
# model's reply is parsed with json.loads.
PROMPT_ALIGNMENT_WORD = """
You are a word-to-phoneme alignment assistant.
Your task is to align each word in a given English text with its corresponding group of phonemes from a continuous phoneme string.

Rules:
- Do NOT change, remove, or add ANY characters in the Text or Phonemes input.
- Do NOT insert new phonemes, do NOT correct phonemes, do NOT guess missing sounds.
- ["w", "j"] (the y-sound) are **semivowels**, so the next phoneme usually has same letter as them.
- The total phoneme sequence in the output MUST be exactly the same string as the input (same length, same order).
- Split the continuous phoneme string into segments so that each segment corresponds to one word in the text.
- Word boundaries must match: one word gets one continuous phoneme segment. No reordering.
- Output must preserve the exact spacing of the original text (use a single space between words).
- Do not add extra commentary — return only valid JSON. 

Input:
Text: "{text}"
Phonemes: "{phonemes}"

Output format:
A JSON list of objects, each with fields:
- "word": string
- "phonemes": string

Example output:
[
  {{"word": "february's", "phonemes": "fɛbjuɝiz"}},
  {{"word": "quick", "phonemes": "kwɪk"}},
  {{"word": "movement", "phonemes": "muvmɛnt"}}
]
"""

# Stricter phoneme-to-letter alignment prompt with word-boundary rules.
# Fill with str.format(text=..., phonemes=...); literal JSON braces are doubled.
# Both example outputs are kept strictly valid JSON (no comments, no missing or
# trailing commas) because the prompt itself demands valid JSON in the reply.
PROMPT_TEST = """
You are a linguistic alignment assistant.
Your task is to align each **phoneme** in the IPA transcription of a word or multi-word phrase with its corresponding **grapheme(s)** (letters) in the English spelling.

Core mapping rules:
- If multiple letters map to one phoneme (e.g., "ch" → "tʃ"), group them together.
- Treat the following as **single phoneme units (diphthongs/affricates/rhotics)**: ["tʃ","dʒ","oʊ","oʊ̃","əʊ","aʊ","əɹ","ɜɹ","ɔɹ"].
- ["w","j"] are **semivowels** and typically share letters with the next phoneme in the **same word**.
- If one letter contributes to multiple phonemes, repeat it as needed.
- Insert an empty phoneme/letter pair {{"phoneme":"","letter":""}} between words to mark spaces.
- **Do not invent or drop** any phonemes or letters.

Word-boundary integrity (VERY IMPORTANT):
1) **Letters are confined to their own word.** A letter from word W must not be mapped to a phoneme that belongs to a different word, **except** standard English contractions attaching left (e.g., "'s", "n't", "'re", "'ve", "'ll", "'d") which may attach to the host word.
2) **Leading-silent letters (liaison/elision)**: If the first letter(s) of a word have **no corresponding phoneme** due to cross-word coarticulation (e.g., “out together” pronounced without /t/ at the start of “together”), attach those leading letters to the **first phoneme chunk of the same word**, not to the previous word.
3) **Maximal contiguity inside each word**: Within a word, each "letter" value must be a **contiguous substring** of that word and chunks must be **greedy forward** so that concatenating all "letter" fields of that word (in order) exactly reconstructs the original spelling of that word (case preserved).
4) **No cross-word borrowing for clean-up**: If something doesn't map, you may merge with the **previous phoneme or previous letter only within the same word**. Do **not** use a phoneme/letter from the previous word to fix the next word, and vice versa.

Coverage/validation:
- Every non-space character from the original Text (including punctuation such as apostrophes ’, hyphens -, diacritics, etc.) must appear in at least one "letter" field of **its own word**.
- The concatenation of all "letter" fields across the whole output (including the empty entries for spaces) must equal the original Text exactly.
- The number and order of phoneme units must follow the provided phoneme string (left → right). Do not add or remove phonemes.

Input:
Text: "{text}"
Phonemes: "{phonemes}"

Output format:
Return **only** a JSON list of objects (no commentary). Each object:
- "phoneme": string (may be "" only for inter-word space markers)
- "letter": string (a contiguous substring from the original Text, case preserved; may be "" only for inter-word space markers)

Example (space handling + elision at word start):
Text: "out together"
Phonemes: "aʊt ɨɡɛðɝ"

Valid output:
[
  {{"phoneme":"aʊ","letter":"ou"}},
  {{"phoneme":"t","letter":"t"}},
  {{"phoneme":"","letter":""}},
  {{"phoneme":"ɨ","letter":"to"}},
  {{"phoneme":"ɡ","letter":"g"}},
  {{"phoneme":"ɛ","letter":"e"}},
  {{"phoneme":"ð","letter":"th"}},
  {{"phoneme":"ɝ","letter":"er"}}
]
(In this example the leading 't' of "together" has no phoneme, so it is attached to the first phoneme of the SAME word.)

Another example (contractions allowed to attach left):
Text: "january's sale"
Phonemes: "dʒænjuwɛɹiseɪl"
[
  {{"phoneme": "dʒ", "letter": "j"}},
  {{"phoneme": "æ", "letter": "a"}},
  {{"phoneme": "n", "letter": "n"}},
  {{"phoneme": "j", "letter": "u"}},
  {{"phoneme": "u", "letter": "u"}},
  {{"phoneme": "w", "letter": "a"}},
  {{"phoneme": "ɛ", "letter": "a"}},
  {{"phoneme": "ɹ", "letter": "r"}},
  {{"phoneme": "i", "letter": "y"}},
  {{"phoneme": "", "letter": "'s"}},
  {{"phoneme": "", "letter": ""}},
  {{"phoneme": "s", "letter": "s"}},
  {{"phoneme": "eɪ", "letter": "a"}},
  {{"phoneme": "l", "letter": "le"}}
]

Rules recap:
- Use phoneme order to guide alignment, left to right.
- Preserve duplicates if a letter contributes to multiple phonemes (repeat as needed).
- Keep letters within their own word; on elision, attach leading letters to the first following phoneme of the same word.
- Return ONLY valid JSON as specified.

"""

In [518]:
# Sample input pair: the English text and its IPA transcription written as one
# continuous phoneme string (no spaces between words).
text_input = "february's sale"
phoneme_input = "fɛbjuwɛɹiseɪl"

In [519]:
# Store the raw text/phoneme pair. No alignments are passed, so insert_db
# writes its "" defaults for word_alignments and char_alignments.
insert_db(text_input, phoneme_input)

✅ Upsert done


In [520]:
# Step 1: ask the model to split the continuous phoneme string into one
# segment per word of the text.
word_prompt = PROMPT_ALIGNMENT_WORD.format(text=text_input, phonemes=phoneme_input)
response = call_ollama_api(word_prompt)
print(response)

{'model': 'gpt-oss:120b-cloud', 'remote_model': 'gpt-oss:120b', 'remote_host': 'https://ollama.com:443', 'created_at': '2025-10-29T11:22:17.200783884Z', 'message': {'role': 'assistant', 'content': '[\n  {"word": "february\'s", "phonemes": "fɛbjuwɛɹi"},\n  {"word": "sale", "phonemes": "seɪl"}\n]', 'thinking': 'We need to align words to phoneme segments. Text is "february\'s sale". Two words: "february\'s" and "sale". Phoneme string: "fɛbjuwɛɹiseɪl". Need to split into two contiguous segments corresponding to each word, preserving exactly.\n\nWe need to figure segmentation. Let\'s parse phonemes: f ɛ b j u w ɛ ɹ i s e ɪ l ? Wait actual string: "fɛbjuwɛɹiseɪl". Let\'s list characters: f, ɛ, b, j, u, w, ɛ, ɹ, i, s, e, ɪ, l. That\'s 13 symbols.\n\nWord "february\'s": likely pronounced /ˈfɛbjuːˌɛriːz/ or something. But note the phoneme string: fɛbjuwɛɹi s eɪl? Actually "sale" is /seɪl/. The later part "seɪl" corresponds to sale phonemes: s e ɪ l. Indeed we have "...s eɪl" but we see "seɪl" i

In [521]:
import json 

# call_ollama_api returns None on failure; fail here with a clear message
# instead of an opaque TypeError from json.loads.
if response is None:
    raise RuntimeError("No response from the model; cannot parse word alignments")

result = json.loads(response)

# Rebuild the phoneme string with exactly one space between word segments.
# str.join leaves no trailing separator, so the value interpolated into the
# next prompt does not end with a stray space.
phonemes = " ".join(item['phonemes'] for item in result)

phonemes

'fɛbjuwɛɹi seɪl'

In [522]:
# Step 2: align individual phonemes to letters, feeding the model the
# word-segmented phoneme string built from the previous response.
letter_prompt = PROMPT_TEST.format(text=text_input, phonemes=phonemes)
response = call_ollama_api(letter_prompt)
print(response)

{'model': 'gpt-oss:120b-cloud', 'remote_model': 'gpt-oss:120b', 'remote_host': 'https://ollama.com:443', 'created_at': '2025-10-29T11:22:45.147110174Z', 'message': {'role': 'assistant', 'content': '[\n  {"phoneme":"f","letter":"f"},\n  {"phoneme":"ɛ","letter":"e"},\n  {"phoneme":"b","letter":"b"},\n  {"phoneme":"j","letter":"r"},\n  {"phoneme":"u","letter":"u"},\n  {"phoneme":"w","letter":""},\n  {"phoneme":"ɛ","letter":"a"},\n  {"phoneme":"ɹ","letter":"r"},\n  {"phoneme":"i","letter":"y"},\n  {"phoneme":"","letter":"\'s"},\n  {"phoneme":"","letter":""},\n  {"phoneme":"s","letter":"s"},\n  {"phoneme":"eɪ","letter":"a"},\n  {"phoneme":"l","letter":"le"}\n]', 'thinking': 'We need to produce JSON list aligning phonemes to letters for "february\'s sale" with phoneme string "fɛbjuwɛɹi seɪl ". Note there is a trailing space in phonemes indicating maybe word-space after sale. We\'ll need to map.\n\nFirst, split text into words including apostrophe s: "february\'s" and "sale". The text: f e b 

In [523]:
# Sanity check: rebuild the text and the phoneme string from the alignment.
temp = json.loads(response)

# An entry whose letter and phoneme are both empty marks a word boundary and
# becomes a single space; every other entry contributes its letters as-is.
temp1 = "".join(
    " " if (t['letter'] == "" and t['phoneme'] == "") else t['letter']
    for t in temp
)
# Phonemes are concatenated with no separator at all.
temp2 = "".join(t['phoneme'] for t in temp)

temp1

"february's sale"

In [524]:
# Show the phonemes concatenated from the alignment entries (no separators),
# e.g. to compare by eye with phoneme_input.
temp2

'fɛbjuwɛɹiseɪl'