# 짭오링고(듀오링고 아닙니다~)

필요한 태스크

1. 영어 단어 파일.csv(랜덤으로 단어 픽)
2. 단어를 통한 문장 생성기(예문 만들기 용)
3. 유사 단어 생성기(단어 찾기 게임)
4. 한글 번역기
5. 스트림랩

In [12]:
import os, torch, re, random, time
from typing import List
import pandas as pd
from dotenv import load_dotenv


# Load variables from a local .env file (if present) into the environment.
load_dotenv()
# Hugging Face access token; None when the HF_TOKEN variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# Opt in to the hf_transfer download backend of huggingface_hub.
# NOTE(review): presumably requires the optional `hf_transfer` package to be
# installed — confirm.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Device index handed to the pipelines below: 0 when CUDA is available
# (reported as "cuda:0"), otherwise -1 (reported as "cpu").
DEVICE = 0 if torch.cuda.is_available() else -1
print("Device:", "cuda:0" if DEVICE == 0 else "cpu")


Device: cuda:0


In [13]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, set_seed
# Fix the transformers RNG seed so that sampled generations are repeatable.
set_seed(1337)

def get_generator(model_id: str = "google/flan-t5-small", token: str | bool | None = None):
    """Build a text2text-generation pipeline on the notebook's DEVICE.

    Args:
        model_id: Hugging Face model identifier to load.
        token: Hub credential. An explicit value (token string, ``True`` or
            ``False``) is passed through unchanged. When ``None``, the
            ``HF_TOKEN`` environment value is used if it is set.

    Returns:
        The pipeline object created by ``transformers.pipeline``.
    """
    # Fall back to None rather than True: token=True makes the hub *require* a
    # locally stored login, which breaks anonymous access to public models.
    # None still lets the hub pick up a stored login when one exists.
    auth = token if token is not None else (HF_TOKEN or None)
    return pipeline(
        task="text2text-generation",
        model=model_id,
        device=DEVICE,
        token=auth,
    )


def get_translator_en2ko(model_id: str = "facebook/m2m100_418M", token: str | bool | None = None):
    """Build an English-to-Korean translation pipeline on the notebook's DEVICE.

    Args:
        model_id: Hugging Face model identifier of the seq2seq translation model.
        token: Hub credential. An explicit value (token string, ``True`` or
            ``False``) is passed through unchanged. When ``None``, the
            ``HF_TOKEN`` environment value is used if it is set.

    Returns:
        A ``translation`` pipeline configured with ``src_lang="en"`` and
        ``tgt_lang="ko"``.
    """
    # Resolve the credential once. Fall back to None rather than True:
    # token=True makes the hub *require* a locally stored login, which breaks
    # anonymous access to public models.
    auth = token if token is not None else (HF_TOKEN or None)
    tok = AutoTokenizer.from_pretrained(model_id, token=auth)
    mdl = AutoModelForSeq2SeqLM.from_pretrained(model_id, token=auth)
    return pipeline(
        task="translation",
        model=mdl,
        tokenizer=tok,
        src_lang="en",
        tgt_lang="ko",
        device=DEVICE,
        token=auth,
    )

def get_translator_ko2en(model_id: str = "facebook/m2m100_418M", token: str | bool | None = None):
    """Build a Korean-to-English translation pipeline on the notebook's DEVICE.

    Args:
        model_id: Hugging Face model identifier of the seq2seq translation model.
        token: Hub credential. An explicit value (token string, ``True`` or
            ``False``) is passed through unchanged. When ``None``, the
            ``HF_TOKEN`` environment value is used if it is set.

    Returns:
        A ``translation`` pipeline configured with ``src_lang="ko"`` and
        ``tgt_lang="en"``.
    """
    # Resolve the credential once. Fall back to None rather than True:
    # token=True makes the hub *require* a locally stored login, which breaks
    # anonymous access to public models.
    auth = token if token is not None else (HF_TOKEN or None)
    tok = AutoTokenizer.from_pretrained(model_id, token=auth)
    mdl = AutoModelForSeq2SeqLM.from_pretrained(model_id, token=auth)
    return pipeline(
        task="translation",
        model=mdl,
        tokenizer=tok,
        src_lang="ko",
        tgt_lang="en",
        device=DEVICE,
        token=auth,
    )

# Instantiate the shared pipelines used by the rest of the notebook.
gen = get_generator()
en2ko = get_translator_en2ko()  # the game uses EN→KO hints

# Smoke test: run one generation and one translation and print the results.
print("[GEN]", gen("Write a short sentence with the word 'time'.", max_new_tokens=24)[0]["generated_text"])
print("[TR ]", en2ko("This is an online pipeline test.", max_new_tokens=24)[0]["translation_text"])


Device set to use cuda:0
Device set to use cuda:0


[GEN] time s time s time s time s time s time s
[TR ] 이것은 온라인 파이프라인 테스트입니다.


In [14]:
CSV_PATH = "Concreteness_english.csv"
WORD_COL = "Word"

def load_wordlist():
    if os.path.exists(CSV_PATH):
        try:
            df = pd.read_csv(CSV_PATH)
            if WORD_COL in df.columns:
                words = (df[WORD_COL].astype(str).str.strip().str.lower()
                         .dropna().unique().tolist())
                words = [w for w in words if w.isalpha() and len(w) >= 2]
            else:
                words = []
        except Exception:
            words = []
    else:
        words = []
    if not words:
        words = ["apple", "river", "music", "future", "pattern", "me", "friend", "time"]
    return words

# Load the word list once and preview its first 20 entries as a DataFrame.
WORDS = load_wordlist()
pd.DataFrame({"Word": WORDS[:20]})


Unnamed: 0,Word
0,roadsweeper
1,traindriver
2,tush
3,hairdress
4,pharmaceutics
5,hoover
6,shopkeeping
7,pushiness
8,underdevelop
9,tirelessness


In [15]:
MAX_WORDS       = 6      # upper bound on words in a generated sentence (difficulty)
MAX_NEW_TOKENS  = 24     # max_new_tokens passed to the generation and translation pipelines
RETRIES_GEN     = 6      # generation attempts per word before using the fallback sentence
# A word is a run of ASCII letters that may contain internal apostrophes
# ("don't", "o'clock"). Leading and trailing apostrophes are treated as quote
# marks and left out, so "'time'" yields "time" and a lone "'" yields nothing.
WORD_RE         = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)*")

def tokenize_words(text: str) -> List[str]:
    """Return the words of *text* in order, without punctuation.

    Args:
        text: Sentence to split.

    Returns:
        Every ``WORD_RE`` match in left-to-right order; an empty list when
        *text* contains no ASCII letters.
    """
    return WORD_RE.findall(text)

def enforce_sentence_end(s: str) -> str:
    """Strip *s* and make sure it ends with sentence punctuation.

    Args:
        s: Raw sentence text.

    Returns:
        The stripped text, with a trailing "." appended unless it already
        ends in ".", "!" or "?".
    """
    trimmed = s.strip()
    if trimmed.endswith((".", "!", "?")):
        return trimmed
    return trimmed + "."

def is_tautology(sent: str, word: str) -> bool:
    """Tell whether *sent* contains a self-definition of *word*.

    Matches "<word> is <word>" with an optional article ("a", "an" or "the")
    before the second occurrence, case-insensitively and on whole words only.

    Args:
        sent: Sentence to inspect.
        word: Target word.

    Returns:
        True if such a pattern occurs anywhere in *sent*.
    """
    w = re.escape(word.lower())
    # "an?" also covers vowel-initial words ("apple is an apple"), which the
    # articles "a" and "the" alone miss.
    pattern = rf"\b{w}\b\s+is\s+(?:(?:an?|the)\s+)?\b{w}\b"
    return re.search(pattern, sent.lower()) is not None

def build_prompt(word: str) -> str:
    """Compose the generation prompt for *word*.

    The prompt asks for a single conversational sentence under ``MAX_WORDS``
    words that uses the word exactly once and is not a self-definition.
    """
    # No definition sentences + the word only once.
    instructions = [
        f"Write one natural English sentence under {MAX_WORDS} words",
        f"that uses the word '{word}' exactly once.",
        f'Avoid definitions like "{word} is a {word}" and keep it conversational.',
    ]
    return " ".join(instructions)

def generate_sentence_with_word(word: str, gen_pipe) -> str:
    """Generate a short English sentence that contains *word*.

    A candidate is accepted when it contains the word as a whole word once or
    twice, has between 3 and ``MAX_WORDS`` words, and is not a self-definition
    according to ``is_tautology``.

    Args:
        word: Target word; lower-cased before use.
        gen_pipe: Callable invoked as ``gen_pipe(prompt, **generation_kwargs)``
            whose first result holds the text under ``"generated_text"``.

    Returns:
        The first accepted candidate, or a fixed fallback sentence containing
        the word when all ``RETRIES_GEN`` attempts are rejected.
    """
    word = word.lower()
    prompt = build_prompt(word)
    word_pat = re.compile(rf"\b{re.escape(word)}\b", flags=re.IGNORECASE)

    for attempt in range(RETRIES_GEN):
        # Greedy decoding returns the same text for the same prompt, so
        # repeating it is wasted work. Only the first attempt is greedy; the
        # retries sample so they can produce a different candidate.
        out = gen_pipe(prompt,
                       max_new_tokens=MAX_NEW_TOKENS,
                       do_sample=attempt > 0,
                       num_return_sequences=1)[0]["generated_text"]
        out = enforce_sentence_end(out)
        n_words = len(tokenize_words(out))
        # The prompt asks for one use; a second one is tolerated to leave room
        # for idiomatic repetition.
        n_hits = len(word_pat.findall(out))
        if 1 <= n_hits <= 2 and 3 <= n_words <= MAX_WORDS and not is_tautology(out, word):
            return out

    # Fallback: always contains the word, short and natural.
    if word == "me":
        return "This is me."
    return f"I like {word}."

def translate_en2ko(text: str, tr_pipe) -> str:
    """Translate *text* with *tr_pipe* and return the translated string.

    Args:
        text: Text to translate.
        tr_pipe: Callable invoked as ``tr_pipe(text, max_new_tokens=...)``
            whose first result holds the text under ``"translation_text"``.
    """
    results = tr_pipe(text, max_new_tokens=MAX_NEW_TOKENS)
    first = results[0]
    return first["translation_text"]


In [16]:
from ipywidgets import VBox, HBox, Button, HTML, Layout
from IPython.display import display

class MemoryOrderGame:
    """Word-order puzzle rendered with ipywidgets.

    Each round picks a random word, generates a short English sentence that
    contains it, shows the Korean translation as a hint and asks the player to
    click the shuffled words back into their original order. ``box`` is the
    root widget to display.
    """

    def __init__(self, words, gen_pipe, tr_pipe):
        """Build the widget tree and start the first round.

        Args:
            words: Sequence of candidate words. When empty, every round uses
                the built-in fallback sentence.
            gen_pipe: Generation pipeline handed to
                ``generate_sentence_with_word``.
            tr_pipe: Translation pipeline handed to ``translate_en2ko``.
        """
        self.words = words
        self.gen = gen_pipe
        self.tr  = tr_pipe

        self.round_id = 0
        self.score = {"correct": 0, "total": 0}
        self.hint_on = False

        # Top labels
        self.lbl_round = HTML("<b>라운드:</b> 0")
        self.lbl_hint  = HTML("<b>한국어 힌트:</b> —")
        self.lbl_count = HTML("<b>단어 수:</b> —")
        self.lbl_selected = HTML("<b>당신의 선택:</b> —")
        self.lbl_result   = HTML("")
        self.lbl_score    = HTML("<b>스코어:</b> 0 / 0 (정확도 0.0%)")

        # Container for the token buttons
        self.token_box = HBox()

        # Control buttons
        self.btn_reset    = Button(description="선택 초기화", layout=Layout(width="120px"))
        self.btn_reshuffle= Button(description="재셔플", layout=Layout(width="120px"))
        self.btn_hint     = Button(description="힌트 토글", layout=Layout(width="120px"))
        self.btn_check    = Button(description="정답 확인", button_style="primary", layout=Layout(width="120px"))
        self.btn_next     = Button(description="다음 라운드 ▶", layout=Layout(width="140px"))

        self.btn_reset.on_click(self.on_reset)
        self.btn_reshuffle.on_click(self.on_reshuffle)
        self.btn_hint.on_click(self.on_hint)
        self.btn_check.on_click(self.on_check)
        self.btn_next.on_click(self.on_next)

        self.box = VBox([
            HBox([self.lbl_round, self.lbl_count]),
            self.lbl_hint,
            self.token_box,
            self.lbl_selected,
            HBox([self.btn_reset, self.btn_reshuffle, self.btn_hint, self.btn_check, self.btn_next]),
            self.lbl_result,
            self.lbl_score
        ])

        # Per-round state
        self.word = None         # target word of the current round
        self.sent_en = None      # generated English sentence
        self.sent_ko = None      # translation of sent_en shown as the hint
        self.tokens = []         # words of sent_en in the correct order
        self.shuffled = []       # the same words in display order
        self.selected_idx = []   # indices into shuffled, in click order
        self.correct = None      # None = not graded yet, else the last verdict

        self.new_round()

    # --- internal helpers ---
    @staticmethod
    def _esc(text) -> str:
        """Escape *text* for safe interpolation into an HTML widget value."""
        # Sentences and translations are model output and must not be
        # rendered as markup.
        import html
        return html.escape(str(text))

    def _shuffle_tokens(self):
        """Shuffle ``self.shuffled`` in place, avoiding the solved order if possible."""
        # Bounded retries: when every token is identical, no other order exists.
        for _ in range(10):
            random.shuffle(self.shuffled)
            if self.shuffled != self.tokens:
                return

    def _pick_sentence(self):
        """Return ``(word, sentence, tokens)`` for a new round."""
        if len(self.words) > 0:
            for _ in range(50):  # retry until the sentence has a suitable length
                w = random.choice(self.words)
                s = generate_sentence_with_word(w, self.gen)
                toks = tokenize_words(s)
                if 3 <= len(toks) <= MAX_WORDS:
                    return w, s, toks
        # No words to draw from, or no suitable sentence after all retries.
        return "me", "This is me.", ["This", "is", "me"]

    # --- internal rendering ---
    def _render_tokens(self):
        """Rebuild the token buttons from ``shuffled`` and ``selected_idx``."""
        buttons = []
        for i, tok in enumerate(self.shuffled):
            b = Button(description=tok, layout=Layout(width="auto"))
            b.disabled = (i in self.selected_idx) or (self.correct is True)
            # idx=i binds the current index; a plain closure would see the last i.
            def _on_click(btn, idx=i):
                if idx not in self.selected_idx and not self.correct:
                    self.selected_idx.append(idx)
                    self._update_selected_label()
                    self._render_tokens()
            b.on_click(_on_click)
            buttons.append(b)
        self.token_box.children = tuple(buttons)

    def _update_selected_label(self):
        """Show the words picked so far, in click order."""
        chosen = [self.shuffled[i] for i in self.selected_idx]
        shown = self._esc(" ".join(chosen)) if chosen else "—"
        self.lbl_selected.value = "<b>당신의 선택:</b> " + shown

    def _update_top(self):
        """Refresh the round, word-count, hint and score labels."""
        self.lbl_round.value = f"<b>라운드:</b> {self.round_id}"
        self.lbl_count.value = f"<b>단어 수:</b> {len(self.tokens)}"
        hint_html = f"<b>한국어 힌트:</b> {self._esc(self.sent_ko)}"
        if self.hint_on and self.tokens:
            hint_html += f" &nbsp;&nbsp; <i>(첫 단어 힌트: <b>{self._esc(self.tokens[0])}</b>)</i>"
        self.lbl_hint.value = hint_html

        acc = (self.score["correct"]/self.score["total"]*100) if self.score["total"] else 0.0
        self.lbl_score.value = f"<b>스코어:</b> {self.score['correct']} / {self.score['total']} (정확도 {acc:.1f}%)"

    # --- rounds ---
    def new_round(self):
        """Start a new round: pick a sentence, translate it, shuffle and render."""
        w, s, toks = self._pick_sentence()

        self.round_id += 1
        self.word = w
        self.sent_en = s
        self.sent_ko = translate_en2ko(s, self.tr)
        self.tokens = toks
        self.shuffled = toks[:]
        self._shuffle_tokens()
        self.selected_idx = []
        self.correct = None
        self.lbl_result.value = ""

        self._render_tokens()
        self._update_selected_label()
        self._update_top()

    # --- control handlers ---
    def on_reset(self, _):
        """Clear the current selection so the player can try again."""
        if self.correct is True:
            # A solved round stays locked; re-solving it would inflate the score.
            return
        self.selected_idx = []
        self.correct = None
        self.lbl_result.value = ""
        self._render_tokens()
        self._update_selected_label()

    def on_reshuffle(self, _):
        """Reshuffle the tokens and clear the current selection."""
        if self.correct is True:
            # A solved round stays locked; re-solving it would inflate the score.
            return
        self._shuffle_tokens()
        self.selected_idx = []
        self.correct = None
        self.lbl_result.value = ""
        self._render_tokens()
        self._update_selected_label()

    def on_hint(self, _):
        """Toggle the first-word hint."""
        self.hint_on = not self.hint_on
        self._update_top()

    def on_check(self, _):
        """Grade the current selection once and update the score."""
        if self.correct is not None:
            # Already graded; repeated clicks must not be counted again.
            return
        if len(self.selected_idx) != len(self.tokens):
            self.lbl_result.value = "<span style='color:#d97706'>모든 단어를 순서대로 선택해 주세요!</span>"
            return
        pred = [self.shuffled[i] for i in self.selected_idx]
        gold = self.tokens
        ok = ([p.lower() for p in pred] == [g.lower() for g in gold])
        self.correct = ok
        self.score["total"] += 1
        original = self._esc(self.sent_en)
        if ok:
            self.score["correct"] += 1
            self.lbl_result.value = f"<span style='color:#059669'>✅ 정답! 원문: {original}</span>"
        else:
            self.lbl_result.value = f"<span style='color:#dc2626'>❌ 오답! 원문: {original}</span>"
        self._update_top()
        self._render_tokens()

    def on_next(self, _):
        """Move on to the next round."""
        self.new_round()

# Create the game from the loaded word list and pipelines, then render its root widget.
game = MemoryOrderGame(WORDS, gen, en2ko)
display(game.box)


VBox(children=(HBox(children=(HTML(value='<b>라운드:</b> 1'), HTML(value='<b>단어 수:</b> 4'))), HTML(value='<b>한국어 …