In [2]:
import sys
import os
sys.path.append(os.path.abspath("..")) 

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String, DateTime, func
from sqlalchemy.orm import declarative_base
from sqlalchemy import inspect
from sqlalchemy import text
from database import engine

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from database import Base, SessionLocal, engine, ensure_views_from_files, init_db
from main.sql import load_dict, count_overlap_word

In [3]:
# Project helpers imported from `database`; presumably they create the tables
# and the SQL views the rest of the notebook queries — TODO confirm in that module.
init_db()
ensure_views_from_files()

# NOTE(review): destructive — this drops the TranslationLog table (if it exists)
# every time the cell runs. Confirm that is intended before running against
# a database whose log should be kept.
with engine.begin() as conn:
    conn.execute(text("DROP TABLE IF EXISTS TranslationLog"))

## Loading Database/Identifying Low Count Categories

In [4]:
def get_category_counts() -> pd.DataFrame:
    """Count the distinct words in every category of WordDict.

    Returns:
        DataFrame with columns ``word_category`` and ``word_count``, ordered
        from the smallest category to the largest.
    """
    sql = text(
        """
    SELECT word_category, COUNT(DISTINCT word_id) as word_count
    FROM WordDict
    GROUP BY word_category
    ORDER BY word_count ASC
    """
    )
    with engine.connect() as connection:
        return pd.read_sql(sql, connection)

def identify_low_count_categories(threshold: int = 5) -> pd.DataFrame:
    """Return the categories whose word count is at or below ``threshold``.

    Such sparse categories are likely the result of misclassification. A
    one-line summary of how many were found is printed as a side effect.

    Args:
        threshold: Largest word count (inclusive) that still counts as "low".

    Returns:
        The matching rows of ``get_category_counts()``.
    """
    counts = get_category_counts()
    is_sparse = counts['word_count'] <= threshold
    sparse = counts[is_sparse]
    print(f"Found {len(sparse)} categories with {threshold} or fewer words:")
    return sparse

def get_words_in_category(category: str) -> pd.DataFrame:
    """Fetch every WordDict row filed under ``category`` so it can be reviewed.

    Args:
        category: Exact ``word_category`` value to match (passed as a bound
            parameter, not interpolated into the SQL).

    Returns:
        DataFrame with the word, its metadata and its example sentence columns.
    """
    sql = text(
        """
    SELECT word_id, word, word_category, word_rarity, pinyin, meaning, sentence, sentence_pinyin, sentence_meaning
    FROM WordDict
    WHERE word_category = :category
    """
    )
    bind_values = {"category": category}
    with engine.connect() as connection:
        return pd.read_sql(sql, connection, params=bind_values)

def update_word_category(word_id: str, new_category: str) -> bool:
    """Set the category of a single WordDict row, identified by its word_id.

    Args:
        word_id: Value of the row's ``word_id`` column. The IDs used in this
            notebook are strings such as 'D002631'.
        new_category: Category name to store in ``word_category``.

    Returns:
        True if the update ran and matched a row; False if no row has that ID
        or the statement raised (the error is printed, not re-raised).
    """
    query = """
    UPDATE WordDict
    SET word_category = :new_category
    WHERE word_id = :word_id
    """
    try:
        with engine.begin() as conn:
            result = conn.execute(text(query), {"new_category": new_category, "word_id": word_id})
            # Read rowcount while the transaction is still open.
            affected = result.rowcount
    except Exception as e:
        print(f"Error updating word ID {word_id}: {e}")
        return False

    # An UPDATE that matches nothing is not an error for the database, so it
    # must be detected explicitly; otherwise a mistyped ID is reported as a
    # success. Only an exact 0 is treated as "not found" because some drivers
    # report -1 when the count is unavailable.
    if affected == 0:
        print(f"No word found with ID {word_id}; nothing updated")
        return False

    print(f"Updated word ID {word_id} to category '{new_category}'")
    return True

def batch_update_category(old_category: str, new_category: str) -> int:
    """Move every word in ``old_category`` to ``new_category``.

    Args:
        old_category: Exact ``word_category`` value of the rows to move.
        new_category: Category name to assign to those rows.

    Returns:
        Number of rows the UPDATE affected, or 0 if the statement raised
        (the error is printed, not re-raised).
    """
    # The filter must use `word_category`, the column every other query in
    # this notebook reads and the one this statement sets. Filtering on a
    # `category` column made the statement fail, and the failure was swallowed
    # by the except branch below, so nothing was ever moved.
    query = """
    UPDATE WordDict
    SET word_category = :new_category
    WHERE word_category = :old_category
    """
    try:
        with engine.begin() as conn:
            result = conn.execute(text(query), {"new_category": new_category, "old_category": old_category})
            affected = result.rowcount
        print(f"Moved {affected} words from '{old_category}' to '{new_category}'")
        return affected
    except Exception as e:
        print(f"Error batch updating category: {e}")
        return 0

def batch_update_category_by_ids(word_ids: list, new_category: str) -> int:
    """Assign ``new_category`` to every word whose ID is in ``word_ids``.

    Args:
        word_ids: IDs of the WordDict rows to update. An empty list is a
            no-op that returns 0 without touching the database.
        new_category: Category name to store in ``word_category``.

    Returns:
        Number of rows the UPDATE affected, or 0 if nothing was supplied or
        the statement raised (the error is printed, not re-raised).
    """
    if not word_ids:
        print("No word IDs provided.")
        return 0

    # One named bind parameter per ID (:id_0, :id_1, ...) so the values are
    # never interpolated into the SQL text itself.
    bind_names = [f'id_{position}' for position in range(len(word_ids))]
    placeholders = ', '.join(f':{name}' for name in bind_names)
    query = f"""
    UPDATE WordDict
    SET word_category = :new_category
    WHERE word_id IN ({placeholders})
    """

    params = {"new_category": new_category, **dict(zip(bind_names, word_ids))}

    try:
        with engine.begin() as conn:
            affected = conn.execute(text(query), params).rowcount
    except Exception as e:
        print(f"Error batch updating by IDs: {e}")
        return 0

    print(f"Updated {affected} words to category '{new_category}'")
    print(f"Word IDs: {word_ids}")
    return affected

def get_all_categories() -> list:
    """Return every distinct ``word_category`` in WordDict, sorted by name."""
    sql = text("SELECT DISTINCT word_category FROM WordDict ORDER BY word_category")
    categories = []
    with engine.connect() as connection:
        # Consume the result while the connection is still open.
        for record in connection.execute(sql):
            categories.append(record[0])
    return categories

## Review Low Count Categories

In [12]:
# List the categories that hold 5 words or fewer; these are the candidates
# for merging into a larger category or for reclassification.
low_count_df = identify_low_count_categories(threshold=5)
display(low_count_df)

Found 32 categories with 5 or fewer words:


Unnamed: 0,word_category,word_count
0,Ability,2
1,Agriculture,2
2,Classroom,2
3,Culture,2
4,Hospitality,2
5,Identity,2
6,Lifestyle,2
7,Literary,2
8,Measurement,2
9,Memory,2


In [6]:
# Visualize category distribution.
# get_category_counts() returns rows ordered by word_count ascending.
category_counts = get_category_counts()
fig = px.bar(category_counts, x='word_category', y='word_count', 
             title='Word Count by Category',
             labels={'word_count': 'Number of Words', 'word_category': 'Category'})
# Category names are long; tilt the tick labels so they stay readable.
fig.update_layout(xaxis_tickangle=-45)
fig.show()

## Reclassify Words

Use the cells below to review and reclassify words from low-count categories.

In [7]:
# category_counts is sorted ascending by word_count, so the tail holds the
# 50 categories with the most words.
category_counts.tail(50)

Unnamed: 0,word_category,word_count
57,Contrast,11
58,Idiom,11
59,Information,11
60,Science,11
61,Experience,12
62,Change,13
63,Expression,13
64,Music,13
65,Question,13
66,Behavior,14


In [56]:
# Re-run the low-count check (same 5-word threshold as before) to see what
# remains. NOTE(review): this looks like a repeat of the earlier cell run
# after category updates were applied; confirm and consider removing one.
low_count_df = identify_low_count_categories(threshold=5)
display(low_count_df)

Found 23 categories with 5 or fewer words:


Unnamed: 0,word_category,word_count
0,Ability,2
1,Culture,2
2,Hospitality,2
3,Identity,2
4,Memory,2
5,Misc,2
6,Mathematics,3
7,Mediation,3
8,Preposition,3
9,Belief,4


In [57]:
# Review the words in one specific low-count category.
# 'Identity' is used here; swap in any category name from low_count_df.
words_to_review = get_words_in_category('Identity')
display(words_to_review)

Unnamed: 0,word_id,word,word_category,word_rarity,pinyin,meaning,sentence,sentence_pinyin,sentence_meaning
0,D002200,身份,Identity,Common,shēnfèn,"Identity or status of a person, often related ...",请出示您的身份证明。,Qǐng chūshì nín de shēnfèn zhèngmíng.,Please show your identification.
1,D003145,混血儿,Identity,Common,hùn xuè ér,A person of mixed racial or ethnic heritage,她是一个混血儿，拥有中英文化的背景。,"Tā shì yī gè hùn xuè ér, yōngyǒu zhōng-yīng wé...","She is of mixed race, with both Chinese and Br..."


In [59]:
from main.utils import get_completion

In [58]:
# Inspect the words currently filed under the 'General' category.
get_words_in_category('General')

Unnamed: 0,word_id,word,word_category,word_rarity,pinyin,meaning,sentence,sentence_pinyin,sentence_meaning
0,D000012,大概,General,Common,da4 gai4,Approximately,他大概已经出发了,Tā dàgài yǐjīng chūfā le.,He probably already left
1,D000039,肯定,General,Common,ken3 ding4,Sure/Certain,你肯定能完成这个任务,Nǐ kěndìng néng wánchéng zhège rènwù,You will definitely be able to complete this t...
2,D000061,需要,General,Common,xu1 yao4,Need (As opposed to 要 which is just want),你需要帮助吗？,Nǐ xūyào bāngzhù ma?,Do you need help?
3,D000076,并,General,Common,bìng,And / also,他并不喜欢吃水果。,Tā bìng bù xǐhuān chī shuǐguǒ.,He does not like to eat fruit either.
4,D000130,超市,General,Common,chāo shì,Supermarket,我们去超市买一些水果和蔬菜。,Wǒmen qù chāoshì mǎi yīxiē shuǐguǒ hé shūcài.,We are going to the supermarket to buy some fr...
...,...,...,...,...,...,...,...,...,...
161,D003199,当了,General,Common,dāng le,To have taken on a role or position,他当了班长。,Tā dāngle bānzhǎng.,He became the class monitor.
162,D003210,局面,General,Common,jú miàn,"Situation; the current state of affairs, espec...",在这种局面下，我们需要冷静处理。,"Zài zhè zhǒng júmiàn xià, wǒmen xūyào lěngjìng...","In this situation, we need to handle it calmly."
163,D003211,局面,General,Common,jú miàn,"Aspect or phase of a situation, indicating a s...",他努力改善公司的局面。,Tā nǔlì gǎishàn gōngsī de júmiàn.,He strives to improve the company's situation.
164,D003244,公共,General,Common,gōng gòng,"Public or communal, referring to something tha...",公共交通是城市发展的关键。,Gōnggòng jiāotōng shì chéngshì fāzhǎn de guānj...,Public transportation is key to urban developm...


In [55]:
# NOTE: the calls in this cell write to the database.

# Example: update a single word's category (word IDs are strings like 'D002631')
#update_word_category(word_id='D002631', new_category='General')

# Example: move all words from one category to another
# batch_update_category(old_category='misspelled_category', new_category='correct_category')

# Update several words at once by their IDs; returns the number of rows affected
batch_update_category_by_ids(word_ids=['D002717', 'D002760', 'D003187'], new_category='Work')

Updated 3 words to category 'Work'
Word IDs: ['D002717', 'D002760', 'D003187']


3

In [19]:
# List all available categories (alphabetical) as a reference when choosing
# a target category for reclassification.
all_categories = get_all_categories()
print("Available categories:")
for cat in all_categories:
    print(f"  - {cat}")

Available categories:
  - Ability
  - Action
  - Activity
  - Agriculture
  - Animal
  - Appliance
  - Archaeology
  - Architecture
  - Art
  - Attention
  - Behavior
  - Belief
  - Body
  - Business
  - Change
  - Character
  - Classroom
  - Clothing
  - Color
  - Commitment
  - Communication
  - Commute
  - Concept
  - Contrast
  - Cooperation
  - Culture
  - Daily Life
  - Defense
  - Degree
  - Description
  - Direction
  - Economics
  - Education
  - Emotion
  - Entertainment
  - Etiquette
  - Event
  - Expectation
  - Experience
  - Expression
  - Fashion
  - Food
  - Frequency
  - Furniture
  - General
  - Geography
  - Grammar
  - Health
  - Hobby
  - Holidays
  - Hospitality
  - Identity
  - Idiom
  - Impact
  - Information
  - Intensity
  - Intent
  - Interaction
  - Investigation
  - Knowledge
  - Language
  - Law
  - Lifestyle
  - Literary
  - Literature
  - Location
  - Material
  - Mathematics
  - Measure Word
  - Measurement
  - Mediation
  - Memory
  - Military
  - Musi

## Reclassifying Category

In [63]:
from typing import List
from main.utils import get_completion, parse_response_table

In [None]:
def reclassify_words(
        words: List[str],
        categories: List[str],
        model: str = "gpt-4o-mini",
        temperature: float = 0
    ) -> pd.DataFrame:
    """
    Reclassify words into categories using GPT.

    Builds a single prompt listing the words and the candidate categories,
    sends it through ``get_completion`` and parses the table in the reply
    with ``parse_response_table``.

    Args:
        words: List of words to classify
        categories: List of available categories (the model is allowed to
            suggest new names when none fits)
        model: OpenAI model to use
        temperature: Temperature for the model

    Returns:
        DataFrame with columns: Word, Category. Empty (same columns) when
        ``words`` is empty.
    """
    # Nothing to classify: skip the model call, which would cost a request
    # and return a table with no useful rows.
    if not words:
        return pd.DataFrame(columns=["Word", "Category"])

    words_str = "\n".join(f"- {word}" for word in words)
    categories_str = "\n".join(f"- {cat}" for cat in categories)
    
    prompt = f"""You are a language classification assistant. Given a list of words and a list of categories, 
assign each word to the most appropriate category.

Rules:
1. Each word must be assigned exactly one category
2. If a word fits well into an existing category, use that category
3. If no existing category is a good fit, you may suggest a new category name
4. Keep the original category if it's still the best fit

Words to classify:
{words_str}

Available categories:
{categories_str}

Return the results as a table with columns: Word | Category
Use | as the delimiter. Include a header row.
Do not include any explanation, just the table."""

    response = get_completion(prompt, model=model, temperature=temperature)
    content = response.choices[0].message.content

    # Parse the pipe-delimited reply into a DataFrame
    return parse_response_table(content)


In [69]:
# Load the dictionary through the project helper; its 'Word Category' column
# supplies the candidate category list in the reclassification call below.
df = load_dict()

In [66]:
# Ask the model to reclassify each distinct word currently in 'General',
# offering the distinct categories from the loaded dictionary as options.
# NOTE: this sends a request to the model each time the cell runs.
reclassed = reclassify_words(
    words=get_words_in_category('General')['word'].drop_duplicates().tolist(),
    categories=df['Word Category'].drop_duplicates().tolist()
)

In [71]:
# How many words the model assigned to each category.
reclassed['Category'].value_counts()

Category
Concept         28
Action          23
General         19
Opinion         17
Quantity         8
Emotion          7
Ability          5
Situation        3
Relationship     3
Preposition      3
Permission       3
Information      3
Grammar          2
Question         2
Literature       2
Behavior         2
Necessity        2
Organization     1
Example          1
Opportunity      1
Knowledge        1
Measure Word     1
Relatives        1
Sports           1
Economics        1
Activity         1
Achievement      1
Time             1
Description      1
Direction        1
Frequency        1
Goal             1
Place            1
Value            1
Name: count, dtype: int64

In [73]:
# Spot-check a single proposed category: the words the model labelled 'Measure Word'.
reclassed.loc[reclassed.Category == 'Measure Word']

Unnamed: 0,Word,Category
106,速,Measure Word
