In [1]:
import sys
import os
sys.path.append(os.path.abspath("..")) 

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String, DateTime, func
from sqlalchemy.orm import declarative_base
from sqlalchemy import inspect
from sqlalchemy import text
from database import engine

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from database import Base, SessionLocal, engine, ensure_views_from_files, init_db
from main.sql import load_dict, count_overlap_word
from main.utils import get_completion

In [2]:
# Project helpers imported from `database`. Presumably these create the schema
# and (re)build the SQL views from files — TODO confirm in database module.
init_db()
ensure_views_from_files()

# Drop the TranslationLog table if it is present, inside an engine.begin()
# transaction scope.
# NOTE(review): destructive and executed on every Restart & Run All — confirm
# the table's contents are meant to be discarded by this notebook.
with engine.begin() as conn:
    conn.execute(text("DROP TABLE IF EXISTS TranslationLog"))

## Loading Database/Identifying Low Count Categories

In [3]:
def get_category_counts() -> pd.DataFrame:
    """Count the distinct words filed under each category in WordDict.

    Returns:
        DataFrame with columns ``word_category`` and ``word_count`` (number of
        distinct ``word_id`` values), ordered from the smallest count upwards.
    """
    sql = """
    SELECT word_category, COUNT(DISTINCT word_id) as word_count
    FROM WordDict
    GROUP BY word_category
    ORDER BY word_count ASC
    """
    # The frame is fully materialised before the connection scope closes.
    with engine.connect() as connection:
        return pd.read_sql(text(sql), connection)

def identify_low_count_categories(threshold: int = 5) -> pd.DataFrame:
    """Return the categories whose word count is at or below ``threshold``.

    Sparse categories are likely the result of misclassification. A one-line
    summary of how many were found is printed as a side effect.

    Args:
        threshold: Inclusive upper bound on ``word_count``.

    Returns:
        The matching rows of ``get_category_counts()`` (original index kept).
    """
    counts = get_category_counts()
    is_sparse = counts['word_count'].le(threshold)
    sparse = counts.loc[is_sparse]
    print(f"Found {len(sparse)} categories with {threshold} or fewer words:")
    return sparse

def get_words_in_category(category: str) -> pd.DataFrame:
    """Fetch every WordDict row filed under ``category`` so it can be reviewed.

    Args:
        category: Exact ``word_category`` value to match. It is sent as a bound
            query parameter rather than interpolated into the SQL text.

    Returns:
        DataFrame holding the columns named in the SELECT below, one row per
        matching entry.
    """
    sql = """
    SELECT word_id, word, word_category, word_rarity, pinyin, meaning, sentence, sentence_pinyin, sentence_meaning
    FROM WordDict
    WHERE word_category = :category
    """
    bound = {"category": category}
    with engine.connect() as connection:
        return pd.read_sql(text(sql), connection, params=bound)

def update_word_category(word_id: str, new_category: str) -> bool:
    """Update a single word's category by its ID.

    Args:
        word_id: Value of ``WordDict.word_id`` to update. The IDs used in this
            notebook are strings such as 'D002190'.
        new_category: Category name to store in ``word_category``.

    Returns:
        True when at least one row was updated. False when the statement
        failed or when no row carries ``word_id`` (previously a non-existent
        ID was still reported as a successful update).
    """
    query = """
    UPDATE WordDict
    SET word_category = :new_category
    WHERE word_id = :word_id
    """
    try:
        with engine.begin() as conn:
            result = conn.execute(text(query), {"new_category": new_category, "word_id": word_id})
            # Read the count while the result is still attached to the connection.
            matched = result.rowcount
    except Exception as e:
        # Best-effort helper for interactive use: report and signal failure.
        print(f"Error updating word ID {word_id}: {e}")
        return False

    # Only an explicit 0 is treated as "not found"; a driver reporting an
    # unknown count (-1) is still treated as success, as before.
    if matched == 0:
        print(f"No word found with ID {word_id}; nothing updated")
        return False

    print(f"Updated word ID {word_id} to category '{new_category}'")
    return True

def batch_update_category(old_category: str, new_category: str) -> int:
    """Move all words from one category to another.

    Args:
        old_category: Current ``word_category`` value of the rows to move.
        new_category: Category name to assign to those rows.

    Returns:
        Number of affected rows, or 0 if the statement failed (the error is
        printed rather than raised).
    """
    # The filter column is word_category, the same column every other query in
    # this notebook uses. Filtering on `category` made the statement fail, and
    # the except branch below turned that failure into a silent return of 0.
    query = """
    UPDATE WordDict
    SET word_category = :new_category
    WHERE word_category = :old_category
    """
    try:
        with engine.begin() as conn:
            result = conn.execute(text(query), {"new_category": new_category, "old_category": old_category})
            affected = result.rowcount
        print(f"Moved {affected} words from '{old_category}' to '{new_category}'")
        return affected
    except Exception as e:
        print(f"Error batch updating category: {e}")
        return 0

def batch_update_category_by_ids(word_ids: list, new_category: str) -> int:
    """Assign ``new_category`` to every word whose ID appears in ``word_ids``.

    Args:
        word_ids: ``word_id`` values to update. An empty list is a no-op.
        new_category: Category name to store in ``word_category``.

    Returns:
        Number of affected rows; 0 when no IDs were given or the statement
        failed (the error is printed rather than raised).
    """
    if not word_ids:
        print("No word IDs provided.")
        return 0

    # One named bind parameter per ID (id_0, id_1, ...) for the IN clause.
    names = [f'id_{pos}' for pos in range(len(word_ids))]
    placeholders = ', '.join(f':{name}' for name in names)
    query = f"""
    UPDATE WordDict
    SET word_category = :new_category
    WHERE word_id IN ({placeholders})
    """

    # Bind values: the target category plus each ID under its placeholder name.
    params = {"new_category": new_category, **dict(zip(names, word_ids))}

    try:
        with engine.begin() as conn:
            affected = conn.execute(text(query), params).rowcount
        print(f"Updated {affected} words to category '{new_category}'")
        print(f"Word IDs: {word_ids}")
        return affected
    except Exception as e:
        print(f"Error batch updating by IDs: {e}")
        return 0

def get_all_categories() -> list:
    """List every distinct ``word_category`` value in WordDict, sorted by name.

    Returns:
        The category names in the order the database returns them for
        ``ORDER BY word_category``.
    """
    query = "SELECT DISTINCT word_category FROM WordDict ORDER BY word_category"
    categories = []
    # Rows are consumed while the connection is still open.
    with engine.connect() as conn:
        for row in conn.execute(text(query)):
            categories.append(row[0])
    return categories

## Review Low Count Categories

In [4]:
# Display categories with low word counts (5 or fewer distinct words);
# these are the candidates for manual review and reclassification below.
low_count_df = identify_low_count_categories(threshold=5)
display(low_count_df)

Found 32 categories with 5 or fewer words:


Unnamed: 0,word_category,word_count
0,Astronomy,1
1,Comparison,1
2,Family,1
3,Finance,1
4,Foundation,1
5,Measurement,1
6,Media,1
7,Observation,1
8,Utility,1
9,Environment,2


In [5]:
# Visualize category distribution: one bar per category, height = word count.
# get_category_counts() returns rows sorted by word_count ascending, so the
# bars are expected to appear smallest-first (plotly keeps data order by
# default — confirm if the axis ordering matters).
category_counts = get_category_counts()
fig = px.bar(category_counts, x='word_category', y='word_count', 
             title='Word Count by Category',
             labels={'word_count': 'Number of Words', 'word_category': 'Category'})
# Rotate the category tick labels so the many names stay readable.
fig.update_layout(xaxis_tickangle=-45)
fig.show()

## Reclassify Words

Use the cells below to review and reclassify words from low-count categories.

In [6]:
# category_counts is sorted by word_count ascending, so the last 50 rows are
# the 50 categories with the most words.
category_counts.tail(50)

Unnamed: 0,word_category,word_count
67,Information,13
68,Progress,13
69,Question,13
70,Experience,14
71,Music,14
72,Entertainment,15
73,Expression,15
74,Utensil,15
75,Intensity,16
76,Law,16


In [7]:
# Display categories with low word counts (5 or fewer distinct words).
# NOTE(review): this repeats the earlier low-count cell verbatim — presumably
# kept to re-check the counts after reclassifying; confirm or remove.
low_count_df = identify_low_count_categories(threshold=5)
display(low_count_df)

Found 32 categories with 5 or fewer words:


Unnamed: 0,word_category,word_count
0,Astronomy,1
1,Comparison,1
2,Family,1
3,Finance,1
4,Foundation,1
5,Measurement,1
6,Media,1
7,Observation,1
8,Utility,1
9,Environment,2


In [23]:
# Example: Review words in a specific low-count category
# Replace 'Mediation' with any other category name from low_count_df
words_to_review = get_words_in_category('Mediation')
display(words_to_review)

Unnamed: 0,word_id,word,word_category,word_rarity,pinyin,meaning,sentence,sentence_pinyin,sentence_meaning
0,D002190,调停,Mediation,Rare,tiáotíng,To mediate or reconcile; to act as a go-betwee...,他被选为调停双方争端的代表。,Tā bèi xuǎn wèi tiáotíng shuāngfāng zhēngduān ...,He was chosen as the mediator for the dispute ...
1,D002447,劝阻,Mediation,Rare,quàn zǔ,To dissuade or discourage someone from doing s...,我试图劝阻他不要做傻事。,Wǒ shìtú quànzǔ tā bùyào zuò shǎshì.,I tried to dissuade him from doing something f...
2,D002448,劝阻,Mediation,Rare,quàn zǔ,To prevent someone from acting by giving reaso...,老师的劝阻让学生重新考虑他们的计划。,Lǎoshī de quànzǔ ràng xuéshēng chóngxīn kǎolǜ ...,The teacher's dissuasion made the students rec...


In [20]:
# Example: Update a single word's category
#update_word_category(word_id='D002631', new_category='General')

# Example: Move all words from one category to another
# batch_update_category(old_category='misspelled_category', new_category='correct_category')

# Example: Update multiple words by their IDs
# NOTE(review): this call is live — it writes to WordDict every time the cell
# runs. Edit the ID list and target category before re-running.
batch_update_category_by_ids(word_ids=['D003450'], new_category='Science')

Updated 1 words to category 'Science'
Word IDs: ['D003450']


1

In [21]:
# List all available categories for reference (distinct word_category values
# from WordDict, sorted by name), one per line.
all_categories = get_all_categories()
print("Available categories:")
for cat in all_categories:
    print(f"  - {cat}")

Available categories:
  - Ability
  - Achievement
  - Action
  - Activity
  - Agriculture
  - Animal
  - Appliance
  - Architecture
  - Art
  - Behavior
  - Belief
  - Body
  - Business
  - Change
  - Character
  - Clothing
  - Color
  - Commitment
  - Communication
  - Commute
  - Comparison
  - Concept
  - Condition
  - Conflict
  - Connection
  - Contrast
  - Cooking
  - Culture
  - Daily Life
  - Defense
  - Degree
  - Description
  - Direction
  - Economics
  - Education
  - Emotion
  - Entertainment
  - Environment
  - Etiquette
  - Event
  - Expectation
  - Experience
  - Expression
  - Family
  - Fashion
  - Finance
  - Food
  - Foundation
  - Frequency
  - Furniture
  - General
  - Geography
  - Grammar
  - Health
  - History
  - Hobby
  - Holidays
  - Hospitality
  - Identity
  - Idiom
  - Impact
  - Information
  - Intensity
  - Intent
  - Knowledge
  - Language
  - Law
  - Literature
  - Location
  - Mathematics
  - Measure Word
  - Measurement
  - Media
  - Mediation
  - M

## Reclassifying Words with GPT

In [13]:
from typing import List
from main.utils import get_completion, parse_response_table

In [14]:
def reclassify_words(
        words: List[str], 
        categories: List[str],
        model: str = "gpt-4o-mini",
        temperature: float = 0
    ) -> pd.DataFrame:
    """
    Ask a GPT model to assign each word to its best-fitting category.

    The words and the candidate categories are rendered as "- item" bullet
    lists inside one prompt that requests a "Word | Category" table; the reply
    text is then handed to ``parse_response_table``.

    Args:
        words: Words to classify.
        categories: Category names offered to the model (the prompt also
            allows it to suggest a new name when none fits).
        model: Model name passed through to ``get_completion``.
        temperature: Temperature passed through to ``get_completion``.

    Returns:
        The result of ``parse_response_table`` for the reply — expected to be
        a DataFrame with columns: Word, Category.
    """
    def _as_bullets(items):
        # One "- item" line per entry, newline-separated.
        return "\n".join(f"- {item}" for item in items)

    words_str = _as_bullets(words)
    categories_str = _as_bullets(categories)

    prompt = f"""You are a language classification assistant. Given a list of words and a list of categories, 
assign each word to the most appropriate category.

Rules:
1. Each word must be assigned exactly one category
2. If a word fits well into an existing category, use that category
3. If no existing category is a good fit, you may suggest a new category name
4. Keep the original category if it's still the best fit

Words to classify:
{words_str}

Available categories:
{categories_str}

Return the results as a table with columns: Word | Category
Use | as the delimiter. Include a header row.
Do not include any explanation, just the table."""

    completion = get_completion(prompt, model=model, temperature=temperature)
    reply_text = completion.choices[0].message.content

    # Turn the pipe-delimited reply into a DataFrame
    return parse_response_table(reply_text)


In [15]:
# load_dict comes from main.sql; presumably it returns the full dictionary as
# a DataFrame — TODO confirm. The next cell reads its 'Word Category' column.
df = load_dict()

In [16]:
# Reclassify every distinct word currently filed under 'General', offering the
# distinct values of df['Word Category'] as candidate categories.
# NOTE(review): this calls the model through get_completion on every run with
# no caching, and the candidates come from load_dict() rather than
# get_all_categories() — confirm the two sources agree.
reclassed = reclassify_words(
    words=get_words_in_category('General')['word'].drop_duplicates().tolist(),
    categories=df['Word Category'].drop_duplicates().tolist()
)

In [17]:
# How many of the former 'General' words the model assigned to each category.
reclassed['Category'].value_counts()

Category
General         42
Action          22
Opinion         14
Concept         12
Quantity         9
Emotion          8
Ability          6
Degree           3
Preposition      3
Event            2
Relationship     2
Question         2
Necessity        2
Behavior         2
Grammar          2
Relatives        1
Time             1
Measure Word     1
Knowledge        1
Activity         1
Achievement      1
Direction        1
Description      1
Sports           1
Experience       1
Frequency        1
Goal             1
Perspective      1
Place            1
Value            1
Name: count, dtype: int64

In [18]:
# Spot-check one assignment: the word(s) the model labelled 'Measure Word'.
reclassed.loc[reclassed.Category == 'Measure Word']

Unnamed: 0,Word,Category
101,速,Measure Word
