In [2]:
import sys
import os
sys.path.append(os.path.abspath("..")) 

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String, DateTime, func
from sqlalchemy.orm import declarative_base
from sqlalchemy import inspect
from sqlalchemy import text
from database import engine

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from database import Base, SessionLocal, engine, ensure_views_from_files, init_db
from main.sql import load_dict, count_overlap_word

In [3]:
# Prepare the database before any query cell runs.
# NOTE(review): init_db / ensure_views_from_files come from the project's
# `database` module; presumably they create the schema and (re)create SQL views
# from files -- confirm there.
init_db()
ensure_views_from_files()

# WARNING: destructive. Every run of this cell drops the TranslationLog table
# (if it exists) together with its rows; engine.begin() commits on exit.
with engine.begin() as conn:
    conn.execute(text("DROP TABLE IF EXISTS TranslationLog"))

## Loading the Database and Identifying Low-Count Categories

In [14]:
def get_category_counts() -> pd.DataFrame:
    """Return the number of distinct words per category, smallest first.

    Returns:
        A DataFrame with the columns ``word_category`` and ``word_count``
        (distinct ``word_id`` values in WordDict), ordered by ascending count.
    """
    counts_sql = text("""
    SELECT word_category, COUNT(DISTINCT word_id) as word_count
    FROM WordDict
    GROUP BY word_category
    ORDER BY word_count ASC
    """)
    with engine.connect() as connection:
        return pd.read_sql(counts_sql, connection)

def identify_low_count_categories(threshold: int = 5) -> pd.DataFrame:
    """Select the categories whose word count is at or below ``threshold``.

    Such sparsely populated categories are candidates for misclassification.

    Args:
        threshold: Largest word count (inclusive) that still counts as "low".

    Returns:
        The matching rows of ``get_category_counts()``; a summary line is
        printed as a side effect.
    """
    category_counts = get_category_counts()
    is_low = category_counts['word_count'] <= threshold
    low_count_df = category_counts[is_low]
    n_low = len(low_count_df)
    print(f"Found {n_low} categories with {threshold} or fewer words:")
    return low_count_df

def get_words_in_category(category: str) -> pd.DataFrame:
    """Fetch every WordDict row assigned to ``category`` so it can be reviewed.

    Args:
        category: Exact ``word_category`` value to match (bound as a SQL parameter).

    Returns:
        A DataFrame with the word's id, text, category, rarity, pinyin, meaning
        and its example sentence (text, pinyin, meaning).
    """
    words_sql = text("""
    SELECT word_id, word, word_category, word_rarity, pinyin, meaning, sentence, sentence_pinyin, sentence_meaning
    FROM WordDict
    WHERE word_category = :category
    """)
    bind_values = {"category": category}
    with engine.connect() as connection:
        return pd.read_sql(words_sql, connection, params=bind_values)

def update_word_category(word_id: str, new_category: str) -> bool:
    """Update a single word's category by its ID.

    Args:
        word_id: Value of ``WordDict.word_id`` to update. IDs in this notebook
            are strings such as ``'D002654'``.
        new_category: Category name to store in ``word_category``.

    Returns:
        True if the statement ran and did not report zero matched rows.
        False if the database raised an error or no row has that ``word_id``.
    """
    query = """
    UPDATE WordDict
    SET word_category = :new_category
    WHERE word_id = :word_id
    """
    try:
        # engine.begin() commits on success and rolls back if execute() raises.
        with engine.begin() as conn:
            result = conn.execute(text(query), {"new_category": new_category, "word_id": word_id})
            matched = result.rowcount
    except Exception as e:
        print(f"Error updating word ID {word_id}: {e}")
        return False

    # Only an explicit 0 is treated as "not found"; some drivers report -1
    # when they cannot determine the row count.
    if matched == 0:
        print(f"No word found with ID {word_id}; nothing was updated")
        return False

    print(f"Updated word ID {word_id} to category '{new_category}'")
    return True

def batch_update_category(old_category: str, new_category: str) -> int:
    """Move all words from one category to another.

    Args:
        old_category: Current ``word_category`` value of the rows to move.
        new_category: Category name to assign to those rows.

    Returns:
        Number of affected rows as reported by the database, or 0 if the
        update failed (the error is printed, not raised).
    """
    # The filter must use word_category -- the column every other query in this
    # notebook reads and the one this statement sets.
    query = """
    UPDATE WordDict
    SET word_category = :new_category
    WHERE word_category = :old_category
    """
    try:
        # engine.begin() commits on success and rolls back if execute() raises.
        with engine.begin() as conn:
            result = conn.execute(text(query), {"new_category": new_category, "old_category": old_category})
            affected = result.rowcount
    except Exception as e:
        print(f"Error batch updating category: {e}")
        return 0

    print(f"Moved {affected} words from '{old_category}' to '{new_category}'")
    return affected

def get_all_categories() -> list:
    """Return every distinct ``word_category`` in WordDict, sorted by name.

    Returns:
        A list holding the first (only) column of each result row, in the
        order produced by the query's ORDER BY.
    """
    categories_sql = text("SELECT DISTINCT word_category FROM WordDict ORDER BY word_category")
    categories = []
    with engine.connect() as connection:
        # Consume the result while the connection is still open.
        for record in connection.execute(categories_sql):
            categories.append(record[0])
    return categories

## Review Low-Count Categories

In [8]:
# Display the categories holding 5 or fewer words (the comparison is inclusive).
# These are the candidates to review for misclassification in the cells below.
low_count_df = identify_low_count_categories(threshold=5)
display(low_count_df)

Found 46 categories with 5 or fewer words:


Unnamed: 0,word_category,word_count
0,Adaptation,1
1,Archaeology,1
2,Attention,1
3,Hospitality,1
4,Interaction,1
5,Investigation,1
6,Material,1
7,Observation,1
8,Opportunity,1
9,Plant,1


In [10]:
# Visualize the category distribution as a bar chart, one bar per category.
# get_category_counts() returns rows ordered by ascending word count.
category_counts = get_category_counts()
fig = px.bar(category_counts, x='word_category', y='word_count', 
             title='Word Count by Category',
             labels={'word_count': 'Number of Words', 'word_category': 'Category'})
# Tilt the category labels so the many category names stay legible.
fig.update_layout(xaxis_tickangle=-45)
fig.show()

## Reclassify Words

Use the cells below to review and reclassify words from low-count categories.

In [18]:
# Example: review every word stored under one category.
# 'Action' is a large category (294 rows in the saved output); to inspect a
# low-count category instead, substitute a name from low_count_df.
words_to_review = get_words_in_category('Action')
display(words_to_review)

Unnamed: 0,word_id,word,word_category,word_rarity,pinyin,meaning,sentence,sentence_pinyin,sentence_meaning
0,D000011,打算,Action,Common,da3 suan4,Plan to + (Obj) --> Need object,我打算明年去中国旅游,Wǒ dǎsuàn míngnián qù Zhōngguó lǚyóu,I plan to travel to China next year.
1,D000014,当,Action,Common,dāng,To be; to serve as,他想当一名老师。,Tā xiǎng dāng yī míng lǎoshī.,He wants to be a teacher.
2,D000015,当,Action,Common,dāng,To act as; to treat as,当我朋友对我很重要。,Dāng wǒ péngyǒu duì wǒ hěn zhòngyào.,It is important for me to treat him as a friend.
3,D000019,否定,Action,Common,fou3 ding4,To Negate To Deny To Reject,他否定了所有的指控,Tā fǒudìng le suǒyǒu de zhǐkòng.,He denied all the accusations.
4,D000049,收到,Action,Common,shou1 dao4,To receive,我收到了你的邮件。,Wǒ shōudào le nǐ de yóujiàn.,I received your email.
...,...,...,...,...,...,...,...,...,...
290,D003056,越过,Action,Common,yuè guò,To cross over; to overcome,我们需要越过这条河。,Wǒmen xūyào yuèguò zhè tiáo hé.,We need to cross over this river.
291,D003061,打破,Action,Common,dǎ pò,To break or shatter something,他不小心打破了杯子。,Tā bù xiǎoxīn dǎpò le bēizi.,He accidentally broke the cup.
292,D003066,驱散,Action,Common,qū sàn,"To disperse or scatter, typically used for int...",云层渐渐被风驱散了。,Yúncéng jiànjiàn bèi fēng qūsàn le.,The clouds were gradually dispersed by the wind.
293,D003067,丢弃,Action,Common,diū qì,To discard or abandon something.,他把旧衣服丢弃了。,Tā bǎ jiù yīfú diūqì le.,He discarded his old clothes.


In [16]:
# Example: Update a single word's category.
# NOTE: this writes to the database. word_id is the string ID shown in the
# review tables (e.g. 'D002654'), not an integer.
update_word_category(word_id='D002654', new_category='Action')

# Example: Move all words from one category to another
# (left commented out so that running the notebook does not bulk-edit rows).
# batch_update_category(old_category='misspelled_category', new_category='correct_category')

Updated word ID D002654 to category 'Action'


True

In [19]:
# List all available categories for reference when choosing a target
# category for reclassification; one category name is printed per line.
all_categories = get_all_categories()
print("Available categories:")
for cat in all_categories:
    print(f"  - {cat}")

Available categories:
  - Ability
  - Action
  - Activity
  - Agriculture
  - Animal
  - Appliance
  - Archaeology
  - Architecture
  - Art
  - Attention
  - Behavior
  - Belief
  - Body
  - Business
  - Change
  - Character
  - Classroom
  - Clothing
  - Color
  - Commitment
  - Communication
  - Commute
  - Concept
  - Contrast
  - Cooperation
  - Culture
  - Daily Life
  - Defense
  - Degree
  - Description
  - Direction
  - Economics
  - Education
  - Emotion
  - Entertainment
  - Etiquette
  - Event
  - Expectation
  - Experience
  - Expression
  - Fashion
  - Food
  - Frequency
  - Furniture
  - General
  - Geography
  - Grammar
  - Health
  - Hobby
  - Holidays
  - Hospitality
  - Identity
  - Idiom
  - Impact
  - Information
  - Intensity
  - Intent
  - Interaction
  - Investigation
  - Knowledge
  - Language
  - Law
  - Lifestyle
  - Literary
  - Literature
  - Location
  - Material
  - Mathematics
  - Measure Word
  - Measurement
  - Mediation
  - Memory
  - Military
  - Musi