In [None]:
import json
import csv
import re
from typing import Dict, List, Tuple

class WordChecker:
    """Check that each CSV word occurs in the JSON sentences stored under its word_id.

    The CSV file is read through ``csv.DictReader`` and must provide the
    columns ``word_id``, ``word``, ``synset_id``, ``pos`` and ``gloss``.
    The JSON file is looked up by ``word_id``; each value is iterated as a
    collection of sentences.
    """

    def __init__(self, json_file_path: str, csv_file_path: str):
        """
        Store the file paths; nothing is read from disk here.

        Args:
            json_file_path: Path to the JSON file.
            csv_file_path: Path to the CSV file.
        """
        self.json_file_path = json_file_path
        self.csv_file_path = csv_file_path
        self.json_data = {}
        self.csv_data = {}

    def load_json_data(self) -> Dict:
        """
        Load the JSON file into ``self.json_data``.

        Any failure is reported on stdout instead of being raised, in which
        case ``self.json_data`` keeps its previous value.

        Returns:
            The current content of ``self.json_data``.
        """
        try:
            with open(self.json_file_path, 'r', encoding='utf-8') as f:
                self.json_data = json.load(f)
            print(f"‚úì ƒê√£ t·∫£i th√†nh c√¥ng {len(self.json_data)} entries t·ª´ file JSON")
        except Exception as e:
            # Deliberate best-effort: run_check() detects the empty data and stops.
            print(f"‚úó L·ªói khi ƒë·ªçc file JSON: {e}")
        return self.json_data

    def load_csv_data(self) -> Dict:
        """
        Load the CSV file into ``self.csv_data``, keyed by ``word_id``.

        The mapping is built locally and only published once the whole file
        has been read, so a failed or repeated load never leaves partial or
        stale rows behind. Any failure is reported on stdout instead of being
        raised, in which case ``self.csv_data`` keeps its previous value.

        Returns:
            The current content of ``self.csv_data``.
        """
        loaded = {}
        try:
            # newline='' hands line-ending handling to the csv module, which is
            # required for quoted fields that contain embedded newlines.
            with open(self.csv_file_path, 'r', encoding='utf-8', newline='') as f:
                for row in csv.DictReader(f):
                    loaded[row['word_id']] = {
                        'word': row['word'],
                        'synset_id': row['synset_id'],
                        'pos': row['pos'],
                        'gloss': row['gloss'],
                    }
            self.csv_data = loaded
            print(f"‚úì ƒê√£ t·∫£i th√†nh c√¥ng {len(self.csv_data)} t·ª´ t·ª´ file CSV")
        except Exception as e:
            # Deliberate best-effort: run_check() detects the empty data and stops.
            print(f"‚úó L·ªói khi ƒë·ªçc file CSV: {e}")
        return self.csv_data

    def check_word_in_sentences(self, word: str, sentences: List[str]) -> Tuple[bool, List[str]]:
        """
        Find the sentences that contain ``word`` as a whole token.

        Matching is case-insensitive. The word must not be directly preceded
        or followed by a word character, so "cat" does not match inside
        "concatenate". Lookarounds are used instead of ``\\b`` because ``\\b``
        can never match next to a non-word character, which made words such
        as "c++" unfindable.

        Args:
            word: The word to look for. An empty or missing word is reported
                as not found.
            sentences: The sentences to search.

        Returns:
            A tuple ``(found, matching_sentences)`` where ``found`` is True
            when at least one sentence matched.
        """
        if not word:
            return False, []

        # Compiled once per word rather than looked up for every sentence.
        pattern = re.compile(r'(?<!\w)' + re.escape(word) + r'(?!\w)', re.IGNORECASE)
        found_sentences = [sentence for sentence in sentences if pattern.search(sentence)]

        return len(found_sentences) > 0, found_sentences

    def check_all_words(self) -> Dict:
        """
        Check every word of ``self.csv_data`` against its sentences in ``self.json_data``.

        Returns:
            A dictionary with three lists:
            ``found`` (word occurs in at least one of its sentences),
            ``not_found`` (word_id exists in the JSON but no sentence
            contains the word) and ``missing_word_id`` (word_id absent from
            the JSON).
        """
        results = {
            'found': [],
            'not_found': [],
            'missing_word_id': []
        }

        for word_id, word_info in self.csv_data.items():
            word = word_info['word']

            # A word_id without a JSON entry cannot be checked at all.
            if word_id not in self.json_data:
                results['missing_word_id'].append({
                    'word_id': word_id,
                    'word': word,
                    'reason': f'word_id {word_id} kh√¥ng t·ªìn t·∫°i trong file JSON'
                })
                continue

            sentences = self.json_data[word_id]
            # A bare string would otherwise be iterated character by
            # character; treat it as a single sentence.
            if isinstance(sentences, str):
                sentences = [sentences]

            found, found_sentences = self.check_word_in_sentences(word, sentences)

            if found:
                results['found'].append({
                    'word_id': word_id,
                    'word': word,
                    'total_sentences': len(sentences),
                    'sentences_with_word': len(found_sentences),
                    'found_sentences': found_sentences
                })
            else:
                results['not_found'].append({
                    'word_id': word_id,
                    'word': word,
                    'sentences': sentences
                })

        return results

    def print_summary(self, results: Dict):
        """
        Print the aggregate counts of a ``check_all_words`` result.

        Args:
            results: Dictionary with the ``found``, ``not_found`` and
                ``missing_word_id`` lists.
        """
        print("\n" + "="*60)
        print("                    T√ìM T·∫ÆT K·∫æT QU·∫¢")
        print("="*60)

        total_words = len(self.csv_data)
        found_count = len(results['found'])
        not_found_count = len(results['not_found'])
        missing_id_count = len(results['missing_word_id'])

        print(f"üìä T·ªïng s·ªë t·ª´ c·∫ßn ki·ªÉm tra: {total_words}")
        print(f"‚úÖ S·ªë t·ª´ t√¨m th·∫•y trong c√¢u: {found_count}")
        print(f"‚ùå S·ªë t·ª´ KH√îNG t√¨m th·∫•y trong c√¢u: {not_found_count}")
        print(f"‚ö†Ô∏è  S·ªë word_id kh√¥ng t·ªìn t·∫°i trong JSON: {missing_id_count}")
        # The rate is relative to the words that could be checked; max(..., 1)
        # avoids a division by zero when every word_id is missing.
        print(f"üìà T·ª∑ l·ªá t√¨m th·∫•y: {found_count/max(total_words-missing_id_count, 1)*100:.1f}%")

    def print_detailed_results(self, results: Dict, show_found: bool = True, show_not_found: bool = True):
        """
        Print the per-word details of a ``check_all_words`` result.

        The section about missing word_ids is printed whenever it is
        non-empty, independently of the two flags.

        Args:
            results: Dictionary with the ``found``, ``not_found`` and
                ``missing_word_id`` lists.
            show_found: Print the words that were found.
            show_not_found: Print the words that were not found.
        """

        if show_found and results['found']:
            print("\n" + "="*60)
            print("‚úÖ C√ÅC T·ª™ T√åM TH·∫§Y TRONG C√ÇU")
            print("="*60)
            for item in results['found']:
                print(f"\nüîç Word ID: {item['word_id']} | T·ª´: '{item['word']}'")
                print(f"   üìù T·ªïng s·ªë c√¢u: {item['total_sentences']}")
                print(f"   ‚ú® S·ªë c√¢u ch·ª©a t·ª´: {item['sentences_with_word']}")
                print("   üìÑ C√°c c√¢u ch·ª©a t·ª´:")
                for i, sentence in enumerate(item['found_sentences'], 1):
                    print(f"      {i}. {sentence}")

        if show_not_found and results['not_found']:
            print("\n" + "="*60)
            print("‚ùå C√ÅC T·ª™ KH√îNG T√åM TH·∫§Y TRONG C√ÇU")
            print("="*60)
            for item in results['not_found']:
                print(f"\nüîç Word ID: {item['word_id']} | T·ª´: '{item['word']}'")
                print("   üìÑ C√°c c√¢u trong JSON:")
                for i, sentence in enumerate(item['sentences'], 1):
                    print(f"      {i}. {sentence}")

        if results['missing_word_id']:
            print("\n" + "="*60)
            print("‚ö†Ô∏è  C√ÅC WORD_ID KH√îNG T·ªíN T·∫†I TRONG JSON")
            print("="*60)
            for item in results['missing_word_id']:
                print(f"   Word ID: {item['word_id']} | T·ª´: '{item['word']}'")
                print(f"   L√Ω do: {item['reason']}")

    def run_check(self, show_details: bool = True, show_found: bool = True, show_not_found: bool = True):
        """
        Run the complete check: load both files, compare, print the report.

        Args:
            show_details: Print the detailed per-word results.
            show_found: Include the found words in the details.
            show_not_found: Include the not-found words in the details.

        Returns:
            The result dictionary of ``check_all_words``, or None when either
            file yielded no data.
        """
        print("üöÄ B·∫Øt ƒë·∫ßu ki·ªÉm tra...")

        # Load both inputs; failures are reported by the loaders themselves.
        self.load_json_data()
        self.load_csv_data()

        if not self.json_data or not self.csv_data:
            print("‚ùå Kh√¥ng th·ªÉ t·∫£i d·ªØ li·ªáu. Vui l√≤ng ki·ªÉm tra ƒë∆∞·ªùng d·∫´n file.")
            return None

        # Compare words against sentences.
        print("\nüîç ƒêang ki·ªÉm tra...")
        results = self.check_all_words()

        # Report.
        self.print_summary(results)

        if show_details:
            self.print_detailed_results(results, show_found, show_not_found)

        return results

# Usage entry point
def main(json_file: str = "pseudo_sent.json",
         csv_file: str = "word_synsets_with_pos_with_gloss.csv"):
    """
    Run the word check on the given files and print the full report.

    Args:
        json_file: Path to the JSON file handed to WordChecker.
        csv_file: Path to the CSV file handed to WordChecker.

    Returns:
        Whatever ``WordChecker.run_check`` returns.
    """
    checker = WordChecker(json_file, csv_file)
    # Every report section is enabled.
    return checker.run_check(
        show_details=True,
        show_found=True,
        show_not_found=True,
    )

if __name__ == "__main__":
    # Run the tool with the default file paths.
    main()
    
    # Alternatively, the checker can be used directly:
    # checker = WordChecker("path_to_json.json", "path_to_csv.csv")
    # results = checker.run_check()

🚀 Bắt đầu kiểm tra...
✗ Lỗi khi đọc file JSON: [Errno 2] No such file or directory: 'data.json'
✗ Lỗi khi đọc file CSV: [Errno 2] No such file or directory: 'words.csv'
❌ Không thể tải dữ liệu. Vui lòng kiểm tra đường dẫn file.


In [25]:
import json
import csv

# Load the JSON file; it is iterated below as word_id -> sentences.
json_file_path = "pseudo_sent.json"
with open(json_file_path, 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# Load the CSV file, keeping every row plus a word_id -> word lookup.
csv_file_path = "word_synsets_with_pos_with_gloss.csv"
csv_rows = []
word_dict = {}

# newline='' hands line-ending handling to the csv module, which is required
# for quoted fields that contain embedded newlines (same as the writer below).
with open(csv_file_path, 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)
    fieldnames = reader.fieldnames  # Column names, reused for the output file

    for row in reader:
        csv_rows.append(row)
        word_dict[row['word_id']] = row['word']

# Collect the word_ids that have at least one sentence without the word.
problematic_ids = set()

for word_id, sentences in json_data.items():
    if word_id not in word_dict:
        continue

    target_word = word_dict[word_id]
    # Lowercase the word once instead of once per sentence.
    lowered_word = target_word.lower()

    # Case-insensitive substring test; a single sentence without the word is
    # enough to flag the word_id, and any() stops at the first one.
    if any(lowered_word not in sentence.lower() for sentence in sentences):
        problematic_ids.add(word_id)

# Keep the CSV rows whose word_id was flagged.
problematic_rows = [row for row in csv_rows if row['word_id'] in problematic_ids]

# Write the flagged rows to a CSV file with all original columns.
output_file = "missing_words_full.csv"
with open(output_file, 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(problematic_rows)

print(f"Ho√†n th√†nh! ƒê√£ t√¨m th·∫•y {len(problematic_ids)} t·ª´ c√≥ v·∫•n ƒë·ªÅ.")
print(f"K·∫øt qu·∫£ ƒë√£ ƒë∆∞·ª£c l∆∞u v√†o {output_file} v·ªõi ƒë·∫ßy ƒë·ªß c√°c c·ªôt.")

Hoàn thành! Đã tìm thấy 15165 từ có vấn đề.
Kết quả đã được lưu vào missing_words_full.csv với đầy đủ các cột.
