In [19]:
"""
Author: Seungjae, Lee
Date: 2025-08-26 (Tue)
"""

import sqlite3

# List the names of all tables in the CEFR word database.
conn = sqlite3.connect("word_cefr_minified.db")
try:
    cursor = conn.cursor()

    # Each row returned by PRAGMA table_list describes one table; index 1 of
    # the row is the table name (index 0 is the schema it belongs to).
    # NOTE(review): this pragma is only available in newer SQLite releases
    # (3.37+ per the SQLite docs) - confirm the runtime's SQLite version.
    cursor.execute("PRAGMA table_list;")
    tables = [r[1] for r in cursor.fetchall()]
finally:
    # Every cell opens its own connection, so release this one explicitly
    # instead of leaving the handle open for the lifetime of the kernel.
    conn.close()

print(tables)

['word_categories', 'categories', 'word_pos', 'pos_tags', 'sqlite_sequence', 'words', 'sqlite_schema', 'sqlite_temp_schema']


In [9]:
import sqlite3

# Dump every schema statement stored in the database to schema.sql.
conn = sqlite3.connect("word_cefr_minified.db")
try:
    cursor = conn.cursor()

    # Fetch all CREATE statements; entries in sqlite_master whose `sql`
    # column is NULL are filtered out by the WHERE clause.
    cursor.execute("SELECT sql FROM sqlite_master WHERE sql NOT NULL;")
    schema_statements = cursor.fetchall()
finally:
    # The rows are already in memory, so the connection can be released
    # before the file is written.
    conn.close()

# Each fetched row is a 1-tuple holding the statement text; terminate every
# statement with ';' and separate statements with a blank line.
with open("schema.sql", "w", encoding="utf-8") as f:
    for stmt in schema_statements:
        f.write(stmt[0] + ";\n\n")

print("Schema exported to schema.sql")


Schema exported to schema.sql


In [20]:
import sqlite3

# Select every (word, part-of-speech) entry whose level is exactly 6, together
# with the tag code and its description from pos_tags.
query = """
    SELECT w.word_id,
           w.word,
           wp.pos_tag_id,
           wp.level,
           pt.tag,
           pt.description
    FROM words AS w
    JOIN word_pos AS wp
        ON w.word_id = wp.word_id
    JOIN pos_tags AS pt
        ON wp.pos_tag_id = pt.tag_id
    WHERE wp.level = 6
"""

conn = sqlite3.connect("word_cefr_minified.db")
try:
    cursor = conn.cursor()
    cursor.execute(query)
    rows = cursor.fetchall()
finally:
    # All rows are already fetched; close the connection so repeated runs of
    # this cell do not accumulate open handles.
    conn.close()

# Show only the first 10 rows.
for row in rows[:10]:
    print(row)

(159572, 'alsine', 12, 6.0, 'NN', 'Noun, singular or mass')
(109633, 'alca', 14, 6.0, 'NNP', 'Proper noun, singular')
(40616, 'annabel', 14, 6.0, 'NNP', 'Proper noun, singular')
(62281, 'amygdaloid', 12, 6.0, 'NN', 'Noun, singular or mass')
(84063, 'amores', 13, 6.0, 'NNS', 'Noun, plural')
(138271, 'aavso', 14, 6.0, 'NNP', 'Proper noun, singular')
(73092, 'awd', 12, 6.0, 'NN', 'Noun, singular or mass')
(115597, 'allegre', 12, 6.0, 'NN', 'Noun, singular or mass')
(64629, 'asea', 14, 6.0, 'NNP', 'Proper noun, singular')
(128175, 'ariege', 14, 6.0, 'NNP', 'Proper noun, singular')


In [None]:
"""
Distinct difficulty values stored in the word_pos.level column
"""

import sqlite3

# Collect the distinct values of word_pos.level in ascending order. The joins
# restrict the result to entries that have a matching word and POS tag.
query = """
    SELECT DISTINCT wp.level
    FROM words AS w
    JOIN word_pos AS wp
        ON w.word_id = wp.word_id
    JOIN pos_tags AS pt
        ON wp.pos_tag_id = pt.tag_id
    ORDER BY wp.level
"""

conn = sqlite3.connect("word_cefr_minified.db")
try:
    cursor = conn.cursor()
    cursor.execute(query)
    # Each fetched row is a 1-tuple; unwrap it to the bare level value.
    levels = [row[0] for row in cursor.fetchall()]
finally:
    # Results are in memory; release the connection explicitly.
    conn.close()

# The levels are not limited to whole numbers: the stored values include
# fractional ones such as 1.0000396..., 1.0701..., 1.3333..., 1.5.
print(levels)

[1.0, 1.0000396219873482, 1.0701639982876034, 1.1142242838803693, 1.149932008204644, 1.1529742060442045, 1.1667891210844017, 1.1669794962777376, 1.1713148756978897, 1.1760016154182513, 1.1778300365814731, 1.183829860733898, 1.1838298607338982, 1.1938791673345095, 1.2038788094569695, 1.2107128405410652, 1.22433516266898, 1.2307175039478038, 1.248152071850814, 1.2745910311804063, 1.286365517965287, 1.2932599989760947, 1.2976011024982808, 1.3139047806425121, 1.3230096724259388, 1.3289218520997739, 1.3310748843699964, 1.3311206572835312, 1.3333333333333333, 1.3406128994276745, 1.3536044961876756, 1.3590881845599414, 1.364391899694818, 1.3694757237502886, 1.3846444702671483, 1.3862580865700183, 1.391626316962281, 1.4099356224245314, 1.4206240133604693, 1.437150164126276, 1.4407827929105725, 1.4562941959947777, 1.45927692460701, 1.462501737795753, 1.466834566139269, 1.4695377194053674, 1.4741728167137298, 1.498483123059227, 1.5, 1.5080229566862315, 1.5080229566862318, 1.5108125360364568, 1.5

In [47]:
import sqlite3

# Count how many distinct word strings the words table contains.
query = """
    SELECT COUNT(DISTINCT w.word)
    FROM words AS w
"""

conn = sqlite3.connect("word_cefr_minified.db")
try:
    cursor = conn.cursor()
    cursor.execute(query)
    # The aggregate query yields a single row, a 1-tuple holding the count.
    row = cursor.fetchone()
finally:
    # Close the connection so re-running the cell does not leak handles.
    conn.close()

row

(172782,)

In [None]:
import sqlite3

# Select (word, tag) pairs whose level rounds to 5 and whose tag is one of the
# listed POS tags, ordered by descending frequency_count and capped at 3777
# rows.
# NOTE(review): presumably level 5 corresponds to CEFR C1 (the output file is
# named cefr-c1.txt), and 3777 is a target list size - confirm both.
query = """
    SELECT w.word, pt.tag
    FROM words AS w
    JOIN word_pos AS wp
        ON w.word_id = wp.word_id
    JOIN pos_tags AS pt
        ON wp.pos_tag_id = pt.tag_id
    WHERE ROUND(wp.level) = 5
      AND pt.tag IN ('NN', 'VB', 'JJ', 'RB', 'IN', 'CC', 'TO', 'DT', 'PRP$')
    ORDER BY wp.frequency_count DESC
    LIMIT 3777
"""

conn = sqlite3.connect("word_cefr_minified.db")
try:
    cursor = conn.cursor()
    cursor.execute(query)
    rows = cursor.fetchall()
finally:
    # All rows are fetched; release the connection before writing the file.
    conn.close()

# Preview the first 20 (word, tag) pairs.
for r in rows[:20]:
    print(r)

words = [r[0] for r in rows]

# NOTE(review): the query returns one row per (word, tag) match, so a word
# matching under several of the listed tags would presumably be written more
# than once - confirm whether the file should contain unique words only.
# The encoding is set explicitly so the file does not depend on the platform's
# default text encoding.
with open('cefr-c1.txt', 'w', encoding='utf-8') as f:
    f.write(','.join(words))
