In [1]:
import requests
import json
from time import time, sleep 
from openai import OpenAI 
import os
from dotenv import load_dotenv
import pandas as pd 
from pathlib import Path 
# Load environment variables from .env file
load_dotenv()

# DeepSeek endpoint used as the client's base URL. The API key is not stored in
# the notebook: it is read from the DEEPSEEK_API_KEY environment variable
# (os.getenv returns None when the variable is not set).
DEEPSEEK_API_URL = "https://api.deepseek.com"
DEEPSEEK_API_KEY = os.getenv('DEEPSEEK_API_KEY')

# Default folder for the per-word JSON files written further down.
OUT_FOLDER = "deepseek_enu"

In [2]:
json_data = """
{
  "core_function_words": {
    "articles": ["a", "an", "the"],
    "prepositions": ["of", "in", "to", "for", "with", "on", "at", "from", "by", "about", "as", "into", "like", "through", "after", "over", "between", "out", "against", "during", "without", "before", "under", "around", "among", "off", "above", "near", "behind", "below", "across", "along", "beside", "towards", "onto", "until"],
    "conjunctions": ["and", "but", "or", "nor", "so", "yet", "for", "although", "because", "if", "that", "while", "since", "unless", "whereas", "whether"],
    "pronouns": ["I", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them", "my", "your", "his", "her", "its", "our", "their", "mine", "yours", "hers", "ours", "theirs", "myself", "yourself", "himself", "herself", "itself", "ourselves", "yourselves", "themselves", "who", "whom", "whose", "which", "what", "this", "that", "these", "those", "each", "every", "some", "any", "no", "none", "all", "both", "either", "neither", "one", "another", "other", "such"],
    "auxiliary_verbs": ["be", "am", "is", "are", "was", "were", "been", "have", "has", "had", "do", "does", "did", "can", "could", "may", "might", "must", "shall", "should", "will", "would"]
  },
  "common_nouns": {
    "people": ["person", "man", "woman", "child", "boy", "girl", "family", "friend", "name", "group", "student", "teacher", "doctor", "president", "people", "father", "mother", "son", "daughter", "brother", "sister", "husband", "wife", "parent", "customer", "patient", "member", "worker", "leader"],
    "time": ["time", "year", "day", "week", "month", "hour", "minute", "second", "morning", "afternoon", "evening", "night", "today", "yesterday", "tomorrow", "moment", "period", "date", "age", "future", "past"],
    "place": ["place", "world", "country", "city", "state", "school", "home", "house", "room", "office", "area", "side", "street", "town", "area", "building", "land", "community", "region", "location"],
    "things": ["thing", "way", "part", "number", "word", "life", "point", "line", "form", "case", "system", "program", "question", "government", "company", "money", "water", "food", "car", "door", "book", "paper", "window", "table", "chair", "computer", "phone", "end", "law", "war", "power", "problem", "service", "idea", "reason", "level", "kind", "example", "result", "change", "process", "information", "head", "body", "hand", "eye", "face", "back", "side", "air", "fire", "ground"],
    "abstract_concepts": ["work", "fact", "issue", "effect", "use", "need", "type", "method", "research", "study", "experience", "development", "history", "art", "health", "job", "action", "activity", "decision", "value", "theory", "nature", "order", "course", "report", "education", "knowledge", "love", "relationship", "control", "force", "mind", "game", "quality", "situation", "society", "technology", "project", "plan", "rate", "cost", "event", "business"]
  },
  "common_verbs": {
    "basic_actions": ["be", "have", "do", "make", "get", "go", "come", "see", "look", "hear", "listen", "say", "tell", "talk", "speak", "take", "give", "find", "use", "work", "help", "start", "play", "run", "move", "live", "believe", "bring", "happen", "write", "sit", "stand", "lose", "meet", "include", "continue", "set", "learn", "change", "lead", "understand", "watch", "follow", "stop", "create", "allow", "add", "spend", "grow", "open", "walk", "win", "offer", "remember", "love", "consider", "appear", "buy", "serve", "die", "send", "expect", "build", "stay", "fall", "cut", "reach", "kill", "remain", "suggest", "raise", "pass", "sell", "require", "report", "decide", "pull", "return", "explain", "hope", "develop", "carry", "thank", "receive", "join", "agree", "hit", "produce", "eat", "cover", "catch", "choose", "deal", "throw", "keep", "hold"],
    "mental_actions": ["think", "know", "want", "need", "mean", "feel", "become", "seem", "try", "call", "ask", "show", "seem", "care", "worry", "expect", "imagine", "wonder"],
    "communication": ["speak", "talk", "say", "tell", "ask", "answer", "call", "write", "read", "communicate", "explain", "describe", "report", "inform"],
    "change_influence": ["become", "change", "make", "do", "create", "build", "form", "develop", "increase", "decrease", "improve", "reduce", "affect", "cause", "impact", "influence"],
    "relationships": ["meet", "know", "love", "like", "help", "support", "join", "leave", "follow", "lead"],
    "existence_state": ["be", "exist", "live", "stay", "remain", "seem", "appear", "happen"]
  },
  "common_adjectives": {
    "basic_qualities": ["good", "new", "old", "great", "high", "low", "small", "large", "big", "little", "long", "short", "strong", "weak", "right", "wrong", "different", "important", "possible", "real", "best", "better", "easy", "difficult", "hard", "true", "false", "clear", "simple", "special", "certain", "personal", "open", "public", "common", "full", "available", "ready", "nice", "fine"],
    "descriptive": ["red", "blue", "green", "black", "white", "dark", "light", "beautiful", "happy", "sad", "bad", "serious", "free", "able", "sure", "likely", "single", "recent", "early", "late", "main", "major", "particular", "similar", "significant", "difficult", "easy", "human", "local", "national", "international", "social", "political", "economic", "financial", "legal", "medical", "environmental", "physical", "mental", "general", "specific"],
    "evaluative": ["important", "necessary", "useful", "valuable", "effective", "successful", "popular", "interesting", "positive", "negative", "same", "similar", "different"],
    "quantity_degree": ["much", "many", "few", "several", "enough", "whole", "total", "entire", "half", "most", "least"]
  },
  "common_adverbs": {
    "manner": ["well", "quickly", "slowly", "carefully", "easily", "really", "actually", "probably", "very", "also", "just", "then", "now", "here", "there", "up", "down", "away", "back", "forward", "together", "alone", "early", "late", "again", "still", "always", "never", "often", "sometimes", "usually", "ever", "almost", "already", "even", "only", "quite", "too", "enough", "however", "therefore", "finally", "recently", "especially", "particularly", "generally", "completely", "totally", "absolutely", "simply"],
    "time": ["now", "then", "today", "yesterday", "tomorrow", "soon", "later", "ago", "before", "after", "always", "usually", "often", "sometimes", "rarely", "never", "already", "still", "yet"],
    "place": ["here", "there", "somewhere", "anywhere", "everywhere", "nowhere", "inside", "outside", "above", "below", "near", "far", "away"],
    "degree": ["very", "quite", "rather", "pretty", "really", "extremely", "incredibly", "fairly", "somewhat", "hardly", "barely", "slightly"],
    "frequency": ["always", "usually", "often", "frequently", "sometimes", "occasionally", "seldom", "rarely", "never"]
  },
  "numbers": ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand", "million", "billion", "first", "second", "third", "fourth", "fifth", "last"],
  "question_words": ["who", "what", "where", "when", "why", "how", "which", "whose", "whom"]
}
"""

In [3]:
# Parse the embedded JSON text; top-level values are either dicts of word lists
# (category -> sub-category -> words) or plain word lists.
parsed_json = json.loads(json_data)

In [4]:
# Flatten every category (and sub-category, where present) into one word list.
# Words that appear under several headings are counted once per occurrence.
words = []
for category, sub_cat in parsed_json.items():
    if isinstance(sub_cat, dict):
        for subcategory in sub_cat:
            words.extend(sub_cat[subcategory])
    elif isinstance(sub_cat, list):
        words.extend(sub_cat)

print(f"Total number of words in the JSON: {len(words)}")

Total number of words in the JSON: 714


In [5]:
# One shared OpenAI client configured with the DeepSeek base URL and API key;
# main() passes it to call_deepseek_api so a new client is not built per request.
deepseek_client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_API_URL)

In [6]:
def generate_enu_prompt(word, category="", subcategory=""):
    """
    Build the DeepSeek prompt that asks for a multi-attribute profile of an English word.

    Args:
        word (str): The English word to query.
        category (str): Optional category name mentioned in the prompt.
        subcategory (str): Optional sub-category name; ignored when
            `category` is empty.

    Returns:
        str: A formatted prompt for the API.
    """
    # The category clause (and the sub-category appended to it) only appears
    # when a category is given.
    scope = ""
    if category:
        scope = f" which belongs to '{category}' category "
        if subcategory:
            scope = scope + f" and '{subcategory}' sub-category"

    # Trailing spaces and the four-space indent of interior lines are part of
    # the prompt text, so each line is spelled out explicitly.
    prompt_lines = [
        "You are an expert in English language. ",
        "    Generate a wholistic view in terms of the following attributes ",
        f"    on this word '{word}' {scope} :",
        "",
        "    - meaning: Explain its meaning.",
        "    - pronunciation: Provide its pronunciation.",
        "    - etymology: Explain its origin.",
        f"    - homophone word: List other words that sound like '{word}'.",
        f"    - common phrase: Provide common phrases containing '{word}' ",
        f"    - sentence: Provide example short sentences or famous quotes using '{word}'.",
        f"    - short stories: Share 1-2 short stories related to '{word}'.",
        f"    - poetry: Include 1-2 famous poems containing and describing '{word}'.",
        f"    - image: Suggest simple images illustrating '{word}'.",
        f"    - audio: Suggest short audio clips illustrating '{word}'.",
        f"    - video: Suggest short video clips illustrating '{word}'.",
        f"    - movie: List famous movies related to '{word}' or movie titles containing '{word}'.",
        "    - reference: Provide additional popular reference materials.",
        f"    - interesting website: Suggest popular websites related to '{word}'.",
        "",
        "    Format the output as a valid JSON object.",
    ]
    return "\n".join(prompt_lines)

In [7]:
# Preview the prompt text produced for a sample word and category.
generate_enu_prompt("human", category="biology", subcategory="")

"You are an expert in English language. \n    Generate a wholistic view in terms of the following attributes \n    on this word 'human'  which belongs to 'biology' category  :\n\n    - meaning: Explain its meaning.\n    - pronunciation: Provide its pronunciation.\n    - etymology: Explain its origin.\n    - homophone word: List other words that sound like 'human'.\n    - common phrase: Provide common phrases containing 'human' \n    - sentence: Provide example short sentences or famous quotes using 'human'.\n    - short stories: Share 1-2 short stories related to 'human'.\n    - poetry: Include 1-2 famous poems containing and describing 'human'.\n    - image: Suggest simple images illustrating 'human'.\n    - audio: Suggest short audio clips illustrating 'human'.\n    - video: Suggest short video clips illustrating 'human'.\n    - movie: List famous movies related to 'human' or movie titles containing 'human'.\n    - reference: Provide additional popular reference materials.\n    - int

In [8]:
def extract_json_string(response):
    """
    Pull the JSON payload out of a model reply.

    The reply is expected to wrap the JSON in a Markdown code fence
    (```json ... ```), but a bare ``` fence, a missing closing fence and an
    unfenced reply are handled as well.

    Args:
        response (str | None): Raw text returned by the model.

    Returns:
        str: The payload with surrounding whitespace removed; an empty string
        when `response` is empty or None.
    """
    if not response:
        return ""

    fence = "```"
    # Prefer the explicit ```json fence; fall back to a bare ``` fence.
    for opening in (fence + "json", fence):
        start = response.find(opening)
        if start == -1:
            continue
        start += len(opening)
        end = response.find(fence, start)
        if end == -1:
            # Closing fence missing (e.g. truncated reply): keep everything
            # after the opening fence rather than dropping the last character.
            end = len(response)
        return response[start:end].strip()

    # No fence at all: treat the whole reply as the payload.
    return response.strip()

In [9]:
def call_deepseek_api(prompt, system_message = "You are a helpful assistant", client=None):
    """
    Send one system + user message pair to the "deepseek-chat" model.

    Args:
        prompt (str): Content of the user message.
        system_message (str): Content of the system message.
        client: Object exposing `chat.completions.create(...)`; when None, a
            new OpenAI client is built from DEEPSEEK_API_KEY / DEEPSEEK_API_URL.

    Returns:
        tuple: (content of the first choice's message, elapsed wall-clock
        seconds spent in the request).
    """
    if client is None:
        client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_API_URL)

    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ]

    # Only the request itself is timed, not client construction.
    started_at = time()
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=conversation,
        stream=False,
    )
    elapsed = time() - started_at

    reply_text = completion.choices[0].message.content
    return reply_text, elapsed

In [10]:
def main(character="son", category="", subcategory="", out_folder=OUT_FOLDER):
    """
    Query DeepSeek about one word and save the parsed JSON reply to disk.

    Args:
        character (str): The word to query; also used as the output file stem.
        category (str): Optional category forwarded to the prompt builder.
        subcategory (str): Optional sub-category forwarded to the prompt builder.
        out_folder (str): Folder that receives `<character>-1.json`; created
            when it does not exist yet.

    Raises:
        json.JSONDecodeError: If the text extracted from the reply is not valid JSON.
    """
    # Call the DeepSeek API
    prompt = generate_enu_prompt(character, category, subcategory)
    response, ts_delta = call_deepseek_api(prompt, client=deepseek_client)

    # Parse first, so an invalid reply never leaves a file behind.
    json_str = extract_json_string(response)
    json_obj = json.loads(json_str)
    json_out = json.dumps(json_obj, indent=2, ensure_ascii=False)

    # Create the output folder on demand; without this, open() fails on a
    # fresh run where the folder has not been created by hand.
    Path(out_folder).mkdir(parents=True, exist_ok=True)
    file_json = f"{out_folder}/{character}-1.json"
    with open(file_json, "w", encoding="utf-8") as f:
        f.write(json_out)

    # Report only after the file has actually been written.
    print(f"API call for '{character}' completed in {ts_delta} sec, \n output written to {file_json}")

In [11]:
def process_words(words, cat="", sub_cat="", out_folder=OUT_FOLDER):
    """
    Run `main` for every word that does not yet have an output file.

    Args:
        words (list[str]): Words to query.
        cat (str): Category forwarded to `main` for every word.
        sub_cat (str): Sub-category forwarded to `main` for every word.
        out_folder (str): Folder checked for existing `<word>-1.json` files
            and used as the destination for new ones.

    Returns:
        list[str]: The words that were processed successfully in this call
        (skipped and failed words are not included).
    """
    processed = []
    for zi in words:
        # A word whose file already exists (earlier run, or the same word
        # listed under another heading) is not queried again.
        file_json = f"{out_folder}/{zi}-1.json"
        if Path(file_json).exists():
            print(f"Skip {zi}")
            continue
        try:
            # Forward out_folder so the file is written to the same folder
            # the skip check above looks in.
            main(character=zi, category=cat, subcategory=sub_cat, out_folder=out_folder)
            processed.append(zi)
        except Exception as e:
            # Best-effort batch: report the failure and move on to the next word.
            print(f"Failed to process {zi}:\n {str(e)}")
    return processed

In [12]:
# Walk every category: dict values hold sub-category -> word list, list values
# are word lists without a sub-category. Collect the words handled in this run.
processed = []

for category, sub_cat in parsed_json.items():
    if isinstance(sub_cat, dict):
        for subcategory, words in sub_cat.items():
            processed.extend(process_words(words, cat=category, sub_cat=subcategory))
    elif isinstance(sub_cat, list):
        words = sub_cat
        processed.extend(process_words(words, cat=category))

API call for 'a' completed in 17.480198621749878 sec, 
 output written to deepseek_enu/a-1.json
API call for 'an' completed in 13.252955198287964 sec, 
 output written to deepseek_enu/an-1.json
API call for 'the' completed in 16.925647974014282 sec, 
 output written to deepseek_enu/the-1.json
API call for 'of' completed in 18.194700956344604 sec, 
 output written to deepseek_enu/of-1.json
API call for 'in' completed in 15.008503198623657 sec, 
 output written to deepseek_enu/in-1.json
API call for 'to' completed in 14.622117519378662 sec, 
 output written to deepseek_enu/to-1.json
API call for 'for' completed in 18.098021745681763 sec, 
 output written to deepseek_enu/for-1.json
API call for 'with' completed in 16.942495346069336 sec, 
 output written to deepseek_enu/with-1.json
API call for 'on' completed in 13.444606304168701 sec, 
 output written to deepseek_enu/on-1.json
API call for 'at' completed in 16.19912362098694 sec, 
 output written to deepseek_enu/at-1.json
API call for 'f

API call for 'yourselves' completed in 17.276864767074585 sec, 
 output written to deepseek_enu/yourselves-1.json
API call for 'themselves' completed in 16.21254563331604 sec, 
 output written to deepseek_enu/themselves-1.json
API call for 'who' completed in 15.362658500671387 sec, 
 output written to deepseek_enu/who-1.json
API call for 'whom' completed in 16.165207624435425 sec, 
 output written to deepseek_enu/whom-1.json
API call for 'whose' completed in 15.246801137924194 sec, 
 output written to deepseek_enu/whose-1.json
API call for 'which' completed in 19.332354307174683 sec, 
 output written to deepseek_enu/which-1.json
API call for 'what' completed in 13.859436988830566 sec, 
 output written to deepseek_enu/what-1.json
API call for 'this' completed in 16.821837186813354 sec, 
 output written to deepseek_enu/this-1.json
Skip that
API call for 'these' completed in 15.178189992904663 sec, 
 output written to deepseek_enu/these-1.json
API call for 'those' completed in 15.07605695

API call for 'hour' completed in 19.199199676513672 sec, 
 output written to deepseek_enu/hour-1.json
API call for 'minute' completed in 15.302237510681152 sec, 
 output written to deepseek_enu/minute-1.json
API call for 'second' completed in 15.061413764953613 sec, 
 output written to deepseek_enu/second-1.json
API call for 'morning' completed in 18.82902956008911 sec, 
 output written to deepseek_enu/morning-1.json
API call for 'afternoon' completed in 18.64539623260498 sec, 
 output written to deepseek_enu/afternoon-1.json
API call for 'evening' completed in 18.37855100631714 sec, 
 output written to deepseek_enu/evening-1.json
API call for 'night' completed in 13.152798891067505 sec, 
 output written to deepseek_enu/night-1.json
API call for 'today' completed in 18.6242618560791 sec, 
 output written to deepseek_enu/today-1.json
API call for 'yesterday' completed in 17.53290629386902 sec, 
 output written to deepseek_enu/yesterday-1.json
API call for 'tomorrow' completed in 14.6076

API call for 'hand' completed in 22.771055936813354 sec, 
 output written to deepseek_enu/hand-1.json
API call for 'eye' completed in 16.170180320739746 sec, 
 output written to deepseek_enu/eye-1.json
API call for 'face' completed in 13.843894481658936 sec, 
 output written to deepseek_enu/face-1.json
API call for 'back' completed in 16.66243863105774 sec, 
 output written to deepseek_enu/back-1.json
Skip side
API call for 'air' completed in 18.099297523498535 sec, 
 output written to deepseek_enu/air-1.json
API call for 'fire' completed in 18.06807827949524 sec, 
 output written to deepseek_enu/fire-1.json
API call for 'ground' completed in 16.567718744277954 sec, 
 output written to deepseek_enu/ground-1.json
API call for 'work' completed in 25.007866382598877 sec, 
 output written to deepseek_enu/work-1.json
API call for 'fact' completed in 17.458003044128418 sec, 
 output written to deepseek_enu/fact-1.json
API call for 'issue' completed in 16.6334011554718 sec, 
 output written t

API call for 'include' completed in 17.43864345550537 sec, 
 output written to deepseek_enu/include-1.json
API call for 'continue' completed in 14.172811508178711 sec, 
 output written to deepseek_enu/continue-1.json
API call for 'set' completed in 20.29839515686035 sec, 
 output written to deepseek_enu/set-1.json
API call for 'learn' completed in 16.114689588546753 sec, 
 output written to deepseek_enu/learn-1.json
Skip change
API call for 'lead' completed in 16.607683420181274 sec, 
 output written to deepseek_enu/lead-1.json
API call for 'understand' completed in 15.307388544082642 sec, 
 output written to deepseek_enu/understand-1.json
API call for 'watch' completed in 18.11398935317993 sec, 
 output written to deepseek_enu/watch-1.json
API call for 'follow' completed in 15.637834310531616 sec, 
 output written to deepseek_enu/follow-1.json
API call for 'stop' completed in 17.230590343475342 sec, 
 output written to deepseek_enu/stop-1.json
API call for 'create' completed in 13.824

API call for 'inform' completed in 13.397252798080444 sec, 
 output written to deepseek_enu/inform-1.json
Skip become
Skip change
Skip make
Skip do
Skip create
Skip build
Skip form
Skip develop
API call for 'increase' completed in 17.193737745285034 sec, 
 output written to deepseek_enu/increase-1.json
API call for 'decrease' completed in 16.08383274078369 sec, 
 output written to deepseek_enu/decrease-1.json
API call for 'improve' completed in 20.612641096115112 sec, 
 output written to deepseek_enu/improve-1.json
API call for 'reduce' completed in 17.00175905227661 sec, 
 output written to deepseek_enu/reduce-1.json
API call for 'affect' completed in 15.473253965377808 sec, 
 output written to deepseek_enu/affect-1.json
API call for 'cause' completed in 16.76030683517456 sec, 
 output written to deepseek_enu/cause-1.json
API call for 'impact' completed in 18.95969271659851 sec, 
 output written to deepseek_enu/impact-1.json
API call for 'influence' completed in 16.066906929016113 sec

API call for 'human' completed in 13.545839071273804 sec, 
 output written to deepseek_enu/human-1.json
API call for 'local' completed in 15.40318512916565 sec, 
 output written to deepseek_enu/local-1.json
API call for 'national' completed in 14.417957782745361 sec, 
 output written to deepseek_enu/national-1.json
API call for 'international' completed in 13.84869647026062 sec, 
 output written to deepseek_enu/international-1.json
API call for 'social' completed in 16.253165006637573 sec, 
 output written to deepseek_enu/social-1.json
API call for 'political' completed in 13.265098333358765 sec, 
 output written to deepseek_enu/political-1.json
API call for 'economic' completed in 15.699310541152954 sec, 
 output written to deepseek_enu/economic-1.json
API call for 'financial' completed in 15.089907169342041 sec, 
 output written to deepseek_enu/financial-1.json
API call for 'legal' completed in 14.032010555267334 sec, 
 output written to deepseek_enu/legal-1.json
API call for 'medica

API call for 'generally' completed in 12.877944231033325 sec, 
 output written to deepseek_enu/generally-1.json
API call for 'completely' completed in 17.459843635559082 sec, 
 output written to deepseek_enu/completely-1.json
API call for 'totally' completed in 17.72020125389099 sec, 
 output written to deepseek_enu/totally-1.json
API call for 'absolutely' completed in 14.522823810577393 sec, 
 output written to deepseek_enu/absolutely-1.json
API call for 'simply' completed in 15.881147146224976 sec, 
 output written to deepseek_enu/simply-1.json
Skip now
Skip then
Skip today
Skip yesterday
Skip tomorrow
API call for 'soon' completed in 16.094309091567993 sec, 
 output written to deepseek_enu/soon-1.json
API call for 'later' completed in 17.908982276916504 sec, 
 output written to deepseek_enu/later-1.json
API call for 'ago' completed in 14.423182964324951 sec, 
 output written to deepseek_enu/ago-1.json
Skip before
Skip after
Skip always
API call for 'usually' completed in 17.63158845

In [13]:
# NOTE(review): looks like a leftover working-directory check; the captured
# output exposes a local absolute path — consider removing this cell before sharing.
!cd 

C:\Users\p2p2l\projects\wgong\zistory\zinets\dev\notebook
