In [15]:
from docxcompose.composer import Composer
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
import pandas as pd
import random
import os

In [16]:
# * set review preferences

# Categories to draw questions from. The selection cell turns each name into a
# "category_<name>" column name and keeps questions whose sum over those
# columns is greater than 0.
ls_prefered_cate = ["fin", "cs", "math", "other"]
# Preferred difficulty. The selection cell draws 40% ~ 60% of the paper from
# questions whose "difficulty" equals this value.
# NOTE(review): 0 is documented as "random", but the selection cell only
# compares `difficulty == f_mean_difficulty` -- confirm 0 behaves as intended.
f_mean_difficulty = 1  # 0~5. 0 means random.
# Ordering preferences. When both are non-zero the selection cell ranks by the
# product "accuracy" * "days since last review"; when exactly one is non-zero
# it sorts by that column alone (1 = ascending, 2 = descending).
f_fresh_type = 1  # Represents "days since last review". 1 = "near", 2 = "far", 0 = "random"
f_accu_type = 1  # Represents "accuracy". 1 = "low accuracy", 2 = "high accuracy", 0 = "random"
# Requested paper length; lowered by the selection cell when fewer questions exist.
f_q_num = 10 # number of questions

In [17]:
# * choose questions according to preference

# *** get info

# Question table and review log, both read from the working directory.
df_info = pd.read_excel("info.xlsx")
df_records = pd.read_excel("records.xlsx")

# ***** handle missing values

# Questions without statistics get accuracy 0 and a very large
# "days since last review".
for s_col, fill_value in (("accuracy", 0), ("days since last review", 30000)):
    df_info[s_col] = df_info[s_col].fillna(fill_value)

# The date-list columns are read as text: drop the surrounding brackets and
# split on commas so every cell becomes a list of strings.
for s_col in ("review dates", "mistake dates"):
    df_info[s_col] = df_info[s_col].astype(str).str.strip("[]").str.split(",")

# ***** update df_info according to df_records


def update_mistake_dates(row, df_records):
    """Collect the dates on which the question in ``row`` was answered wrongly.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide ``row["question_id"]``.
    df_records : pandas.DataFrame
        Review log with the columns "question_id", "mistake" and "date".

    Returns
    -------
    numpy.ndarray or list
        The "date" values of the log rows that match the question and have
        ``mistake == 1``; an empty list when no row matches.
    """
    is_same_question = df_records["question_id"] == row["question_id"]
    is_mistake = df_records["mistake"] == 1
    df_matched = df_records[is_same_question & is_mistake]

    if len(df_matched) == 0:
        return []
    return df_matched["date"].values


# Rebuild every question's mistake history from the review log (this replaces
# whatever was parsed from the "mistake dates" column of info.xlsx).
df_info["mistake dates"] = df_info.apply(
    lambda row: update_mistake_dates(row, df_records), axis=1
)


# Never ask for more questions than the question table holds.
if len(df_info) < f_q_num:
    print(f"Only {len(df_info)} questions are available.")
    f_q_num = len(df_info)


# *** selection

# ***** category

# Add the "category_" prefix only where it is missing, so running this cell a
# second time does not turn the names into "category_category_*".
ls_prefered_cate = [
    cate if str(cate).startswith("category_") else f"category_{cate}"
    for cate in ls_prefered_cate
]
# A question qualifies when the sum over its preferred category columns is > 0.
# The filtered rows get their own name: df_info is saved back to info.xlsx
# later on, and overwriting it here would drop every non-matching question
# from that file.
df_info_candidates = df_info[df_info[ls_prefered_cate].sum(axis=1) > 0]

# ***** difficulty

is_set_difficulty = df_info_candidates["difficulty"] == f_mean_difficulty
# .copy(): the ranking below adds and overwrites columns on these two pools;
# working on explicit copies avoids assigning into a slice of another frame.
df_info_set_difficulty = df_info_candidates[is_set_difficulty].copy()
df_info_unset_difficulty = df_info_candidates[~is_set_difficulty].copy()

f_q_num_set_difficulty = round(
    f_q_num * (0.4 + (0.6 - 0.4) * random.random())
)  # 40% ~ 60% questions with set difficulty

# The pool with the preferred difficulty may be smaller than its share; the
# remainder is requested from the other pool.
f_q_num_set_difficulty = min(f_q_num_set_difficulty, len(df_info_set_difficulty))
f_q_num_unset_difficulty = f_q_num - f_q_num_set_difficulty

# ***** fresh type & accuracy

if f_fresh_type == 0 and f_accu_type == 0:
    # Both preferences are "random" (documented as valid in the settings):
    # shuffle the pools instead of raising.
    df_info_set_difficulty = df_info_set_difficulty.sample(frac=1)
    df_info_unset_difficulty = df_info_unset_difficulty.sample(frac=1)

elif f_fresh_type == 0 or f_accu_type == 0:
    # Exactly one preference is set: sort by that column alone.
    s_care = "days since last review" if f_fresh_type != 0 else "accuracy"

    # One of the two summands is 0, so the sum is the value of the set one.
    f_care_type = f_fresh_type + f_accu_type
    if f_care_type == 1:
        bi_ascending = True
    elif f_care_type == 2:
        bi_ascending = False
    else:
        raise ValueError("Wrong in f_fresh_type or f_accu_type")

    df_info_set_difficulty = df_info_set_difficulty.sort_values(
        by=s_care, ascending=bi_ascending
    )
    df_info_unset_difficulty = df_info_unset_difficulty.sort_values(
        by=s_care, ascending=bi_ascending
    )

else:
    # Both preferences are set: rank ascending by accuracy * days. A preference
    # of 2 mirrors the corresponding factor first so that large values rank first.
    for df_pool in (df_info_set_difficulty, df_info_unset_difficulty):
        if f_fresh_type == 2:
            df_pool["days since last review"] = (
                df_pool["days since last review"].max()
                - df_pool["days since last review"]
            )
        if f_accu_type == 2:
            df_pool["accuracy"] = 1 - df_pool["accuracy"]
        df_pool["accuracy & freshness"] = (
            df_pool["accuracy"] * df_pool["days since last review"]
        )

    df_info_set_difficulty = df_info_set_difficulty.sort_values(
        by="accuracy & freshness",
    )
    df_info_unset_difficulty = df_info_unset_difficulty.sort_values(
        by="accuracy & freshness",
    )

# Take the requested number of questions from the top of each ranked pool.
df_info_set_difficulty = df_info_set_difficulty.iloc[:f_q_num_set_difficulty]
df_info_unset_difficulty = df_info_unset_difficulty.iloc[:f_q_num_unset_difficulty]

df_q = pd.concat([df_info_set_difficulty, df_info_unset_difficulty])


Only 2 questions are available.


In [18]:
# * update df_info

# Today's date as a "YYYY-MM-DD" string; it is used both as the review date
# and inside the generated file names.
ts_today = pd.Timestamp.today().date().isoformat()

new_val = ts_today

# df_info["review dates"] = df_info.apply(
#     lambda row: list(set(list(row["review dates"]) + [new_val]))
#     if row.name in df_q.index
#     else row["review dates"],
#     axis=1,
# )


def add_review_date(row, ts_today):
    """Return the row's review history as ``YYYY-MM-DD`` strings, ending with today.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide ``row["review dates"]``, an iterable of date-like entries.
    ts_today : str
        Today's date, already formatted as ``YYYY-MM-DD``.

    Returns
    -------
    list of str
        Every entry that can be parsed as a date, normalised to ``YYYY-MM-DD``
        in its original order, followed by ``ts_today`` unless it is already
        present. Entries that are not dates (for example ``"nan"``) are dropped.
    """
    ls_added_dates = []
    for origin_date in row["review dates"]:
        # Entries are fragments of a list that was stored as text, so they may
        # still carry spaces, quotes or brackets; remove them so parsing does
        # not depend on the date parser tolerating that noise.
        s_cleaned = str(origin_date).strip(" \t[]'\"")
        ts_parsed = pd.to_datetime(s_cleaned, errors="coerce")
        if pd.isna(ts_parsed):
            # Not a date: skip it explicitly instead of catching every exception.
            continue
        ls_added_dates.append(ts_parsed.strftime("%Y-%m-%d"))

    if ts_today not in ls_added_dates:
        ls_added_dates.append(ts_today)
    return ls_added_dates


# Normalise every question's review history and record today's review.
# NOTE(review): this runs on every row of df_info, not only on the questions
# picked into df_q -- confirm that questions which were not selected should
# also receive today's date.
df_info["review dates"] = df_info.apply(add_review_date, axis=1, args=(ts_today,))


def update_freshness(row, ts_today):
    """Return the number of days between the last review and ``ts_today``.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide ``row["review dates"]``, a sequence of date strings; the
        last element is taken as the most recent review.
    ts_today : str
        Today's date, parseable by ``pandas.to_datetime``.

    Returns
    -------
    int
        Whole days since the last review, or 30000 when the history is empty
        or its last entry is not a date.
    """
    ls_review_hist = row["review dates"]
    # 30000 is the same "never reviewed" placeholder that is filled in for
    # missing "days since last review" values when info.xlsx is loaded.
    if len(ls_review_hist) == 0:
        return 30000

    ts_last_review = pd.to_datetime(ls_review_hist[-1], errors="coerce")
    if pd.isna(ts_last_review):
        return 30000
    return (pd.to_datetime(ts_today) - ts_last_review).days

# Recompute "days since last review" for every question from its (already
# refreshed) review history.
df_info["days since last review"] = df_info.apply(
    update_freshness, axis=1, args=(ts_today,)
)

# def update_mistake_dates(row): # "nan" appears when parsed
#     ls_mistake_dates = row['mistake dates']
#     ls_mistake_dates_update = []
#     for mistake_date in ls_mistake_dates:
#         if "20" in mistake_date:
#             ls_mistake_dates_update.append(mistake_date)
#     return ls_mistake_dates_update

#     # return [date if "20" in row['mistake dates'] else continue]
# df_info['mistake dates'] = df_info.apply(lambda row: update_mistake_dates(row), axis = 1)


def update_accuracy(row):
    """Return the share of reviews of a question that were answered correctly.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide the sized collections ``row["mistake dates"]`` and
        ``row["review dates"]``.

    Returns
    -------
    float or int
        ``1 - mistakes / reviews`` limited to the range 0..1, or 0 when the
        question has no review yet (the same default that is filled in for
        missing accuracy values when info.xlsx is loaded).
    """
    n_reviews = len(row["review dates"])
    if n_reviews == 0:
        return 0

    n_mistakes = len(row["mistake dates"])
    # The ratio (not the difference) of mistakes to reviews keeps the value a
    # fraction, which the selection code relies on when it computes
    # "1 - accuracy". Clamp in case more mistakes than reviews are on record.
    return min(1.0, max(0.0, 1 - n_mistakes / n_reviews))


# Recompute every question's accuracy from its mistake and review histories.
df_info["accuracy"] = df_info.apply(update_accuracy, axis=1)

# Write the updated question table back to disk.
df_info.to_excel("info.xlsx", index=None)


# * update df_records

# Append one log row per selected question, then sort by date and drop exact
# duplicate rows before saving.
# NOTE(review): this assumes df_q carries a "date" column -- confirm against
# the schema of info.xlsx.
df_records_update = df_q[["date", "question_id"]]
df_records = (
    pd.concat([df_records, df_records_update])
    .sort_values(by="date")
    .drop_duplicates()
)
df_records.to_excel("records.xlsx", index=None)

In [None]:
# * get questions docs from the question folder

# One path per selected question, built element-wise as "Q//<question_id>.docx".
ls_q_docs_path = "Q//" + df_q["question_id"].values + ".docx"

# The first question document is the base that the remaining ones are appended to.
# NOTE(review): indexing [0] assumes at least one question was selected --
# confirm df_q cannot be empty here.
master = Document(ls_q_docs_path[0])
composer = Composer(master)

def add_page_break(paragraph):
    """Append a page break to the end of ``paragraph``.

    Parameters
    ----------
    paragraph : docx.text.paragraph.Paragraph
        Paragraph that should be followed by a new page.
    """
    break_element = OxmlElement("w:br")
    break_element.set(qn("w:type"), "page")
    # In WordprocessingML a break is run content: it has to sit inside a run
    # (<w:r>), not directly under the paragraph element (<w:p>). Add a new
    # empty run at the end of the paragraph and place the break in it.
    paragraph.add_run()._r.append(break_element)

# * merge the questions

for file_path in ls_q_docs_path[1:]:
    # End what has been assembled so far with a page break, then append the
    # next question document.
    add_page_break(master.paragraphs[-1])
    composer.append(Document(file_path))

# * save the question file

# Try the suffixes 1, 2, 3, ... until a file name is found that is not taken yet.
f_num_of_the_day = 1
while True:
    s_file_name = f"review paper//review-q{ts_today}-{f_num_of_the_day}.docx"
    if not os.path.exists(s_file_name):
        break
    f_num_of_the_day += 1

composer.save(s_file_name)

print("Questions generated successfully!")

In [None]:
# * get answers docs from the "A" folder

# An answer id is the question id with its first character replaced by "a".
df_a = df_q[["question_id"]].rename(columns={"question_id": "answer_id"})
df_a["answer_id"] = df_a["answer_id"].map(lambda s_id: "a" + s_id[1:])

# One path per selected answer, built element-wise as "A//<answer_id>.docx".
ls_a_docs_path = "A//" + df_a["answer_id"].values + ".docx"

# The first answer document is the base that the remaining ones are appended to.
master = Document(ls_a_docs_path[0])
composer = Composer(master)

# * merge the answers

for file_path in ls_a_docs_path[1:]:
    # End what has been assembled so far with a page break, then append the
    # next answer document.
    add_page_break(master.paragraphs[-1])
    composer.append(Document(file_path))

# * save the answer file

# Try the suffixes 1, 2, 3, ... until a file name is found that is not taken yet.
f_num_of_the_day = 1
while True:
    s_file_name = f"review paper//review-a{ts_today}-{f_num_of_the_day}.docx"
    if not os.path.exists(s_file_name):
        break
    f_num_of_the_day += 1

composer.save(s_file_name)

print("Answers generated successfully!")

In [19]:
# # * get questions docs from the question folder

# ls_q_docs_path = "Q//" + df_q["question_id"].values + ".docx"

# master = Document(ls_q_docs_path[0])
# composer = Composer(master)

# # * merge the questions

# for file_path in ls_q_docs_path[1:]:
#     doc = Document(file_path)
#     composer.append(doc)

# # * save the question file
# f_num_of_the_day = 1
# s_file_name = f"review paper//review-q{ts_today}-{f_num_of_the_day}.docx"

# while os.path.exists(s_file_name):
#     f_num_of_the_day += 1
#     s_file_name = f"review paper//review-q{ts_today}-{f_num_of_the_day}.docx"

# composer.save(s_file_name)

# print("Questions generated successfully!")

Questions generated successfully!


In [21]:
# # * get answers docs from the question folder

# df_a = df_q[["question_id"]].copy()
# df_a.columns = ["answer_id"]

# df_a["answer_id"] = df_a.apply(lambda row: "a" + row["answer_id"][1:], axis=1)

# ls_a_docs_path = "A//" + df_a["answer_id"].values + ".docx"

# master = Document(ls_a_docs_path[0])
# composer = Composer(master)

# # * merge the questions

# for file_path in ls_a_docs_path[1:]:
#     doc = Document(file_path)
#     composer.append(doc)

# # * save the question file
# f_num_of_the_day = 1
# s_file_name = f"review paper//review-a{ts_today}-{f_num_of_the_day}.docx"

# while os.path.exists(s_file_name):
#     f_num_of_the_day += 1
#     s_file_name = f"review paper//review-a{ts_today}-{f_num_of_the_day}.docx"

# composer.save(s_file_name)

# print("Answers generated successfully!")

Answers generated successfully!
