In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Raw CSV exports hosted in the Capstone-Buddies Machine-Learning repository.
link_tps = 'https://raw.githubusercontent.com/Capstone-Buddies/Machine-Learning/main/Dataset/SNBT%20Datasets%20-%20TPS.csv'
link_answer_history_tps = 'https://raw.githubusercontent.com/Capstone-Buddies/Machine-Learning/main/Dataset/SNBT%20Datasets%20-%20Answer_History_TPS.csv'

# Load the TPS question file first, then the answer-history file; the two
# reads are independent of each other.
tps_question_data = pd.read_csv(link_tps)
user_history = pd.read_csv(link_answer_history_tps)

In [None]:
# Count how many questions the user has answered in each category.
def get_total_questions_per_category(user_data):
    """Return the number of rows of ``user_data`` per ``Question_Category``.

    Args:
        user_data: DataFrame with a ``Question_Category`` column.

    Returns:
        A Series indexed by category whose values are row counts.
    """
    grouped_by_category = user_data.groupby('Question_Category')
    answered_counts = grouped_by_category.size()
    return answered_counts

# Count how many questions the user answered incorrectly in each category.
def get_mistakes_per_category(user_data):
    """Split out the wrongly answered rows and count them per category.

    A row is a mistake when ``USER_ANSWER`` differs from ``CORRECT_ANSWER``.

    Args:
        user_data: DataFrame with ``USER_ANSWER``, ``CORRECT_ANSWER`` and
            ``Question_Category`` columns.

    Returns:
        A tuple ``(counts, rows)``: a Series of mistake counts indexed by
        category, and the DataFrame of the mistaken rows themselves.
    """
    is_wrong = user_data['USER_ANSWER'].ne(user_data['CORRECT_ANSWER'])
    wrong_answers = user_data.loc[is_wrong]
    wrong_counts = wrong_answers.groupby('Question_Category').size()
    return wrong_counts, wrong_answers

In [None]:
# Compute the similarity between each wrongly answered question and the other
# questions of the same category.
def calculate_similarity(mistakes, tps_question_data, top_n=10):
    """Rank question-bank entries by TF-IDF cosine similarity to each mistake.

    Args:
        mistakes: DataFrame of wrongly answered rows with ``ID_QUESTION``,
            ``Question_Category`` and ``Question_Description`` columns.
        tps_question_data: question bank with ``Question_Category`` and
            ``Questions_Descriptions`` columns.
        top_n: how many of the most similar questions to keep per mistake.

    Returns:
        A dict mapping each ``ID_QUESTION`` to a Series of similarity scores.
        The Series is indexed by ``"Q<n>"`` labels, where ``n`` is the 1-based
        row position in ``tps_question_data``, restricted to the mistake's own
        category and sorted from most to least similar. An empty ``mistakes``
        frame yields an empty dict.
    """
    # Nothing to compare: cosine_similarity rejects a 0-row matrix, so bail out early.
    if len(mistakes) == 0:
        return {}

    # Missing descriptions become empty documents instead of making the
    # vectorizer raise on NaN.
    combined_descriptions = pd.concat(
        [mistakes['Question_Description'], tps_question_data['Questions_Descriptions']]
    ).fillna('').astype(str)

    tfidf_matrix = TfidfVectorizer().fit_transform(combined_descriptions)

    # The first len(mistakes) rows belong to the mistakes, the rest to the bank.
    n_mistakes = len(mistakes)
    similarity_matrix = cosine_similarity(tfidf_matrix[:n_mistakes], tfidf_matrix[n_mistakes:])

    question_labels = [f"Q{pos + 1}" for pos in range(similarity_matrix.shape[1])]
    bank_categories = tps_question_data['Question_Category'].to_numpy()

    # Work by row position throughout so duplicate question ids in `mistakes`
    # or a non-default index on the question bank cannot break the lookup.
    top_similar_questions = {}
    mistake_rows = zip(mistakes['ID_QUESTION'].tolist(), mistakes['Question_Category'].tolist())
    for row_pos, (question_id, category) in enumerate(mistake_rows):
        same_category_positions = np.flatnonzero(bank_categories == category)
        category_similarities = pd.Series(
            similarity_matrix[row_pos, same_category_positions],
            index=[question_labels[pos] for pos in same_category_positions],
            name=question_id,
        )
        top_similar_questions[question_id] = category_similarities.nlargest(top_n)

    return top_similar_questions

In [None]:
# Decide how many quiz questions each category gets, giving more questions to
# the categories with more mistakes.
def determine_proportion(mistakes_per_category, total_questions=10, min_questions_per_category=1, all_categories=None):
    """Allocate ``total_questions`` across categories, weighted by mistakes.

    Every category first receives ``min_questions_per_category``. The questions
    left over are split in proportion to the mistake counts using the
    largest-remainder method, so the allocation never exceeds
    ``total_questions`` (independent rounding of each share could overshoot).

    Args:
        mistakes_per_category: Series of mistake counts indexed by category.
        total_questions: size of the quiz.
        min_questions_per_category: floor applied to every category.
        all_categories: iterable of every category that must appear in the
            result. When omitted, a module-level ``user_data`` frame is used
            if one exists (the historical behaviour of this function),
            otherwise only the categories in ``mistakes_per_category``.

    Returns:
        An integer Series indexed by category with the number of questions to
        draw from each. If the minimums alone already reach or exceed
        ``total_questions`` the minimums are returned unchanged.
    """
    if all_categories is None:
        # Legacy fallback: earlier versions read the notebook-global `user_data`.
        # Prefer passing `all_categories` explicitly to avoid stale global state.
        legacy_user_data = globals().get('user_data')
        if legacy_user_data is not None:
            all_categories = legacy_user_data['Question_Category'].unique()
        else:
            all_categories = mistakes_per_category.index

    # Order-preserving union, so a mistake category missing from
    # `all_categories` is still represented instead of raising KeyError.
    categories = list(dict.fromkeys([*all_categories, *mistakes_per_category.index]))

    # Minimum share for every category.
    proportion = pd.Series(min_questions_per_category, index=categories, dtype='int64')
    if not categories:
        return proportion

    # Questions still to hand out after the minimums.
    remaining_questions = int(total_questions - proportion.sum())
    if remaining_questions <= 0:
        return proportion

    weights = mistakes_per_category.reindex(categories, fill_value=0).astype('int64')
    if weights.sum() == 0:
        # No recorded mistakes: spread the remainder evenly.
        weights[:] = 1
    total_weight = int(weights.sum())

    # Integer arithmetic keeps the shares exact (no float rounding surprises).
    scaled = weights * remaining_questions
    extra = scaled // total_weight
    leftover = remaining_questions - int(extra.sum())
    if leftover > 0:
        remainder = scaled % total_weight
        # Largest remainder first, then most mistakes; sorted() is stable, so
        # remaining ties keep the category order.
        by_priority = sorted(
            categories,
            key=lambda cat: (remainder.loc[cat], weights.loc[cat]),
            reverse=True,
        )
        for cat in by_priority[:leftover]:
            extra.loc[cat] += 1

    return proportion + extra

In [None]:
# Assemble the quiz from the per-category proportion and the most similar questions.
def generate_quiz(proportion, top_similar_questions, mistakes, tps_question_data, total_questions=10):
    """Build the quiz as a list of question records.

    For categories in which the user made mistakes, questions are taken in
    descending similarity order, pooling the candidates of every mistake in
    that category. For the remaining categories in ``proportion``, questions
    are sampled at random.

    Args:
        proportion: Series mapping category -> number of questions wanted.
        top_similar_questions: dict mapping a mistaken ``ID_QUESTION`` to a
            Series of similarity scores indexed by ``"Q<n>"`` labels, where
            ``n`` is the 1-based row position in ``tps_question_data``.
        mistakes: DataFrame of wrongly answered rows with ``ID_QUESTION`` and
            ``Question_Category`` columns.
        tps_question_data: question bank with ``ID`` and ``Question_Category``.
        total_questions: maximum length of the returned quiz.

    Returns:
        A list of at most ``total_questions`` dicts, one per selected row of
        ``tps_question_data``. It may be shorter when there are not enough
        candidates.
    """
    bank_ids = tps_question_data['ID'].tolist()
    bank_categories = tps_question_data['Question_Category'].tolist()

    # Derive the mistake categories from the argument rather than from a
    # notebook-global, so the function works for whichever user is passed in.
    mistake_categories = mistakes.groupby('Question_Category').size().index

    chosen_positions = []
    used_ids = set()

    # Categories with mistakes: pick by similarity.
    for category in mistake_categories:
        questions_needed = int(proportion.get(category, 0))
        if questions_needed <= 0:
            continue

        category_mistake_ids = set(
            mistakes.loc[mistakes['Question_Category'] == category, 'ID_QUESTION'].tolist()
        )

        # Best score seen for each candidate row across all mistakes of the category.
        best_score = {}
        for user_question, similar_questions in top_similar_questions.items():
            if user_question not in category_mistake_ids:
                continue
            for label, score in similar_questions.items():
                position = int(label[1:]) - 1
                if score > best_score.get(position, float('-inf')):
                    best_score[position] = score

        # Highest similarity first; sorted() is stable, so ties keep insertion order.
        ranked_positions = sorted(best_score, key=best_score.get, reverse=True)

        picked = 0
        for position in ranked_positions:
            if picked == questions_needed:
                break
            if bank_categories[position] != category or bank_ids[position] in used_ids:
                continue
            chosen_positions.append(position)
            used_ids.add(bank_ids[position])
            picked += 1

    quiz_questions = tps_question_data.iloc[chosen_positions].to_dict('records')

    # Categories without mistakes: pick at random.
    for category in proportion.index:
        if category in mistake_categories:
            continue
        questions_needed = int(proportion[category])
        in_category = tps_question_data['Question_Category'] == category
        not_used = ~tps_question_data['ID'].isin(list(used_ids))
        pool = tps_question_data[in_category & not_used]
        # Cap at the pool size so a small category does not make sample() raise.
        sample_size = min(questions_needed, len(pool))
        if sample_size > 0:
            quiz_questions.extend(pool.sample(sample_size).to_dict('records'))

    # Never return more than the requested quiz size.
    return quiz_questions[:total_questions]

In [None]:
# Check whether the user is new.
def is_new_user(user_id, user_history):
    """Return True when ``user_history`` has no row whose ``ID_USER`` equals ``user_id``."""
    has_history = (user_history['ID_USER'] == user_id).any()
    return not has_history

# Generate questions for a new user, spread evenly across categories.
def generate_questions_for_new_user(tps_question_data, total_questions=10):
    """Sample a quiz with an equal share of questions from every category.

    Each category contributes ``total_questions // n_categories`` random
    questions, or all of its questions if it has fewer. Any shortfall is
    topped up with random questions that were not picked yet.

    Args:
        tps_question_data: question bank with ``ID`` and ``Question_Category``.
        total_questions: desired quiz length.

    Returns:
        A list of question records (dicts). It is shorter than
        ``total_questions`` only when the bank does not hold enough distinct
        questions, and empty when the bank is empty or ``total_questions`` is
        not positive.
    """
    categories = tps_question_data['Question_Category'].unique()
    # Guard against division by zero on an empty bank and negative sample sizes.
    if len(categories) == 0 or total_questions <= 0:
        return []

    questions_per_category = total_questions // len(categories)

    quiz_questions = []
    for category in categories:
        pool = tps_question_data[tps_question_data['Question_Category'] == category]
        # sample() raises if asked for more rows than exist, so cap at the pool size.
        sample_size = min(questions_per_category, len(pool))
        quiz_questions.extend(pool.sample(sample_size).to_dict('records'))

    # Top up when the even split (or a small category) left the quiz short.
    remaining_questions = total_questions - len(quiz_questions)
    if remaining_questions > 0:
        chosen_ids = [question['ID'] for question in quiz_questions]
        leftovers = tps_question_data[~tps_question_data['ID'].isin(chosen_ids)]
        extra_size = min(remaining_questions, len(leftovers))
        if extra_size > 0:
            quiz_questions.extend(leftovers.sample(extra_size).to_dict('records'))

    return quiz_questions

In [None]:
# Recommend questions to a user.
def recommend_questions_for_user(user_id, user_history, tps_question_data, total_questions=10):
    """Build and print a quiz tailored to ``user_id``.

    A user without any history, or with history but no wrong answers, gets an
    even spread of questions across categories. Otherwise the quiz is weighted
    toward the categories where the user made mistakes and filled with the
    questions most similar to those mistakes.

    Args:
        user_id: value matched against the ``ID_USER`` column of ``user_history``.
        user_history: answer log with ``ID_USER`` plus the columns needed by
            ``get_mistakes_per_category`` and ``calculate_similarity``.
        tps_question_data: question bank.
        total_questions: desired quiz length.

    Returns:
        None. The proportion and the selected questions are printed.
    """
    if is_new_user(user_id, user_history):
        print("User baru, generate soal secara merata per kategori.")
        quiz_questions = generate_questions_for_new_user(tps_question_data, total_questions)

    else:
        user_data = user_history[user_history['ID_USER'] == user_id]

        mistakes_per_category, mistakes = get_mistakes_per_category(user_data)

        if mistakes.empty:
            # Without mistakes there is nothing to compute similarity against
            # (the similarity step cannot handle zero rows), so fall back to
            # the same even spread used for new users.
            print("User belum memiliki jawaban salah, generate soal secara merata per kategori.")
            quiz_questions = generate_questions_for_new_user(tps_question_data, total_questions)
        else:
            top_similar_questions = calculate_similarity(mistakes, tps_question_data)

            # NOTE(review): the user's own category list is not passed here;
            # confirm determine_proportion does not rely on a module-level
            # `user_data` that may belong to a different user.
            proportion = determine_proportion(mistakes_per_category, total_questions)
            print("\nProporsi soal yang akan ditampilkan untuk setiap kategori:")
            print(proportion)

            quiz_questions = generate_quiz(proportion, top_similar_questions, mistakes, tps_question_data, total_questions)

    print("\nSoal yang akan ditampilkan dalam kuis:")
    for question in quiz_questions:
        print(question)

In [None]:
# Choose the user to build a quiz for.
user_id = 10

# Sanity check: number of questions this user has answered in each category.
# `user_data`, `mistakes` and `mistakes_per_category` stay module-level names
# because functions defined in other cells may read them.
user_data = user_history.loc[user_history['ID_USER'] == user_id]
total_questions_per_category = get_total_questions_per_category(user_data)
print("Jumlah soal untuk setiap kategori yang telah dijawab oleh user:")
print(total_questions_per_category)

# Sanity check: number of questions this user answered incorrectly in each category.
mistakes_per_category, mistakes = get_mistakes_per_category(user_data)
print("\nJumlah soal yang salah dijawab oleh user untuk setiap kategori:")
print(mistakes_per_category)

recommend_questions_for_user(user_id, user_history, tps_question_data)

Jumlah soal untuk setiap kategori yang telah dijawab oleh user:
Question_Category
PBM    3
PK     1
PPU    2
PU     4
dtype: int64

Jumlah soal yang salah dijawab oleh user untuk setiap kategori:
Question_Category
PBM    2
PPU    2
PU     2
dtype: int64

Proporsi soal yang akan ditampilkan untuk setiap kategori:
PU     3
PBM    3
PPU    3
PK     1
dtype: int64

Soal yang akan ditampilkan dalam kuis:
{'ID': 200, 'Question_Category': 'PBM', 'Questions_Descriptions': '(1) Di tengah masa pendaftararan pasangan capres dan cawapres untuk pilpres 2019, 4-10 Agustus 2018, masyarakat diributkan dengan beberapa capaian ekonomi pemerintahan Presiden Jokowi yang dianggap tidak sesuai ekspektasi atau janji yang telah disampaikannya di masa kampanye pilpres 2014 lalu. (2) Salah satu indikator ekonomi yang sering kali dibahas dan diributkan oleh pihak oposisi adalah pertumbuhan ekonomi yang jauh dari target awal, yaitu 7 persen atau meleset sedikit dari target Anggaran Pendapatan Belanja Negara (APBN