In [1]:
import pandas as pd
import scipy as sp
from scipy.stats import multinomial
import os
import numpy as np
import math

In [2]:
def entropy(c):
    """Return the Shannon entropy, in bits, of the probabilities in ``c``.

    Parameters
    ----------
    c : sized iterable of float
        Probability of each outcome. Entries equal to 0 are allowed and
        contribute nothing to the result.

    Returns
    -------
    float or int
        ``-sum(p * log2(p))`` over the non-zero entries of ``c``.
        ``-1`` is returned as a sentinel when ``c`` is empty.

    Raises
    ------
    ValueError
        If an entry is negative (``math.log`` rejects it).
    """
    if len(c) == 0:
        return -1
    result = 0
    for x in c:
        # 0 * log(0) is taken as 0 (its limit); math.log(0, 2) would raise.
        if x == 0:
            continue
        result += (-x) * math.log(x, 2)
    return result

In [4]:
# Load the training data.
df = pd.read_csv('../train_data/train_task_3_4.csv')
# Load the side-information tables (answer, question and student metadata),
# in that order.
answer_meta_data, question_meta_data, student_meta_data = (
    pd.read_csv(metadata_csv)
    for metadata_csv in (
        '../metadata/answer_metadata_task_3_4.csv',
        '../metadata/question_metadata_task_3_4.csv',
        '../metadata/student_metadata_task_3_4.csv',
    )
)

In [5]:
# Read the submission template; the cells below add one feature column per
# metric to this frame.
submission_template_csv = '../starter_kit/submission_templates/submission_task_3.csv'
submission_file = pd.read_csv(submission_template_csv)

### calculate the entropy of choice

In [8]:
# Entropy of the distribution of selected answers (AnswerValue) per question.
# float(...) makes every group aggregate a plain scalar, whatever scalar-like
# object SciPy returns.
choice_entropy = df.groupby('QuestionId')['AnswerValue'].agg(
    lambda x: float(multinomial.entropy(1, x.value_counts(normalize=True)))
)
# Look the entropy up through each row's QuestionId. Assigning the Series
# directly would align it on submission_file's row index, which is only
# correct when row i happens to hold QuestionId i.
submission_file['entropy_choice'] = submission_file['QuestionId'].map(choice_entropy)
# Standardise the feature (np.std is called with its default ddof).
submission_file['z_entropy_choice'] = (
    submission_file['entropy_choice'] - np.mean(submission_file['entropy_choice'])
) / np.std(submission_file['entropy_choice'])

### calculate the question confidence

In [7]:
# A question needs at least this many answers with a recorded confidence for
# its own average to be used; other questions get the global fallback below.
MIN_CONFIDENCE_ANSWERS = 20

# Attach the answer metadata (which carries Confidence) to every training answer.
new_df = df.merge(answer_meta_data, on='AnswerId', how='left')
notnull_confidence = new_df[new_df['Confidence'].notnull()]

# Number of answers with a recorded confidence, per question.
que_num = notnull_confidence.groupby('QuestionId').size()
num_confid = que_num.to_dict()
valid_que = [
    que_id for que_id, n_answers in num_confid.items()
    if n_answers >= MIN_CONFIDENCE_ANSWERS
]
notnull_confidence = notnull_confidence[notnull_confidence['QuestionId'].isin(valid_que)]

# Mean confidence of every retained question, computed in one grouped pass
# instead of re-filtering the whole frame once per question.
# sort=False keeps the questions in order of first appearance.
que_avg_confid = (
    notnull_confidence.groupby('QuestionId', sort=False)['Confidence'].mean().to_dict()
)
all_que_confid = list(que_avg_confid.values())

# Questions without a reliable average fall back to the mean of the
# per-question averages.
fallback_confidence = np.mean(all_que_confid)
submission_file['confidence'] = (
    submission_file['QuestionId'].map(que_avg_confid).fillna(fallback_confidence)
)
submission_file['z_confidence'] = (
    submission_file['confidence'] - np.mean(submission_file['confidence'])
) / np.std(submission_file['confidence'])

### calculate the entropy of right/wrong answers

In [9]:
# Entropy of the right/wrong (IsCorrect) distribution per question.
# float(...) makes every group aggregate a plain scalar, whatever scalar-like
# object SciPy returns.
right_entropy = df.groupby('QuestionId')['IsCorrect'].agg(
    lambda x: float(multinomial.entropy(1, x.value_counts(normalize=True)))
)
# Look the entropy up through each row's QuestionId. Assigning the Series
# directly would align it on submission_file's row index, which is only
# correct when row i happens to hold QuestionId i.
submission_file['right_entropy'] = submission_file['QuestionId'].map(right_entropy)
# Standardise the feature (np.std is called with its default ddof).
submission_file['z_entropy_right'] = (
    submission_file['right_entropy'] - np.mean(submission_file['right_entropy'])
) / np.std(submission_file['right_entropy'])

### calculate the entropy conditioned on group
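$H(\text{right/wrong} \mid \text{group})$: the entropy of `IsCorrect` within each `GroupId`, weighted by that group's share of the question's answers.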

In [11]:
def get_one_condition_entropy(one_df, group_col='GroupId'):
    """Conditional entropy of ``IsCorrect`` given ``group_col`` for one frame.

    For every distinct value of ``group_col`` the entropy of the ``IsCorrect``
    distribution inside that group is computed, and the entropies are summed
    with each group weighted by its share of the rows of ``one_df``.

    Parameters
    ----------
    one_df : pandas.DataFrame
        Must contain the columns ``IsCorrect`` and ``group_col``.
    group_col : str, default 'GroupId'
        Column to condition on.

    Returns
    -------
    float or int
        The weighted sum of per-group entropies; ``0`` for an empty frame.

    Notes
    -----
    The weights are divided by the total row count of ``one_df``. Rows that
    pandas' ``groupby`` leaves out of every group (missing keys, by default)
    therefore still count in the denominator, as they did when the weight was
    computed from ``one_df.shape[0]`` directly.
    """
    total_rows = one_df.shape[0]
    if total_rows == 0:
        return 0
    correct_by_group = one_df.groupby(group_col)['IsCorrect']
    # float(...) guarantees each group aggregates to a plain scalar.
    group_entropy = correct_by_group.agg(
        lambda x: float(multinomial.entropy(1, x.value_counts(normalize=True)))
    )
    # One grouped pass gives all group sizes; no per-group re-filtering needed.
    group_weight = correct_by_group.size() / total_rows
    return (group_entropy * group_weight).sum()

In [12]:
# Compute the conditional entropy of every question that occurs in the
# training data, keyed by QuestionId.
cond_right_group_entropy = {
    que_id: get_one_condition_entropy(new_df[new_df['QuestionId'] == que_id])
    for que_id in new_df.QuestionId.unique()
}
# Attach the score to each submission row through its QuestionId.
submission_file['cond_entropy_group'] = submission_file['QuestionId'].apply(
    lambda que_id: cond_right_group_entropy[que_id]
)
# Standardise with the Series' own mean()/std().
group_entropy_mean = submission_file['cond_entropy_group'].mean()
group_entropy_std = submission_file['cond_entropy_group'].std()
submission_file['z_cond_entropy'] = (
    submission_file['cond_entropy_group'] - group_entropy_mean
) / group_entropy_std

### calculate the entropy conditioned on quiz

In [13]:
def get_one_quiz_condition_entropy(one_df):
    """Conditional entropy of ``IsCorrect`` given ``QuizId`` for one frame.

    For every distinct ``QuizId`` the entropy of the ``IsCorrect`` distribution
    inside that quiz is computed, and the entropies are summed with each quiz
    weighted by its share of the rows of ``one_df``.

    Parameters
    ----------
    one_df : pandas.DataFrame
        Must contain the columns ``QuizId`` and ``IsCorrect``.

    Returns
    -------
    float or int
        The weighted sum of per-quiz entropies; ``0`` for an empty frame.

    Notes
    -----
    The weights are divided by the total row count of ``one_df``. Rows that
    pandas' ``groupby`` leaves out of every group (missing keys, by default)
    therefore still count in the denominator, as they did when the weight was
    computed from ``one_df.shape[0]`` directly.
    """
    total_rows = one_df.shape[0]
    if total_rows == 0:
        return 0
    correct_by_quiz = one_df.groupby('QuizId')['IsCorrect']
    # float(...) guarantees each quiz aggregates to a plain scalar.
    quiz_entropy = correct_by_quiz.agg(
        lambda x: float(multinomial.entropy(1, x.value_counts(normalize=True)))
    )
    # One grouped pass gives all quiz sizes; no per-quiz re-filtering needed.
    quiz_weight = correct_by_quiz.size() / total_rows
    return (quiz_entropy * quiz_weight).sum()
# Compute the quiz-conditional entropy of every question that occurs in the
# training data, keyed by QuestionId.
cond_right_quiz_entropy = {
    que_id: get_one_quiz_condition_entropy(new_df[new_df['QuestionId'] == que_id])
    for que_id in new_df.QuestionId.unique()
}
# Attach the score to each submission row through its QuestionId.
submission_file['cond_entropy_quiz'] = submission_file['QuestionId'].apply(
    lambda que_id: cond_right_quiz_entropy[que_id]
)
# Standardise with the Series' own mean()/std().
quiz_entropy_mean = submission_file['cond_entropy_quiz'].mean()
quiz_entropy_std = submission_file['cond_entropy_quiz'].std()
submission_file['z_cond_quiz_entropy'] = (
    submission_file['cond_entropy_quiz'] - quiz_entropy_mean
) / quiz_entropy_std

### final ranking

In [15]:
# Weights of the standardised features in the final score:
#   float0  -> z_cond_entropy       (entropy conditioned on group)
#   float_1 -> z_cond_quiz_entropy  (entropy conditioned on quiz)
#   float_2 -> z_entropy_right      (right/wrong entropy)
# z_entropy_choice enters with weight 1 and z_confidence is subtracted.
float0 = 0.7
float_1 = 0.1
float_2 = 1
submission_file['final_score'] = (
    submission_file['z_entropy_choice']
    + float0 * submission_file['z_cond_entropy']
    + float_1 * submission_file['z_cond_quiz_entropy']
    + float_2 * submission_file['z_entropy_right']
    - submission_file['z_confidence']
)

In [None]:
# Rank 1 goes to the highest final_score; method='first' breaks ties by the
# order in which rows appear.
score_rank = submission_file['final_score'].rank(ascending=False, method='first')
ranking = score_rank.astype('int16')
submission_file['ranking'] = ranking

# Write the (QuestionId, ranking) pairs in template order.
report_csv = '../submissions/final_report.csv'
submission_file.loc[:, ['QuestionId', 'ranking']].to_csv(report_csv, index=False)

# Read the report back and write a second copy ordered from rank 1 upwards.
first_try = pd.read_csv(report_csv)
first_try_zip = first_try.sort_values(by="ranking")
sorted_report_csv = '../submissions/submission_task_3_report.csv'
first_try_zip.to_csv(sorted_report_csv, index=False)