# 第一部分模型

#### 建模思路：将成绩预测问题视为回归问题进行分析。

##### 通过EDA发现大部分同学的成绩是相对稳定的，而影响其成绩变化的主要的原因是由于试卷分布的不同造成的，即不同试卷考了不同的内容导致同学的成绩发生变化，通过降维的手段对每一次考试的高维稀疏知识点进行降维处理。

#### 特征工程：使用全量的考试成绩通过统计的方式对每一位学生进行“成绩画像”的构建，如：score_mean、score_cv、score_std、score_peak、score_pluse等（score_peak：峰值因子，score_pluse:脉冲值，表示波形是否存在冲击指标，可以理解成学生是否具有偏离正常发挥的高分冲击指标现象）

#### 降维思路：将8门课视作1门课，每门课的每次考试只是在于考试点不同。将8门课的所有考试点concat，构成一门大的科目，（可以理解成这是一门大的《通识》课，包含了各个科目的各个考试点），然后利用NMF降维算法对这门《通识》课进行降维处理。
##### NMF（非负矩阵分解）可以将一个非负矩阵分解成 $X \approx WH$ 的形式，这种基于基向量组合的表示形式具有很直观的语义解释，它反映了人类思维中“局部构成整体”的概念。

## 1.1 **特征工程**

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
pd.set_option('display.max_columns',100)
pd.set_option('display.max_rows',100)

#### 数据读取

In [None]:
# Root directory of the competition data set.
path = '/home/kesci/input/smart_edu7557/'

# Course metadata and the knowledge-point catalogue.
course = pd.read_csv(path + 'course.csv')
all_knowledge = pd.read_csv(path + 'all_knowledge.csv')

# One exam table per course, course1 ... course8, read in that order.
(course1_exam, course2_exam, course3_exam, course4_exam,
 course5_exam, course6_exam, course7_exam, course8_exam) = (
    pd.read_csv(path + 'course' + str(k) + '_exams.csv') for k in range(1, 9)
)

# Score records, student attributes and the submission template.
exam_score = pd.read_csv(path + 'exam_score.csv')
student_df = pd.read_csv(path + 'student.csv')
test_data = pd.read_csv(path + 'submission_s2.csv')

#### 提取每次考试的总体复杂度 （complexity） 以及各复杂度占比（complexity_i_ratio）特征

#### 对每一次考试进行时序化处理

In [None]:
# Shift every complexity level up by one (updates all_knowledge itself).
all_knowledge['complexity'] = 1 + all_knowledge['complexity']


def func_1(data,index):
    """Return the complexity-weighted sum of one exam row.

    data:  one row of a courseN exam frame.  Element 0 is skipped
           (presumably exam_id -- confirm); the remaining values are
           multiplied element-wise with the course's complexity levels.
    index: course number as a string, e.g. '1' selects 'course1'.
    """
    belongs_to_course = all_knowledge['course'] == 'course' + index
    weights = all_knowledge.loc[belongs_to_course, 'complexity'].values
    return np.sum(data.values[1:] * weights)


# Overall complexity of every exam, course by course.
for _course_no, _exam_frame in (('1', course1_exam), ('2', course2_exam),
                                ('3', course3_exam), ('4', course4_exam),
                                ('5', course5_exam), ('6', course6_exam),
                                ('7', course7_exam), ('8', course8_exam)):
    _exam_frame['complexity'] = _exam_frame.apply(func_1, axis=1, args=(_course_no,))

def func_2(data,index,complexity_):
    """Return the share (sum / 100) of one exam row that falls on one complexity level.

    data:        one row of a courseN exam frame; element 0 is skipped and the
                 knowledge-point values are expected to follow it, in the same
                 order as the course's rows in all_knowledge
                 (assumption -- confirm against the CSV layout).
    index:       course number as a string, e.g. '1' selects 'course1'.
    complexity_: complexity level (after the +1 shift) to measure.

    The knowledge-point columns are located by count (one per row of the
    course's knowledge table) instead of by trimming a caller-dependent number
    of trailing columns, so the result does not depend on any module-level
    loop variable or on how many feature columns were already appended.
    """
    course_knowledge = all_knowledge.loc[all_knowledge['course']=='course'+index]
    # 1 where the knowledge point has the requested complexity, else 0.
    has_level = (course_knowledge['complexity']==complexity_).values.astype(int)
    knowledge_values = data.values[1:1 + len(has_level)]
    return np.sum(knowledge_values * has_level)/100

# Add complexity1_ratio ... complexity5_ratio to every course's exam frame.
# The outer loop variable is deliberately the module-level name `i`; keep that
# name in case helper code consults it while the loop is running.
_ratio_targets = (('1', course1_exam), ('2', course2_exam),
                  ('3', course3_exam), ('4', course4_exam),
                  ('5', course5_exam), ('6', course6_exam),
                  ('7', course7_exam), ('8', course8_exam))
for i in range(1,6):
    _ratio_col = 'complexity'+str(i)+'_ratio'
    for _course_no, _exam_frame in _ratio_targets:
        _exam_frame[_ratio_col] = _exam_frame.apply(func_2, axis=1, args=(_course_no, i))

def _exam_order(frame):
    """Map each exam_id to its 1-based position of first appearance in *frame*."""
    exam_ids = frame['exam_id'].unique()
    return dict(zip(exam_ids, range(1, len(exam_ids) + 1)))


# exam_id -> sequence number, one dictionary per course.
dict_course1 = _exam_order(course1_exam)
dict_course2 = _exam_order(course2_exam)
dict_course3 = _exam_order(course3_exam)
dict_course4 = _exam_order(course4_exam)
dict_course5 = _exam_order(course5_exam)
dict_course6 = _exam_order(course6_exam)
dict_course7 = _exam_order(course7_exam)
dict_course8 = _exam_order(course8_exam)

# Attach the sequence number and the course label to every exam frame.
for _label, _exam_frame, _order in (('course1', course1_exam, dict_course1),
                                    ('course2', course2_exam, dict_course2),
                                    ('course3', course3_exam, dict_course3),
                                    ('course4', course4_exam, dict_course4),
                                    ('course5', course5_exam, dict_course5),
                                    ('course6', course6_exam, dict_course6),
                                    ('course7', course7_exam, dict_course7),
                                    ('course8', course8_exam, dict_course8)):
    _exam_frame['number_exam'] = _exam_frame['exam_id'].apply(_order.__getitem__)
    _exam_frame['course'] = _label

# Per-exam feature columns shared by all eight course frames.
exam_feature = ['exam_id','course','number_exam','complexity','complexity1_ratio',
       'complexity2_ratio', 'complexity3_ratio','complexity4_ratio','complexity5_ratio']
course_all_info = pd.concat(
    [frame[exam_feature] for frame in (course1_exam, course2_exam, course3_exam, course4_exam,
                                       course5_exam, course6_exam, course7_exam, course8_exam)],
    ignore_index=True)
course_all_info = course_all_info.merge(course, on = 'course')
# Largest sequence number per course.  Written as max() + rename because the
# dict-renaming form of SeriesGroupBy.agg raises on pandas >= 1.0.
course_max_df = (course_all_info.groupby('course')['number_exam'].max()
                 .rename('max_number_exam').reset_index())
course_all_info = course_all_info.merge(course_max_df,on = 'course')
# Reverse numbering: the last exam listed for a course gets 1, the first gets max.
course_all_info['invert_number_exam'] = course_all_info['max_number_exam'] + 1 - course_all_info['number_exam']

#### 处理数据中考了0分的同学
##### 规则：去掉每个同学每门学科的一个最高分和一个最低分（如果该同学的最低分为0分，则删除其所有的0分考试，不再另外删除最低分）

In [None]:
# Trim the training scores of every (student, course) pair:
#   * if the lowest score is 0, drop every 0-score exam;
#   * otherwise drop one occurrence of the lowest score;
#   * then drop one occurrence of the highest remaining score.
# Pairs with no training rows, or with nothing left after the first step, are
# skipped instead of raising IndexError.
_train_rows = exam_score.loc[exam_score['istrain']==True]
_rows_by_pair = {pair: rows for pair, rows
                 in _train_rows.groupby(['student_id','course'], sort=False)}
_course_names = exam_score['course'].unique()

_trimmed_parts = []
for student in exam_score['student_id'].unique():
    # `course_name`, not `course`: the latter is the course metadata DataFrame.
    for course_name in _course_names:
        tmp_df = _rows_by_pair.get((student, course_name))
        if tmp_df is None:
            continue
        # Trim the low end.
        min_ = tmp_df['score'].min()
        if min_ == 0:
            tmp_df = tmp_df.loc[tmp_df['score']!=0]
        else:
            tmp_df = tmp_df.drop(index = tmp_df['score'].idxmin())
        if tmp_df.empty:
            continue
        # Trim the high end.
        tmp_df = tmp_df.drop(index = tmp_df['score'].idxmax())
        _trimmed_parts.append(tmp_df)

# A single concat replaces the quadratic DataFrame.append-in-a-loop pattern
# (DataFrame.append no longer exists in pandas 2.0).
data_all = pd.concat(_trimmed_parts) if _trimmed_parts else pd.DataFrame()


#### 特征提取
##### 从 8门小科目 和 1门大科目 两个角度抽取描述学生成绩的特征

In [None]:
# Score statistics of the trimmed data, per (student, course) and per student.
# The statistics are requested by name and renamed afterwards; the
# dict-renaming form of SeriesGroupBy.agg raises on pandas >= 1.0.
_stat_names = ['mean', 'median', 'std', 'max', 'min', 'nunique']
student_course_score = (
    data_all.groupby(by=['student_id','course'])['score'].agg(_stat_names)
    .rename(columns={'mean':'score_mean', 'median':'score_median', 'std':'score_std',
                     'max':'max_score', 'min':'min_score', 'nunique':'score_nunique'})
    .reset_index())
student_score = (
    data_all.groupby(by=['student_id'])['score'].agg(_stat_names)
    .rename(columns={'mean':'s_mean_score', 'median':'s_median_score', 'std':'s_std_score',
                     'max':'s_max_score', 'min':'s_min_score', 'nunique':'s_score_nunique'})
    .reset_index())
# Coefficient of variation (std / mean).
student_course_score['score_cv'] = student_course_score['score_std']/student_course_score['score_mean']
student_score['s_cv_score'] = student_score['s_std_score']/student_score['s_mean_score']
# Range (max - min).
student_course_score['score_range'] = student_course_score['max_score']-student_course_score['min_score']
student_score['s_range_score'] = student_score['s_max_score']-student_score['s_min_score']

def l2_norm(a,b):
    """Return sqrt(a**2 + b**2), the Euclidean length of the vector (a, b)."""
    sum_of_squares = a ** 2 + b ** 2
    return math.sqrt(sum_of_squares)

# Shape descriptors per (student, course):
#   rms = sqrt(mean^2 + std^2), wave = rms / mean, peak = max / rms, pluse = max / mean
student_course_score['rms'] = student_course_score.apply(
    lambda row: l2_norm(row['score_mean'], row['score_std']), axis=1)
student_course_score['wave'] = student_course_score['rms'] / student_course_score['score_mean']
student_course_score['peak'] = student_course_score['max_score'] / student_course_score['rms']
student_course_score['pluse'] = student_course_score['max_score'] / student_course_score['score_mean']

# The same descriptors per student, over all courses.
student_score['s_rms'] = student_score.apply(
    lambda row: l2_norm(row['s_mean_score'], row['s_std_score']), axis=1)
student_score['s_wave'] = student_score['s_rms'] / student_score['s_mean_score']
student_score['s_peak'] = student_score['s_max_score'] / student_score['s_rms']
student_score['s_pluse'] = student_score['s_max_score'] / student_score['s_mean_score']

# Attach both statistic tables to every score record (left joins keep all records).
df_all = (exam_score
          .merge(student_course_score, on=['student_id','course'], how='left')
          .merge(student_score, on=['student_id'], how='left'))

In [None]:
#优/差比率特征
# Number of trimmed exams above 95 / below 60 per (student, course).  Despite the
# "_ratio" suffix these are counts.  Computed by summing 0/1 flags because the
# dict-renaming form of SeriesGroupBy.agg raises on pandas >= 1.0.
_score_flags = pd.DataFrame({
    'student_id': data_all['student_id'],
    'course': data_all['course'],
    'good_ratio': (data_all['score'] > 95).astype(int),
    'bad_ratio': (data_all['score'] < 60).astype(int),
})
tmp = _score_flags.groupby(['student_id','course'], as_index=False)[['good_ratio','bad_ratio']].sum()
# Inner join: records of (student, course) pairs absent from the trimmed data are dropped.
df_all = df_all.merge(tmp,on =['student_id','course'])
# Mean trimmed score per (gender, course).
data_all = data_all.merge(student_df,on='student_id')
gender_course_mean = (data_all.groupby(['gender','course'])['score'].mean()
                      .rename('gender_course_mean').reset_index())
df_all = df_all.merge(student_df,on='student_id')
df_all = df_all.merge(gender_course_mean,on=['gender','course'])

#### 知识点所属的段落、类目关于知识点的映射

In [None]:
def get_category_section(course_exam_,course,col='category'):
    """Aggregate an exam table from knowledge points up to `col` groups.

    course_exam_: exam table of one course with an 'exam_id' column; every other
                  column must be a knowledge point listed in all_knowledge
                  (a KeyError is raised otherwise).  The input is not modified.
    course:       course name as stored in the 'course' columns, e.g. 'course1'.
    col:          all_knowledge column that defines the groups
                  ('category' or 'section').

    Returns a new frame with 'exam_id' plus one column per group holding the sum
    of that group's knowledge-point columns.  Only exams that appear for this
    course in exam_score or test_data are kept; columns whose sum is 0 and
    columns whose name starts with 'K' are removed.
    """
    course_exam = course_exam_.set_index(keys ='exam_id')
    course_knowledge = all_knowledge.loc[all_knowledge['course']==course]
    k_cat_dict = dict(zip(course_knowledge['knowledge_point'].values,course_knowledge[col].values))
    knowledge_cols = list(course_exam.columns)
    col_groups = [k_cat_dict[k] for k in knowledge_cols]
    for cat in course_knowledge[col].unique():
        members = [k for k, group in zip(knowledge_cols, col_groups) if group == cat]
        # skipna=False keeps NaN propagation identical to a plain element-wise sum.
        course_exam[cat] = course_exam[members].sum(axis=1, skipna=False).astype(float).values
    test_exam_list = test_data.loc[test_data['course']==course]['exam_id'].unique()
    exams_list = np.hstack((exam_score.loc[exam_score['course']==course]['exam_id'].unique(),test_exam_list))
    # An exam present in both tables must be selected once only, otherwise later
    # merges on exam_id would duplicate rows.
    exams_list = pd.unique(exams_list)
    valid_course_exam = course_exam.loc[exams_list]
    # Build the list of survivors instead of dropping in place while iterating;
    # this also avoids re-using the name of the `col` parameter.
    kept_columns = [name for name in valid_course_exam.columns
                    if not (valid_course_exam[name].sum() == 0 or name[0]=='K')]
    valid_course_exam = valid_course_exam[kept_columns].reset_index(drop= False)
    return valid_course_exam

#### 知识点、知识点所属的段落、类目 进行NMF降维处理

In [None]:
# Reload the raw exam tables: the frames loaded earlier have gained extra
# feature columns that must not enter the decomposition.
course1_exam = pd.read_csv('/home/kesci/input/smart_edu7557/course1_exams.csv')
course2_exam = pd.read_csv('/home/kesci/input/smart_edu7557/course2_exams.csv')
course3_exam = pd.read_csv('/home/kesci/input/smart_edu7557/course3_exams.csv')
course4_exam = pd.read_csv('/home/kesci/input/smart_edu7557/course4_exams.csv')
course5_exam = pd.read_csv('/home/kesci/input/smart_edu7557/course5_exams.csv')
course6_exam = pd.read_csv('/home/kesci/input/smart_edu7557/course6_exams.csv')
course7_exam = pd.read_csv('/home/kesci/input/smart_edu7557/course7_exams.csv')
course8_exam = pd.read_csv('/home/kesci/input/smart_edu7557/course8_exams.csv')
import copy
courses_exam = [course1_exam,course2_exam,course3_exam,course4_exam,course5_exam,course6_exam,course7_exam,course8_exam]
course_exam_section = copy.deepcopy(courses_exam)
course_exam_category = copy.deepcopy(courses_exam)

from sklearn.decomposition import NMF


def _suffix_columns(frame, course_no):
    """Append '_c<course_no>' to every column except 'exam_id' (in place); return frame."""
    frame.columns = [name if name == 'exam_id' else str(name) + '_c' + str(course_no)
                     for name in frame.columns]
    return frame


def _nmf_exam_features(frames, n_components, prefix):
    """Stack the per-course tables, fill gaps with 0 and reduce them with NMF.

    Returns a frame with columns <prefix>0 ... <prefix>{n_components-1} and 'exam_id'.
    """
    stacked = pd.concat(frames, axis = 0)
    stacked.fillna(0, inplace = True)
    model = NMF(n_components=n_components, init='random', random_state=2019)
    decomposed = model.fit_transform(stacked.drop('exam_id', axis = 1))
    features = pd.DataFrame(decomposed, columns = [prefix + str(k) for k in range(n_components)])
    features['exam_id'] = stacked['exam_id'].values
    return features


#knowledge_point
for i,course_exam in enumerate(courses_exam):
    _suffix_columns(course_exam, i + 1)
decom_knowledge = _nmf_exam_features(courses_exam, 100, "Decompose_k")
df_all=df_all.merge(decom_knowledge, on=['exam_id'], how='left')

#knowledge_section
# The aggregated table must be stored back into the list.  Binding it only to
# the loop variable would discard it and the NMF would be fitted on the raw
# knowledge-point tables.  NOTE(review): this changes the Decompose_s* /
# Decompose_c* values compared with runs that used the raw tables.
course_exam_section = [
    _suffix_columns(get_category_section(course_exam, 'course'+str(i+1), 'section'), i + 1)
    for i, course_exam in enumerate(course_exam_section)
]
decom_knowledge = _nmf_exam_features(course_exam_section, 15, "Decompose_s")
df_all=df_all.merge(decom_knowledge, on=['exam_id'], how='left')

#knowledge_category
course_exam_category = [
    _suffix_columns(get_category_section(course_exam, 'course'+str(i+1), 'category'), i + 1)
    for i, course_exam in enumerate(course_exam_category)
]
decom_knowledge = _nmf_exam_features(course_exam_category, 10, "Decompose_c")
df_all=df_all.merge(decom_knowledge, on=['exam_id'], how='left')

#### 划分数据集，特征选择

In [None]:
# Keep only records whose score is not exactly 0.  .copy() gives an independent
# frame, so the dtype casts below do not write onto a slice of df_all.
data_all = df_all.loc[df_all['score']!=0].copy()
# NOTE(review): the 'complexity*' and 'invert_number_exam' columns come from
# course_all_info, and 'istrain' rows with value False are needed for the test
# set, but no step that brings either into df_all is visible in this notebook
# -- confirm that exam_score was prepared accordingly before this cell runs.
number_feature_list = (
    [
       'complexity', 'complexity1_ratio', 'complexity2_ratio',
       'complexity3_ratio', 'complexity4_ratio','complexity5_ratio',
       'score_mean', 'score_median', 'score_std','max_score', 'min_score',
       'score_nunique', 'score_cv','rms',  'peak', 'pluse',
       's_std_score', 's_max_score', 's_min_score',
       's_score_nunique', 's_cv_score',
       's_rms',  's_peak', 's_pluse',
       'gender_course_mean','invert_number_exam',
    ]
    # NMF components: 100 knowledge-point, 15 section and 10 category features.
    + ['Decompose_k' + str(k) for k in range(100)]
    + ['Decompose_s' + str(k) for k in range(15)]
    + ['Decompose_c' + str(k) for k in range(10)]
)
cat_feature_list= ['student_id', 'course', #'course_class',
                  ]

for cat in cat_feature_list:
    data_all[cat] = data_all[cat].astype('category')

# Independent train / test frames; the train index is reset to 0..n-1.
train_set = data_all.loc[data_all['istrain']==True].reset_index(drop=True)
test_set = data_all.loc[data_all['istrain']==False].copy()

X_train_chuli = train_set[number_feature_list+cat_feature_list]
y_train_chuli = train_set['score']
test_X = test_set[number_feature_list+cat_feature_list]

## 1.2 **建模**

In [None]:
import lightgbm as lgb
from sklearn import metrics
from sklearn.model_selection import KFold,StratifiedKFold

#### 5折交叉验证建模/验证

In [None]:
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
test_Y = np.zeros(test_X.shape[0])   # test prediction, averaged over the folds
oof = np.zeros(X_train_chuli.shape[0])   # out-of-fold prediction for every training row

# The hyper-parameters are the same for every fold, so they are built once.
param = {
    'max_depth': -1,
    'num_leaves':int(31),
    'min_data_in_leaf': int(40),
    'objective':'mae',
    'learning_rate': 0.1,
    "boosting": "gbdt",
    'bagging_fraction':0.9,
    'feature_fraction':0.9,
    'bagging_freq':1,
    'bagging_seed':2019,
    "metric": 'rmse',
    "verbosity": 500,
    'lambda_l1':1,
    'lambda_l2': 2
}

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_chuli, y_train_chuli)):
    print("fold n°{}".format(fold_))
    # KFold yields positions, hence .iloc (does not depend on the index labels).
    trn_data = lgb.Dataset(X_train_chuli.iloc[trn_idx],
                           label=y_train_chuli.iloc[trn_idx])
    val_data = lgb.Dataset(X_train_chuli.iloc[val_idx],
                           label=y_train_chuli.iloc[val_idx])

    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments of
    # older LightGBM releases; LightGBM 4.x expects callbacks instead.
    gbm = lgb.train(param,
                    trn_data,
                    200000,
                    valid_sets = [trn_data, val_data],
                    verbose_eval=500,
                    early_stopping_rounds = 50)

    oof[val_idx] = gbm.predict(X_train_chuli.iloc[val_idx],
                               num_iteration=gbm.best_iteration)

    # Average over the actual number of folds rather than a hard-coded 5.
    test_Y = test_Y + gbm.predict(test_X, num_iteration=gbm.best_iteration)/folds.n_splits
    
#10000 3.924

#### 结果输出

In [None]:
# Attach the out-of-fold and the averaged test predictions.
train_set['pred'] = oof
test_set['pred'] = test_Y

# Undo the categorical cast on the test keys so they can be merged with the
# freshly read submission file: student_id -> int64, everything else -> str.
_plain_dtype = {'student_id': 'int64'}
for col in cat_feature_list:
    test_set[col] = test_set[col].astype(_plain_dtype.get(col, 'str'))

stack_1 = train_set[['student_id','course','exam_id','score','pred']]
#stack_1.to_csv('/home/kesci/work/Output_stacking_kfolds/kfolds_stacking_allin_nmf.csv')

# Replace the placeholder 'pred' column of the submission template.
test_data = pd.read_csv('/home/kesci/input/smart_edu7557/submission_s2.csv')
test_data = test_data.drop(columns = 'pred')
_test_predictions = test_set[['student_id','course','exam_id','pred']]
test_1 = test_data.merge(_test_predictions, on=['student_id','course','exam_id'])
#test_1.to_csv('/home/kesci/work/Output_stacking_kfolds/kfolds_allin_nmf.csv',index=None)

# 第二部分模型

#### 建模思路： 将成绩预测问题视为一种短期时间序列预测的问题。

##### 即认为学生的成绩会发生上升/下滑，通过构建各科目成绩的趋势性、稳定性特征，如：rank_diff_before、rank_diff_long、rank_std、rank_mean 等（其中，rank_diff_before 是指学生前一次考试排名变化的情况，rank_diff_long 是指学生前N次考试排名变化的情况）。

#### 数据集划分：使用全量的考试成绩，通过一个大小为8的滑动窗口对全量考试进行特征构建。

#### 时序化思路：根据course_exam文件中出现的顺序决定其时序关系. 如： 第n个出现的exam，对应该course下的第n场exam

## 2.1 **特征工程**

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
pd.set_option('display.max_columns',100)
pd.set_option('display.max_rows',100)

#### 数据读取

In [None]:
# Root directory of the competition data set.
path = '/home/kesci/input/smart_edu7557/'

# Course metadata and the knowledge-point catalogue.
course = pd.read_csv(path + 'course.csv')
all_knowledge = pd.read_csv(path + 'all_knowledge.csv')

# Exam tables of course1 ... course8, read in that order.
(course1_exam, course2_exam, course3_exam, course4_exam,
 course5_exam, course6_exam, course7_exam, course8_exam) = (
    pd.read_csv(path + 'course' + str(k) + '_exams.csv') for k in range(1, 9)
)

# Score records, student attributes and the submission template.
exam_score = pd.read_csv(path + 'exam_score.csv')
student_df = pd.read_csv(path + 'student.csv')
test_data = pd.read_csv(path + 'submission_s2.csv')

#### 提取每次考试的总体复杂度 （complexity） 以及各复杂度占比（complexity_i_ratio）特征

In [None]:
# Shift every complexity level up by one (updates all_knowledge itself).
all_knowledge['complexity'] = 1 + all_knowledge['complexity']


def func_1(data,index):
    """Return the complexity-weighted sum of one exam row.

    data:  one row of a courseN exam frame.  Element 0 is skipped
           (presumably exam_id -- confirm); the remaining values are
           multiplied element-wise with the course's complexity levels.
    index: course number as a string, e.g. '1' selects 'course1'.
    """
    belongs_to_course = all_knowledge['course'] == 'course' + index
    weights = all_knowledge.loc[belongs_to_course, 'complexity'].values
    return np.sum(data.values[1:] * weights)


# Overall complexity of every exam, course by course.
for _course_no, _exam_frame in (('1', course1_exam), ('2', course2_exam),
                                ('3', course3_exam), ('4', course4_exam),
                                ('5', course5_exam), ('6', course6_exam),
                                ('7', course7_exam), ('8', course8_exam)):
    _exam_frame['complexity'] = _exam_frame.apply(func_1, axis=1, args=(_course_no,))

def func_2(data,index,complexity_):
    """Return the share (sum / 100) of one exam row that falls on one complexity level.

    data:        one row of a courseN exam frame; element 0 is skipped and the
                 knowledge-point values are expected to follow it, in the same
                 order as the course's rows in all_knowledge
                 (assumption -- confirm against the CSV layout).
    index:       course number as a string, e.g. '1' selects 'course1'.
    complexity_: complexity level (after the +1 shift) to measure.

    The knowledge-point columns are located by count (one per row of the
    course's knowledge table) instead of by trimming a caller-dependent number
    of trailing columns, so the result does not depend on any module-level
    loop variable or on how many feature columns were already appended.
    """
    course_knowledge = all_knowledge.loc[all_knowledge['course']=='course'+index]
    # 1 where the knowledge point has the requested complexity, else 0.
    has_level = (course_knowledge['complexity']==complexity_).values.astype(int)
    knowledge_values = data.values[1:1 + len(has_level)]
    return np.sum(knowledge_values * has_level)/100

# Add complexity1_ratio ... complexity5_ratio to every course's exam frame.
# The outer loop variable is deliberately the module-level name `i`; keep that
# name in case helper code consults it while the loop is running.
_ratio_targets = (('1', course1_exam), ('2', course2_exam),
                  ('3', course3_exam), ('4', course4_exam),
                  ('5', course5_exam), ('6', course6_exam),
                  ('7', course7_exam), ('8', course8_exam))
for i in range(1,6):
    _ratio_col = 'complexity'+str(i)+'_ratio'
    for _course_no, _exam_frame in _ratio_targets:
        _exam_frame[_ratio_col] = _exam_frame.apply(func_2, axis=1, args=(_course_no, i))

# Tag every exam frame with the name of its course ('course1' ... 'course8').
for _course_no, _exam_frame in enumerate((course1_exam, course2_exam, course3_exam,
                                          course4_exam, course5_exam, course6_exam,
                                          course7_exam, course8_exam), start=1):
    _exam_frame['course'] = 'course' + str(_course_no)

#### 对试卷进行时序化处理

In [None]:
def _exam_order(frame):
    """Map each exam_id to its 1-based position of first appearance in *frame*."""
    exam_ids = frame['exam_id'].unique()
    return dict(zip(exam_ids, range(1, len(exam_ids) + 1)))


# exam_id -> sequence number, one dictionary per course.
dict_course1 = _exam_order(course1_exam)
dict_course2 = _exam_order(course2_exam)
dict_course3 = _exam_order(course3_exam)
dict_course4 = _exam_order(course4_exam)
dict_course5 = _exam_order(course5_exam)
dict_course6 = _exam_order(course6_exam)
dict_course7 = _exam_order(course7_exam)
dict_course8 = _exam_order(course8_exam)

# Store the sequence number next to every exam.
for _exam_frame, _order in ((course1_exam, dict_course1), (course2_exam, dict_course2),
                            (course3_exam, dict_course3), (course4_exam, dict_course4),
                            (course5_exam, dict_course5), (course6_exam, dict_course6),
                            (course7_exam, dict_course7), (course8_exam, dict_course8)):
    _exam_frame['number_exam'] = _exam_frame['exam_id'].apply(_order.__getitem__)

In [None]:
# Per-exam feature columns shared by all eight course frames.
exam_feature = ['exam_id','course','number_exam','complexity','complexity1_ratio',
       'complexity2_ratio', 'complexity3_ratio','complexity4_ratio','complexity5_ratio']
course_all_info = pd.concat(
    [frame[exam_feature] for frame in (course1_exam, course2_exam, course3_exam, course4_exam,
                                       course5_exam, course6_exam, course7_exam, course8_exam)],
    ignore_index=True)
course_all_info = course_all_info.merge(course, on = 'course')
# Largest sequence number per course.  Written as max() + rename because the
# dict-renaming form of SeriesGroupBy.agg raises on pandas >= 1.0.
course_max_df = (course_all_info.groupby('course')['number_exam'].max()
                 .rename('max_number_exam').reset_index())
course_all_info = course_all_info.merge(course_max_df,on = 'course')
# Reverse numbering: the last exam listed for a course gets 1, the first gets max.
course_all_info['invert_number_exam'] = course_all_info['max_number_exam'] + 1 - course_all_info['number_exam']

#### 处理数据中考了0分的同学
##### 填充规则：该次考试的平均分 + 同学在这门课中的考的平均分 - 这门课总计的平均分

In [None]:
def propress_0_score(exam_df):
    """Return a copy of exam_df in which every 0 score is replaced by an estimate.

    The estimate for a zero-score row is

        mean score of that exam
        + mean score of that student in that course
        - mean score of that course

    where all three means ignore zero and missing scores.  If one of the means
    cannot be computed (no non-zero score available) the estimate is NaN.

    exam_df must contain the columns 'exam_id', 'student_id', 'course' and
    'score'; it is not modified.  When zeros are present the returned 'score'
    column is float64.
    """
    data = exam_df.copy()
    zero_rows = data['score'] == 0
    if not zero_rows.any():
        return data
    # Zero scores become NaN so that the group means skip them.
    observed = data['score'].where(data['score'] != 0)
    exam_mean = observed.groupby(data['exam_id']).transform('mean')
    student_course_mean = observed.groupby([data['student_id'], data['course']]).transform('mean')
    course_mean = observed.groupby(data['course']).transform('mean')
    estimate = exam_mean + student_course_mean - course_mean
    # Cast first: the estimates are floats and the column may hold integers.
    data['score'] = data['score'].astype('float64')
    data.loc[zero_rows, 'score'] = estimate[zero_rows]
    return data
    
# Replace every 0 score in the loaded score table by its estimated value.
exam_score = propress_0_score(exam_score)


#### 根据得分计算每一次考试的排名情况

In [None]:
# Stack the submission rows under the known scores.  Rows without a score are
# marked with the sentinel -2019 and flagged as istrain == False.
exam_score = pd.concat([exam_score,test_data],ignore_index=True,axis=0)
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which is not guaranteed to modify the frame.
exam_score['score'] = exam_score['score'].fillna(-2019)
exam_score['istrain'] = exam_score['score']!=-2019
# 'course' is re-attached by the merge below; 'pred' only exists in the template.
exam_score = exam_score.drop(columns=['course','pred'],errors='ignore')
exam_score = exam_score.merge(course_all_info,on='exam_id')
# Dense rank within each exam, highest score = rank 1, equal scores share a rank.
# Sentinel rows therefore receive the last rank of their exam.
exam_score['rank'] = (exam_score.groupby('exam_id')['score']
                      .rank(method='dense', ascending=False)
                      .astype(int))

#### 计算排名变化情况（与前一次相比）

In [None]:
# Rank change against the previous exam of the same student in the same course.
# Exams are ordered by descending invert_number_exam (i.e. ascending
# number_exam); the first exam of every (course, student) pair has no
# predecessor and gets NaN.  One grouped diff replaces the nested
# course x student loops, which also avoids re-binding the name `course`
# (the course metadata DataFrame) to a string.
_by_exam_order = exam_score.sort_values(by = ['invert_number_exam'], ascending = False,
                                        kind = 'mergesort')
exam_score['rank_diff'] = (_by_exam_order
                           .groupby(['course','student_id'], sort=False)['rank']
                           .diff())
# Rows to be predicted carry no usable rank information.
_is_test = exam_score['istrain'] == False
exam_score.loc[_is_test,'rank'] = -1
exam_score.loc[_is_test,'rank_diff'] = 0


#### 构建特征
##### 通过时序预测的方法提取趋势性、稳定性相关特征

In [1]:
def l2_norm(a,b):
    """Return sqrt(a**2 + b**2), the Euclidean length of the vector (a, b)."""
    sum_of_squares = a ** 2 + b ** 2
    return math.sqrt(sum_of_squares)
    
def _aggregate_by_student(frame, column, spec):
    """Aggregate frame[column] per student_id.

    spec maps output column name -> aggregation (function name or callable).
    Returns a frame with 'student_id' followed by one column per spec entry.
    Each aggregation is evaluated separately, which works on every pandas
    version (the dict-renaming form of SeriesGroupBy.agg raises on pandas >= 1.0).
    """
    grouped = frame.groupby('student_id')[column]
    out = pd.DataFrame({name: grouped.agg(func) for name, func in spec.items()})
    out.index.name = 'student_id'
    return out.reset_index()


def create_feature(course_info,exam_cnt = 6):
    """Build sliding-window score/rank features for every (course, exam, student) row.

    course_info: frame with at least the columns course, invert_number_exam,
                 student_id, exam_id, score, rank, rank_diff, complexity, istrain.
    exam_cnt:    window size.  For a target exam with invert_number_exam == n the
                 window holds the rows of the same course with
                 n < invert_number_exam <= n + exam_cnt.  Targets for which
                 n + exam_cnt exceeds the course's largest invert_number_exam
                 are skipped.

    Returns the target rows enriched with
      * per-student score and rank statistics over the window,
      * rank_long_diff (sum of rank_diff over the window),
      * score_before / rank_before / rank_diff_before / exam_id_before /
        exam_mean_score taken from the row whose invert_number_exam is one larger,
      * invert_complexity and gender_course_mean.
    All joins are inner joins, so rows without a match are dropped.
    """
    # Rows with score == 0 are removed from the window before aggregating, so
    # 'score_0_cnt' is always 0; it is kept only to preserve the output columns.
    score_spec = {
        'score_mean':'mean','score_std':'std','score_sum':'sum',
        'score_max':'max','score_min':'min','score_cnt':'count',
        'score_0_cnt':lambda x : np.sum(x==0),
        'score_90_cnt':lambda x : np.sum(x>=90)/exam_cnt,
        'score_80_cnt':lambda x : np.sum(x>=80)/exam_cnt,
        'score_70_cnt':lambda x : np.sum(x>=70)/exam_cnt,
        'score_60_cnt':lambda x : np.sum(x>=60)/exam_cnt,
    }
    # NOTE(review): the count is also called 'score_cnt' here, so the merge below
    # produces score_cnt_x / score_cnt_y.  Kept as is to preserve the columns.
    rank_spec = {
        'rank_mean':'mean','rank_std':'std','rank_sum':'sum',
        'rank_max':'max','rank_min':'min','score_cnt':'count',
        'rank_5_cnt':lambda x : np.sum(x<=5)/exam_cnt,
        'rank_10_cnt':lambda x : np.sum(x<=10)/exam_cnt,
        'rank_20_cnt':lambda x : np.sum(x<=20)/exam_cnt,
        'rank_30_cnt':lambda x : np.sum(x<=30)/exam_cnt,
    }

    parts = []
    target_numbers = course_info['invert_number_exam'].unique()
    for course in course_info['course'].unique():
        # The course filter does not depend on the target exam: do it once.
        course_rows = course_info.loc[course_info['course']==course]
        largest_number = course_rows['invert_number_exam'].max()
        for index in target_numbers:
            if index + exam_cnt > largest_number:
                continue
            in_window = ((course_rows['invert_number_exam']<=index+exam_cnt) &
                         (course_rows['invert_number_exam']>index))
            tmp_df = course_rows.loc[in_window].fillna(0)
            tmp_df = tmp_df.loc[tmp_df['score']!=0]

            student_score_info = _aggregate_by_student(tmp_df, 'score', score_spec)
            student_score_info['score_cv'] = student_score_info['score_std']/student_score_info['score_mean']
            student_score_info['score_rms'] = student_score_info.apply(lambda row: l2_norm(row['score_mean'], row['score_std']), axis=1)
            student_score_info['score_wave'] = student_score_info['score_rms']/student_score_info['score_mean']
            student_score_info['score_peak'] = student_score_info['score_max']/student_score_info['score_rms']
            student_score_info['score_pluse'] = student_score_info['score_max']/student_score_info['score_mean']

            student_rank_info = _aggregate_by_student(tmp_df, 'rank', rank_spec)
            student_rank_info['rank_cv'] = student_rank_info['rank_std']/student_rank_info['rank_mean']
            student_rank_info['rank_rms'] = student_rank_info.apply(lambda row: l2_norm(row['rank_mean'], row['rank_std']), axis=1)
            student_rank_info['rank_wave'] = student_rank_info['rank_rms']/student_rank_info['rank_mean']
            student_rank_info['rank_peak'] = student_rank_info['rank_max']/student_rank_info['rank_rms']
            student_rank_info['rank_pluse'] = student_rank_info['rank_max']/student_rank_info['rank_mean']

            student_rank_diff_info = _aggregate_by_student(tmp_df, 'rank_diff', {'rank_long_diff':'sum'})

            tmp_course_info = course_rows.loc[course_rows['invert_number_exam']==index]
            tmp_course_info = tmp_course_info.merge(student_score_info,on = 'student_id')
            tmp_course_info = tmp_course_info.merge(student_rank_info,on = 'student_id')
            tmp_course_info = tmp_course_info.merge(student_rank_diff_info,on = 'student_id')
            parts.append(tmp_course_info)
    # One concat instead of DataFrame.append in the loop (quadratic copying;
    # DataFrame.append no longer exists in pandas 2.0).
    data_ = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

    # Score, rank and rank change of the row whose invert_number_exam is one larger.
    exam_mean_score_before = (data_.groupby('exam_id')['score'].mean()
                              .rename('exam_mean_score').reset_index())
    data_all_before = data_[['course','student_id','invert_number_exam','rank_diff','score','rank','exam_id']].copy()
    data_all_before = data_all_before.merge(exam_mean_score_before,on = 'exam_id')
    # Shifting the number by one makes each row join onto the next-smaller number.
    data_all_before['invert_number_exam'] = data_all_before['invert_number_exam']-1
    data_all_before.rename(columns = {'rank_diff':'rank_diff_before','score':'score_before','rank':'rank_before','exam_id':'exam_id_before'},inplace=True)
    data_ = data_.merge(data_all_before,on=['course','invert_number_exam','student_id'])
    data_['invert_complexity'] = 1/(data_['complexity']+1e-10)
    # Mean training score per (gender, course).
    data_ = data_.merge(student_df,on='student_id')
    gender_course_mean = (data_.loc[data_['istrain']==True]
                          .groupby(['gender','course'])['score'].mean()
                          .rename('gender_course_mean').reset_index())
    data_ = data_.merge(gender_course_mean,on=['gender','course'])

    return data_

In [None]:
# Build the feature table with a window size (exam_cnt) of 8 exams.
data_all = create_feature(exam_score,8)
#data_all.to_csv('/home/kesci/work/feature_data_kfolds/feature_'+str(8)+'feature_8_exams_kfolds_modify0_complexity.csv')

## 2.2 **建模**

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn import metrics
from sklearn.model_selection import KFold,StratifiedKFold
!pip install bayesian-optimization
from bayes_opt import BayesianOptimization
pd.set_option('display.max_columns',250)
pd.set_option('display.max_rows',250)

#### 划分数据集，特征选择

In [None]:
# Optional: reload a cached feature table instead of recomputing it (disabled).
#path = '/home/kesci/work/feature_data_kfolds/'
#data_all = pd.read_csv(path + 'feature_8_exams_kfolds_modify0_complexity.csv',index_col=0)

# Cap scores at 100 before they are used as the regression target.
data_all.loc[data_all['score']>100,'score'] = 100

# Numeric model inputs.
number_feature_list = [
       'complexity', 'complexity1_ratio', 'complexity2_ratio',
       'complexity3_ratio', 'complexity4_ratio', 'complexity5_ratio',
        'score_mean', 'score_std', 'score_max',
       'score_min', 'score_rms',
       'rank_mean', 'rank_std','rank_rms', 'rank_long_diff',
       'rank_diff_before','rank_before','score_before',
       'exam_mean_score','gender_course_mean','invert_number_exam'
                      ]
# Columns cast to pandas 'category' dtype before being handed to LightGBM.
cat_feature_list = ['course', 'student_id']

for cat in cat_feature_list:
    data_all[cat] = data_all[cat].astype('category')

# Split on the 'istrain' flag. Both halves are materialised as independent
# frames (reset_index returns a new object, .copy() is explicit) so that the
# columns added to them later do not write into a slice of data_all and do not
# trigger pandas' SettingWithCopyWarning.
train_set = data_all.loc[data_all['istrain']==True].reset_index(drop=True)
test_set = data_all.loc[data_all['istrain']==False].copy()

# The training index is 0..n-1 after the reset, so positional fold indices and
# labels coincide.
X_train_chuli = train_set[number_feature_list+cat_feature_list]
y_train_chuli = train_set['score']
test_X = test_set[number_feature_list+cat_feature_list]



#### 利用Bayesian优化器得到模型的优化参数

In [None]:
# Result of an earlier Bayesian-optimisation run, pasted in as a literal.
# NOTE(review): 'target' is presumably the optimiser's objective value for
# these settings; only the 'params' entry is read below.
params = {'target': -3.9703564115232326,
 'params': {'lambda_l1': 1.9645126446155863,
  'lambda_l2': 0.009179013529974744,
  'min_data_in_leaf': 87.6607326409302,
  'num_leaves': 75.2384901178975}}

# LightGBM settings. They are the same for every fold, so the dict is built
# once instead of once per iteration. The optimiser returns floats, hence the
# int() casts on the integer-valued parameters.
# NOTE(review): the objective is MAE while the validation metric (and therefore
# early stopping) is RMSE - confirm this mismatch is intended.
# NOTE(review): LightGBM documents that bagging_fraction only takes effect when
# bagging_freq is non-zero; bagging_freq is not set here - confirm.
param = {
    'max_depth': -1,
    'num_leaves':int(params['params']['num_leaves']),
    'min_data_in_leaf': int(params['params']['min_data_in_leaf']),
    'objective':'mae',
    'learning_rate': 0.01,
    "boosting": "gbdt",
    'bagging_fraction':0.8,
    'feature_fraction':0.8,
    "metric": 'rmse',
    "verbosity": 500,
    'lambda_l1':params['params']['lambda_l1'],
    'lambda_l2': params['params']['lambda_l2']
}

folds = KFold(n_splits=5, shuffle=True, random_state=2019)
n_folds = folds.get_n_splits()
# test_Y accumulates the average of the per-fold test predictions;
# oof holds the out-of-fold prediction of every training row.
test_Y = np.zeros(test_X.shape[0])
oof = np.zeros(X_train_chuli.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_chuli, y_train_chuli)):
    print("fold n°{}".format(fold_))
    # KFold yields positional indices, so select rows with .iloc; this does not
    # depend on the frame carrying a 0..n-1 index.
    trn_data = lgb.Dataset(X_train_chuli.iloc[trn_idx],
                           label=y_train_chuli.iloc[trn_idx])
    val_data = lgb.Dataset(X_train_chuli.iloc[val_idx],
                           label=y_train_chuli.iloc[val_idx])

    # Up to 200000 rounds; training stops once the validation metric has not
    # improved for 100 rounds.
    gbm = lgb.train(param,
                    trn_data,
                    200000,
                    valid_sets = [trn_data, val_data],
                    verbose_eval=500,
                    early_stopping_rounds = 100)

    oof[val_idx] = gbm.predict(X_train_chuli.iloc[val_idx],
                       num_iteration=gbm.best_iteration)

    # Average over the actual number of folds rather than a hard-coded 5.
    test_Y = test_Y + gbm.predict(test_X, num_iteration=gbm.best_iteration)/n_folds

#### 结果输出

In [None]:
# Attach the out-of-fold predictions to the training rows and the fold-averaged
# predictions to the test rows. assign() returns new frames, so nothing is
# written into a filtered slice of data_all (avoids SettingWithCopyWarning).
train_set = train_set.assign(pred=oof)
test_set = test_set.assign(pred=test_Y)

# The key columns were cast to 'category' for LightGBM; cast them back to plain
# dtypes so they can be merged with the submission template below.
for col in cat_feature_list:
    if col == 'student_id':
        test_set[col] = test_set[col].astype('int64')
    else :
        test_set[col] = test_set[col].astype('str')

# Out-of-fold predictions: training input for the stacking step.
stack_2 = train_set[['student_id','course','exam_id','score','pred']]
#stack_2.to_csv('/home/kesci/work/Output_stacking_kfolds/kfolds_stacking_8_exams_modify0_complexity.csv')

# Re-read the submission template, discard its placeholder 'pred' column and
# join this model's test predictions onto it.
test_data = pd.read_csv('/home/kesci/input/smart_edu7557/submission_s2.csv')
test_data.drop(columns = 'pred',inplace=True)
test_2 = test_data.merge(test_set[['student_id','course','exam_id','pred']],on=['student_id','course','exam_id'])
#test_2.to_csv('/home/kesci/work/Output_stacking_kfolds/kfolds_test_8_exams_modify0.csv',index=None)

# 第三部分模型



#### 首先对成绩、试卷考点分布、学生考点掌握整体进行数学抽象
##### 定义：
##### score : m \* n (exam_number * student_numbers) 			试卷-学生成绩矩阵
##### exam : m \* s+ (exam_number * course_section)  			试卷-考点分布矩阵
##### stu : 1+s \* n (course_section \* student_numbers)  学生-考点掌握矩阵

###### 其中， m为各科考试次数 n为学生数目 s为各科考点数

### 建模思路：$\hat{stu} = \arg\min_{stu} \sum \left(score - exam \cdot stu\right)^2$，其最小二乘闭式解为 $\hat{stu} = (exam^{T} exam)^{-1} exam^{T} score$



In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from sklearn import metrics

pd.set_option('display.max_columns',500)
pd.set_option('display.max_rows',1000)

#### 数据读取

In [None]:
# Directory holding the raw competition files.
path = '/home/kesci/input/smart_edu7557/'

# Course list and the knowledge-point description table.
course, all_knowledge = (
    pd.read_csv(path + fname) for fname in ('course.csv', 'all_knowledge.csv')
)

# One exam sheet per course; the first CSV column becomes the row index.
(course1_exam, course2_exam, course3_exam, course4_exam,
 course5_exam, course6_exam, course7_exam, course8_exam) = (
    pd.read_csv(path + fname, index_col=0)
    for fname in (
        'course1_exams.csv', 'course2_exams.csv', 'course3_exams.csv',
        'course4_exams.csv', 'course5_exams.csv', 'course6_exams.csv',
        'course7_exams.csv', 'course8_exams.csv',
    )
)

# Historical scores, student attributes and the submission template.
exam_score, student_df, test_data = (
    pd.read_csv(path + fname)
    for fname in ('exam_score.csv', 'student.csv', 'submission_s2.csv')
)

#### 知识点所属的段落关于知识点的映射


In [None]:
def get_category_section(course_exam_,course,col='section'):
    """Aggregate a course's knowledge-point columns into per-category columns.

    Reads the module-level frames ``all_knowledge``, ``exam_score`` and
    ``test_data``.

    Parameters
    ----------
    course_exam_ : DataFrame
        Exam sheet of one course: one row per exam (indexed by exam id), one
        column per knowledge point. It is not modified.
    course : str
        Value of the ``course`` column used to filter the module-level tables.
    col : str, default 'section'
        Column of ``all_knowledge`` that assigns each knowledge point to a
        category.

    Returns
    -------
    DataFrame
        One row per exam of the course - the exams found in ``exam_score``
        followed by those found in ``test_data`` - and one column per category.
        Columns whose sum is 0 and columns whose name starts with 'K' are
        removed.

    Raises
    ------
    KeyError
        If a column of ``course_exam_`` is not listed as a knowledge point of
        ``course`` in ``all_knowledge``.
    """
    course_exam = course_exam_.copy()
    course_knowledge = all_knowledge.loc[all_knowledge['course']==course]
    k_cat_dict = dict(zip(course_knowledge['knowledge_point'].values,course_knowledge[col].values))

    # Group the original knowledge-point columns by category in a single pass
    # (instead of rescanning every column once per category).
    members = {}
    for k in course_exam.columns:
        members.setdefault(k_cat_dict[k], []).append(k)

    # One aggregated column per category. Columns are accumulated one by one in
    # their original order, so NaN handling and float rounding are unchanged.
    for cat in course_knowledge[col].unique():
        total = np.zeros(course_exam.shape[0])
        for k in members.get(cat, []):
            total += course_exam[k].values
        course_exam[cat] = total

    # Keep only the exams that occur for this course: training exams first,
    # then the exams to predict.
    test_exam_list = test_data.loc[test_data['course']==course]['exam_id'].unique()
    exams_list = np.hstack((exam_score.loc[exam_score['course']==course]['exam_id'].unique(),test_exam_list))
    valid_course_exam = course_exam.loc[exams_list]

    # Select the surviving columns instead of dropping in place on a slice.
    # Names starting with 'K' are filtered out; these appear to be the raw
    # knowledge-point columns.
    keep = [name for name in valid_course_exam.columns
            if not (valid_course_exam[name].sum() == 0 or name[0]=='K')]
    return valid_course_exam[keep].copy()

In [None]:
# Section-level exam matrix of each of the eight courses.
(valid_course1_exam, valid_course2_exam, valid_course3_exam, valid_course4_exam,
 valid_course5_exam, valid_course6_exam, valid_course7_exam, valid_course8_exam) = (
    get_category_section(sheet, course_name)
    for sheet, course_name in (
        (course1_exam, 'course1'),
        (course2_exam, 'course2'),
        (course3_exam, 'course3'),
        (course4_exam, 'course4'),
        (course5_exam, 'course5'),
        (course6_exam, 'course6'),
        (course7_exam, 'course7'),
        (course8_exam, 'course8'),
    )
)

#### 处理数据中考了0分的同学
##### 填充规则：该次考试的平均分 + 该同学在这门课中的平均分 - 这门课的总体平均分（各平均分均只统计非 0 成绩）

In [None]:
def propress_0_score(exam_df):
    """Return a copy of *exam_df* in which zero scores are replaced by estimates.

    The estimate for a zero row is::

        mean of that exam + mean of the student in that course - mean of the course

    Every mean is taken over the rows whose score is neither missing nor 0, and
    the result is truncated to an int.

    If the estimate is undefined - for example the student has no non-zero
    score in the course, which previously crashed with ``ValueError: cannot
    convert float NaN to integer`` - the first available of exam mean, student
    mean and course mean is used instead. A row for which none of them exists
    keeps its 0.

    Parameters
    ----------
    exam_df : DataFrame
        Must contain the columns 'exam_id', 'student_id', 'course' and 'score'.
        It is not modified.

    Returns
    -------
    DataFrame
        Copy of ``exam_df`` with the imputed 'score' column.
    """
    data = exam_df.copy()
    score_pos = data.columns.get_loc('score')
    zero_positions = np.flatnonzero((data['score'] == 0).values)

    # Reference rows for all means: missing scores are treated like zeros and
    # excluded together with them.
    exam_data = data.fillna(0)
    exam_data = exam_data.loc[exam_data['score'] != 0]

    exam_ids = data['exam_id'].values
    student_ids = data['student_id'].values
    course_ids = data['course'].values

    # Exam and course means are shared by many zero rows; compute each once.
    exam_means = {}
    course_means = {}
    for pos in zero_positions:
        exam_id = exam_ids[pos]
        student_id = student_ids[pos]
        course_id = course_ids[pos]

        if exam_id not in exam_means:
            exam_means[exam_id] = exam_data.loc[exam_data['exam_id'] == exam_id, 'score'].mean()
        if course_id not in course_means:
            course_means[course_id] = exam_data.loc[exam_data['course'] == course_id, 'score'].mean()
        mean_score_of_exam = exam_means[exam_id]
        score_of_course = course_means[course_id]

        in_course = np.array(exam_data['student_id'] == student_id) & np.array(exam_data['course'] == course_id)
        score_of_student = exam_data.loc[in_course, 'score'].mean()

        estimate = mean_score_of_exam + score_of_student - score_of_course
        if np.isnan(estimate):
            # At least one mean is undefined (empty group): fall back to the
            # most specific statistic that exists.
            available = [v for v in (mean_score_of_exam, score_of_student, score_of_course)
                         if not np.isnan(v)]
            if not available:
                continue
            estimate = available[0]
        # Positional write: unaffected by duplicate index labels.
        data.iloc[pos, score_pos] = int(estimate)
    return data
    
# Impute the zero scores, then cap every score at the 100-point maximum.
exam_score = propress_0_score(exam_score)
exam_score['score'] = exam_score['score'].clip(upper=100)

In [None]:
# Mean score of every exam and of every course.
# The means are named with rename() + reset_index() rather than
# .agg({'new_name': 'mean'}): dict-based renaming on a SeriesGroupBy was removed
# in pandas 1.0 (SpecificationError), whereas this form works on old and new
# pandas and yields the same two-column frames.
exam_score_mean = exam_score.groupby('exam_id')['score'].mean().rename('exam_mean_score').reset_index()
course_score_mean = exam_score.groupby('course')['score'].mean().rename('course_mean_score').reset_index()

# Mean score of the last exam of each course that appears in exam_score.
# NOTE(review): the original comment calls this the "second-to-last exam",
# presumably because the final exam of each course is the one to be predicted.
last_exam_list = exam_score.drop_duplicates(subset='course',keep='last')['exam_id'].values
last_exam = (exam_score.loc[exam_score['exam_id'].isin(last_exam_list)]
             .groupby('course')['score'].mean()
             .rename('last_exam_mean_score').reset_index())

exam_score = exam_score.merge(exam_score_mean,on='exam_id')
exam_score = exam_score.merge(course_score_mean,on='course')
exam_score = exam_score.merge(last_exam,on='course')

# Target used by the least-squares fit. The raw score is used as is; the
# normalisation by exam mean / course mean is left disabled.
exam_score['score_eval'] = exam_score['score']#/exam_score['exam_mean_score'] * exam_score['course_mean_score']

In [None]:
# Section-level exam matrices of the eight courses, in course order.
# The matrices generally have different shapes, so they are stored in a 1-D
# object array that is filled element by element. Calling np.array() directly
# on a ragged list is an error on NumPy >= 1.24 and also fails when only the
# first dimensions happen to coincide. Indexing with total_courses_exams[i]
# behaves as before.
_course_matrices = [
    valid_course1_exam.values, valid_course2_exam.values, valid_course3_exam.values,
    valid_course4_exam.values, valid_course5_exam.values, valid_course6_exam.values,
    valid_course7_exam.values, valid_course8_exam.values,
]
total_courses_exams = np.empty(len(_course_matrices), dtype=object)
for _k, _matrix in enumerate(_course_matrices):
    total_courses_exams[_k] = _matrix

#### 求取最优解

In [None]:
# For every course, fit the student/section mastery matrix `stu` that minimises
# ||score - exam . stu||^2 and save it to disk.
#
# NOTE(review): the i-th course returned by exam_score['course'].unique() is
# paired with total_courses_exams[i] and written to file number i+1; this
# assumes the courses appear in the order course1..course8 - confirm.

# Number of columns of every score matrix (previously hard-coded as 500).
n_students = exam_score['student_id'].nunique()
for i, c in enumerate(exam_score['course'].unique()):
    course_exams = exam_score.loc[exam_score['course']==c]
    student_ids = course_exams['student_id'].unique()
    exam_ids = course_exams['exam_id'].unique()
    # Column / row position of every student / exam, in order of first
    # appearance within this course.
    stu_dict = dict(zip(student_ids,range(len(student_ids))))
    exam_dict = dict(zip(exam_ids,range(len(exam_ids))))
    exam_len = len(exam_ids)
    col = course_exams['student_id'].map(stu_dict).values
    row = course_exams['exam_id'].map(exam_dict).values
    val = course_exams['score_eval'].values
    # Dense exam x student score matrix; (exam, student) pairs without a score
    # stay 0.
    score_matrix = sparse.csr_matrix((val,(row,col)),shape = (exam_len, n_students) ).toarray()
    # All rows but the last: the last row of the course matrix is the exam to
    # be predicted.
    exam_matrix = total_courses_exams[i][:-1, :]
    # Least-squares solution through the pseudo-inverse. For a full-column-rank
    # exam matrix this equals (E^T E)^-1 E^T S, but it neither fails nor blows
    # up when E^T E is singular or ill-conditioned (e.g. more sections than
    # exams).
    stu_matrix = np.dot(np.linalg.pinv(exam_matrix), score_matrix)
    np.save('/home/kesci/work/student_knowledge_matrix/course_'+str(i+1)+'_stu_matrix_final',stu_matrix)
   
    

In [None]:
# Predict the score of the final exam of every course from the saved
# student/section matrices.
student_id_list = exam_score['student_id'].unique()
course_frames = []
for i in range(1,9):
    course_name = 'course'+str(i)
    # Last row of the course matrix = section distribution of the exam to be
    # predicted, as a column vector.
    exam_matrix = total_courses_exams[i-1][-1].reshape(-1, 1)
    course_stu_matrix = np.load('/home/kesci/work/student_knowledge_matrix/course_'+str(i)+'_stu_matrix_final.npy')
    score_list = np.dot(course_stu_matrix.T, exam_matrix).ravel()
    # The columns of the saved matrix follow the order in which students first
    # appear within this course, so the ids are taken per course; the global
    # order of first appearance may differ and would misassign predictions.
    # NOTE(review): assumes file number i was fitted on 'course<i>' - confirm.
    course_students = exam_score.loc[exam_score['course']==course_name,'student_id'].unique()
    course_df = pd.DataFrame({'student_id':course_students,
                              'pred':score_list[:len(course_students)]})
    course_df['course'] = course_name
    course_frames.append(course_df)
# DataFrame.append was removed in pandas 2.0; concat builds the same frame.
res_df = pd.concat(course_frames)


In [None]:
# Replace the placeholder 'pred' of the submission template with the
# knowledge-mastery predictions, matched per student and course.
test_data = test_data.drop(columns='pred')
test_3 = pd.merge(test_data, res_df, on=['student_id', 'course'])
#test_3.to_csv('/home/kesci/work/student_knowledge_matrix/res_knowledge_point_nomodify.csv')

# 第四部分算法模型融合



#### 利用 Stacking的方法对第一部分模型、第二部分模型进行线性回归融合得到一个初步Stacking结果，然后将这个结果与第三部分得到的考点掌握得分按比例加权得到最后的结果


In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LinearRegression

#### 数据读取
### stack_1、stack_2、test_1、test_2、test_3 的结果可以由上面的 cell 计算得到。
### 上述计算运行较慢；如需快速验证，请自行取消下面一个 cell 中的注释，直接读取已保存的结果。

In [2]:
# path = '/home/kesci/work'
# stack_1 = pd.read_csv(path + '/Output_stacking_kfolds/kfolds_stacking_8_exams_modify0_complexity.csv') 
# stack_2 = pd.read_csv(path +'/Output_stacking_kfolds/kfolds_stacking_allin_nmf.csv') 
# test_1 = pd.read_csv(path + '/Output_stacking_kfolds/kfolds_test_8_exams_modify0.csv') 
# test_2 = pd.read_csv(path + '/Output_stacking_kfolds/kfolds_allin_nmf.csv')


#### 划分数据集，特征选择

In [None]:
# Give the prediction column of each base model its own name, in place:
# model 1 -> 'pred_1', model 2 -> 'pred_2' (out-of-fold and test frames alike).
for _frame, _pred_name in ((stack_1, 'pred_1'), (stack_2, 'pred_2'),
                           (test_1, 'pred_1'), (test_2, 'pred_2')):
    _frame.rename(columns={'pred': _pred_name}, inplace=True)

# Training table of the meta-model: out-of-fold predictions of both models.
# NOTE(review): the join keys include the label 'score' and omit 'course';
# confirm both inputs carry identical scores for every row.
data_all = pd.merge(stack_1, stack_2, on=['student_id','exam_id','score'])
# Test table of the meta-model: test predictions of both models.
test_data = pd.merge(test_1, test_2, on=['student_id','course','exam_id'])

# The meta-model sees only the two base predictions.
feature_List = [ 'pred_1', 'pred_2']
X_train_chuli, y_train_chuli = data_all[feature_List], data_all['score']
test_X = test_data[feature_List]

#### 线性回归求取stacking 结果

In [None]:
# Meta-model: ordinary least squares on the two base predictions.
# fit() returns the estimator itself, so it can be chained.
lr = LinearRegression().fit(X_train_chuli, y_train_chuli)
test_Y = lr.predict(test_X)

#### 与第三部分结果加权融合

In [None]:
# Stacked prediction of model 1 and model 2.
test_data['pred'] = test_Y
#test_data.to_csv(path+'/res_output/submission_8exam_tsvd_nmf.csv')
#test_3 = pd.read_csv(path+'/student_knowledge_matrix/res_knowledge_point_nomodify.csv')

# Blend 50/50 with the knowledge-mastery prediction (test_3).
# The two frames are matched on their keys instead of by row index: index
# alignment silently blends the wrong rows whenever the two frames are not in
# exactly the same row order.
blend_keys = ['student_id','course','exam_id']
knowledge_pred = test_3[blend_keys + ['pred']].rename(columns={'pred':'pred_3'})
test_data = test_data.merge(knowledge_pred, on=blend_keys, how='left')
test_data['pred'] = 0.5*test_data['pred'] + 0.5*test_data['pred_3']
test_data.drop(columns='pred_3', inplace=True)
#test_data[['student_id','course','exam_id','pred']].to_csv('/home/kesci/work/res_output/submission_8exams_nmf_knowledge.csv',index=None)