In [2]:
import pandas as pd
import numpy as np
import pickle as pkl
import math
from sklearn.preprocessing import StandardScaler

# Load the pre-computed feature tables; the first CSV column becomes the index
# (the preview in cell In [3] shows it is named enroll_id).
train_feat= pd.read_csv('Datasets/train_features.csv', index_col=0)
test_feat= pd.read_csv('Datasets/test_features.csv', index_col=0)
# Stack train and test so each derived feature below is computed once for both.
all_feat = pd.concat([train_feat, test_feat])

# Load the user profile table. Every column is read as text; the numeric
# fields that are needed are converted explicitly below.
user_profile = pd.read_csv('Datasets/user_info.csv',
                           delimiter=',',
                           index_col='user_id',
                           na_values='',
                           dtype=str)

# The per-user lookups further down are keyed by int(username). Depending on
# the pandas version, dtype=str may also be applied to the index column, which
# would turn the user_id keys into strings and make every lookup miss silently.
# Normalising the index to numbers gives the same keys in both cases.
user_profile.index = pd.to_numeric(user_profile.index, errors='coerce')

# Unparseable birth years become NaN; age_convert maps NaN to 0.
user_profile['birth'] = pd.to_numeric(user_profile['birth'], errors='coerce')

# extract user age
birth_year = user_profile['birth'].to_dict()
def age_convert(y, current_year=2018):
    """Convert a birth year into an age in years, or 0 when it is unusable.

    Parameters
    ----------
    y : int, float or None
        Birth year. ``None``, NaN and infinities mean "unknown".
    current_year : int, optional
        Year the age is measured against. Defaults to 2018, the value that
        was previously hard-coded.

    Returns
    -------
    int
        ``current_year - int(y)``, or 0 when the birth year is unknown or the
        resulting age is below 10 or above 70.
    """
    # `is None` is the idiomatic identity test; isfinite() rejects NaN and also
    # +/-inf, which would otherwise make int(y) raise OverflowError.
    if y is None or not math.isfinite(y):
        return 0
    a = current_year - int(y)
    # Ages outside the 10..70 window are treated as invalid and encoded as 0.
    return a if 10 <= a <= 70 else 0

# Age for every row; a user missing from the profile table yields None,
# which age_convert encodes as 0.
all_feat['age'] = [
    age_convert(birth_year.get(int(user), None))
    for user in all_feat['username']
]

# extract user gender
user_gender = user_profile['gender'].to_dict()
def gender_convert(g):
    """Encode a gender label as an integer code.

    Returns 1 for 'male', 2 for 'female', and 0 for any other value
    (including missing entries such as None or NaN).
    """
    # Codes are assigned by position: 'male' -> 1, 'female' -> 2.
    for code, label in enumerate(('male', 'female'), start=1):
        if g == label:
            return code
    return 0


# Gender code for every row; users missing from the profile table get 0.
all_feat['gender'] = [
    gender_convert(user_gender.get(int(user), None))
    for user in all_feat['username']
]

# extract user education
user_edu = user_profile['education'].to_dict()
def edu_convert(x):
    """Encode an education label as an integer code.

    Parameters
    ----------
    x : str or other
        Education label. Non-string values (None, NaN) mean "unknown".

    Returns
    -------
    int
        1-based position of ``x`` in the fixed label list below, or 0 when the
        value is missing or is a label that is not in the list.
    """
    edus = ("Bachelor's", "High", "Master's", "Primary", "Middle", "Associate", "Doctorate")
    # An unseen label used to raise ValueError from .index(); treat it like a
    # missing value instead so one unexpected entry cannot abort the whole run.
    if not isinstance(x, str) or x not in edus:
        return 0
    return edus.index(x) + 1

# Education code for every row, looked up through the user's profile entry.
all_feat['education'] = [
    edu_convert(user_edu.get(int(user), None))
    for user in all_feat['username']
]



# Number of enrollment rows per user (non-null course_id values per username).
user_enroll_num = (
    all_feat.groupby('username')[['course_id']]
    .count()
    .rename(columns={'course_id': 'user_enroll_num'})
)
# Number of enrollment rows per course (non-null username values per course_id).
course_enroll_num = (
    all_feat.groupby('course_id')[['username']]
    .count()
    .rename(columns={'username': 'course_enroll_num'})
)

# Attach both counts to every row by joining on the grouping key.
all_feat = all_feat.merge(user_enroll_num, left_on='username', right_index=True)
all_feat = all_feat.merge(course_enroll_num, left_on='course_id', right_index=True)

#extract user cluster
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
# The context manager closes the file handle, which the bare open() never did.
with open('cluster/user_dict', 'rb') as user_dict_file:
    user_cluster_id = pkl.load(user_dict_file, encoding='latin-1')
cluster_label = np.load('cluster/label_5_10time.npy')
# user_cluster_id gives, for each username, the position of that user's entry
# in cluster_label.
all_feat['cluster_label'] = [cluster_label[user_cluster_id[u]] for u in all_feat['username']]


#extract course category
# Course metadata table, indexed by its `id` column.
courseinfo = pd.read_csv('Datasets/course_info.csv', index_col='id')
# Known course categories; a category's code is its 1-based position here.
en_categorys = ['math','physics','electrical', 'computer','foreign language', 'business', 'economics',
                'biology','medicine','literature','philosophy','history','social science', 'art','engineering',
                'education','environment','chemistry']

def category_convert(cc):
    """Encode a course category name as an integer code.

    Parameters
    ----------
    cc : str or other
        Category name. Non-string values (None, NaN) mean "no category".

    Returns
    -------
    int
        1-based position of ``cc`` in ``en_categorys``, or 0 when the value is
        missing or is a string that is not a known category.
    """
    # A string that is not in the list used to fall off the end of the function
    # and return None; it is now encoded as 0 like every other unknown value.
    if isinstance(cc, str) and cc in en_categorys:
        return en_categorys.index(cc) + 1
    return 0
# Lookup keyed by the table's index, kept so any other cell using it still works.
category_dict = courseinfo['category'].to_dict()

# Rows in all_feat identify their course through the `course_id` string.
# Looking the category up by username (as before) never matched a key, so
# every row was encoded as 0. Key the lookup on course_id instead.
category_by_course = dict(zip(courseinfo['course_id'], courseinfo['category']))

all_feat['course_category'] = [category_convert(category_by_course.get(c, None)) for c in all_feat['course_id']]

# Activity features: every original training column whose name mentions
# 'count', 'time' or 'num'.
act_feats = [c for c in train_feat.columns if any(tag in c for tag in ('count', 'time', 'num'))]

# Use a context manager so the pickle is flushed and the handle closed
# deterministically instead of whenever the file object is garbage-collected.
with open('act_feats.pkl', 'wb') as act_feats_file:
    pkl.dump(act_feats, act_feats_file)

# Numeric columns to standardise: the activity features plus the three
# engineered numeric columns.
num_feats = act_feats + ['age','course_enroll_num','user_enroll_num']
scaler= StandardScaler()
# NOTE(review): the scaler is fitted on all_feat, i.e. on train and test rows
# together, so test-set statistics influence the scaling -- confirm this is intended.
newX = scaler.fit_transform(all_feat[num_feats])
print(newX.shape)
# Overwrite each numeric column with its scaled values (column i of newX).
for i, n_f in enumerate(num_feats):
    all_feat[n_f] = newX[:,i]   

# Split the combined frame back into train and test by index label and save.
all_feat.loc[train_feat.index].to_csv('SVMData/train_feat.csv')
all_feat.loc[test_feat.index].to_csv('SVMData/test_feat.csv')



(225642, 26)


In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import math
from sklearn.preprocessing import StandardScaler

In [3]:

# Reload the raw feature tables and preview the combined frame.
# NOTE(review): this rebinds all_feat to a fresh concat, so any columns that
# another cell added to the previous all_feat are no longer present afterwards.
train_feat= pd.read_csv('Datasets/train_features.csv', index_col=0)
test_feat= pd.read_csv('Datasets/test_features.csv', index_col=0)
all_feat = pd.concat([train_feat, test_feat])
all_feat.head()

Unnamed: 0_level_0,all#count,session#count,seek_video#num,play_video#num,pause_video#num,stop_video#num,load_video#num,problem_get#num,problem_check#num,problem_save#num,...,delete_comment#num,click_info#num,click_courseware#num,click_about#num,click_forum#num,click_progress#num,close_courseware#num,truth,username,course_id
enroll_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
772,2,2,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,1,5981,course-v1:TsinghuaX+70800232X+2015_T2
773,21,21,6,4,3,0,1,0,0,0,...,0,2,2,2,0,0,1,1,1544995,course-v1:TsinghuaX+70800232X+2015_T2
774,74,74,9,14,10,1,8,0,0,0,...,0,6,17,0,0,0,9,1,1072798,course-v1:TsinghuaX+70800232X+2015_T2
776,19,19,0,3,3,1,2,0,0,0,...,0,2,4,2,0,0,2,0,561867,course-v1:TsinghuaX+70800232X+2015_T2
777,4,4,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,1,1,1368125,course-v1:TsinghuaX+70800232X+2015_T2


In [8]:
# Edited course table (adds start_date / end_date columns), indexed by `id`.
courseinfo2 = pd.read_csv('SVMData/course_infoEDIT.csv', index_col='id')

# Attach each enrollment's course start timestamp. Rows in all_feat reference
# their course through the `course_id` string, so the lookup is keyed on that
# column; indexing courseinfo2 by username selects a (non-existent) column and
# raises KeyError. Courses absent from the table get NaN.
start_by_course = dict(zip(courseinfo2['course_id'], courseinfo2['start']))
all_feat['start'] = all_feat['course_id'].map(start_by_course)

courseinfo2


Unnamed: 0_level_0,Unnamed: 0,course_id,start,end,course_type,category,start_date,end_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6561,0,course-v1:CPVS+CPVS-HDLSC001+20160901,2016-11-16 08:00:00,2016-12-31 23:30:00,0,,2016-11-16,2016-12-31
5557,1,course-v1:SCUT+144282+201709,2016-09-01 00:00:00,2017-02-28 00:00:00,0,,2016-09-01,2017-02-28
9433,2,course-v1:ZK+06093+J,2018-01-01 08:00:00,2020-01-01 00:00:00,0,,2018-01-01,2020-01-01
8320,3,course-v1:nuist+001+2016-T1,2017-03-01 18:30:00,2017-07-01 23:30:00,0,,2017-03-01,2017-07-01
231,4,FUDAN/CFD004/2014.9-2015.1,2014-09-10 08:00:00,2015-09-10 00:00:00,0,,2014-09-10,2015-09-10
...,...,...,...,...,...,...,...,...
10493,6405,course-v1:NBUX+lzu_MH001x+2017_T1,2017-04-10 00:00:00,2017-05-21 23:59:00,0,,2017-04-10,2017-05-21
11058,6406,course-v1:Train+Train12+2017_T1,2017-05-01 08:00:00,2017-05-31 00:00:00,0,,2017-05-01,2017-05-31
4184,6407,course-v1:nttec+10610204+2015_T2,2015-12-07 00:00:00,2016-12-07 00:00:00,0,,2015-12-07,2016-12-07
8333,6408,course-v1:TsinghuaX+60610231+2016_T2_SP,2016-08-25 08:00:00,,1,philosophy,2016-08-25,


In [6]:
# Read the course table without index_col, so `id` stays an ordinary column,
# and preview the first rows.
# NOTE(review): this rebinds `courseinfo`, which the feature-building cell
# loads with index_col='id'; which layout is live depends on execution order.
courseinfo = pd.read_csv('Datasets/course_info.csv')
courseinfo.head()

Unnamed: 0,id,course_id,start,end,course_type,category
0,6561,course-v1:CPVS+CPVS-HDLSC001+20160901,2016-11-16 08:00:00,2016-12-31 23:30:00,0,
1,5557,course-v1:SCUT+144282+201709,2016-09-01 00:00:00,2017-02-28 00:00:00,0,
2,9433,course-v1:ZK+06093+J,2018-01-01 08:00:00,2020-01-01 00:00:00,0,
3,8320,course-v1:nuist+001+2016-T1,2017-03-01 18:30:00,2017-07-01 23:30:00,0,
4,231,FUDAN/CFD004/2014.9-2015.1,2014-09-10 08:00:00,2015-09-10 00:00:00,0,
