In [1]:
%reset -f

import pandas as pd
import os
import numpy as np
from six.moves import xrange
import math
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import pickle

In [2]:
def ability_level_mapper(data, groups=None, col='front', how='naive', n_level=19, invert=True, parameters=None):
    # the raw data is divided into groups according to its exc_num, ability levels are calculated respectively
    # how: 1 is mapping without any other processing, called 'naive'
    origin = data.copy()
    
    if parameters is not None:
        col, how, n_level, invert, v_max, v_min = parameters
        interval = (v_max- v_min)/n_level
        assert interval!=0, 'zero dividend'
        origin['performance'] = (origin[col]-v_min)/interval
        
        origin['performance'] = origin['performance'].astype(int)
        if invert:
            origin['performance'] = n_level-origin['performance']+1
        else:
            origin['performance'] = origin['performance']+1
        
        return origin, parameters
        
    if groups == None:
        v_max = origin[col].max()
        v_min = origin[col].min()
        
        interval = (v_max- v_min)/n_level
        assert interval!=0, 'zero dividend'
        origin['performance'] = (origin[col]-v_min)/interval
        
        origin['performance'] = origin['performance'].astype(int)
        if invert:
            origin['performance'] = n_level-origin['performance']+1
        else:
            origin['performance'] = origin['performance']+1
        
        parameters = (col, how, n_level, invert, v_max, v_min )
        return origin, parameters
    
    if how == 1 or how=='naive':
        tmp = origin[col]
        for index, group in groups:
            
            v_max = group[col].max()
            v_min = group[col].min()
            
            interval = (v_max-v_min)/n_level
            
            assert interval!=0, 'zero dividend'
            
            origin.loc[index, col] = (origin.loc[index, col]-v_min)/interval
        origin[col] = origin[col].astype(int)
        if invert:
            origin['performance'] = n_level-origin[col]+1
            
        else:
            origin['performance'] = origin['performance']+1
        origin[col] = tmp
        parameters = (col, how, n_level, invert, v_max, v_min )
        return origin, parameters
    
    
def merge_minority(data, by=['difficulty'], by2=['uid', 'day', 'exc_num', 'exc_times'],threshold=10, threshold2=3):
    cnt = 1
    while cnt>0:
        cnt = 0
        for index, group in data.groupby(by):
            d = index
            tmp = group.groupby(by2).mean()
            
            if len(group)<threshold or len(tmp)<threshold2:
                if d<data['difficulty'].max()/2:
                    data.loc[data.difficulty==d, 'difficulty'] +=1
                else:
                    data.loc[data.difficulty==d, 'difficulty'] -=1
                cnt += 1
                continue
            
        print(cnt)
    return data       

def rewrite_env_parameter(if_merge_min, sampling_rate):
    with open('../data/parameter/def_env.p', 'rb') as f:
        tmp = pickle.load(f)
        tmp = tmp + [if_merge_min] + [sampling_rate]
        
    with open('../data/parameter/def_env.p', 'wb') as f:
        pickle.dump(tmp, f)
        
    del tmp

In [3]:
all_data = pd.read_csv('../data/step1_clear_data.csv')
all_data.head()

Unnamed: 0,day,exc_num,exc_times,uid,avg_velocity,front,difficulty,vd_score
0,1,1.1,1,4,0.358939,0.011647,155,16.581553
1,1,1.1,1,4,0.358445,0.011647,155,16.558709
2,1,1.1,1,4,0.35795,0.011971,155,16.29194
3,1,1.1,1,4,0.357456,0.012333,155,16.006004
4,1,1.1,1,4,0.356961,0.012333,155,15.983862


In [4]:
# tmp = all_data.set_index(['day', 'exc_num', 'uid', target_col])

In [5]:
# all_data['vf'] = all_data['velocity']/(all_data['front']+0.1)

In [6]:
# env parameters
sampling_rate = 0.5
merge_min = False

rewrite_env_parameter(merge_min, sampling_rate)

### sampling

In [7]:
all_data = all_data.sample(frac=sampling_rate).reset_index(drop=True)
all_data.head()

Unnamed: 0,day,exc_num,exc_times,uid,avg_velocity,front,difficulty,vd_score
0,5,3.2,2,2,0.454052,0.093344,121,4.393611
1,2,2.2,3,3,0.21482,0.173702,92,1.169396
2,4,4.3,1,7,0.917374,0.557642,160,1.616113
3,4,4.1,3,1,0.710612,0.477751,67,1.456915
4,5,2.2,1,8,0.174841,0.057829,33,2.577686


### average data

In [8]:
target_col = 'front'
invert = True

item, parameters = ability_level_mapper(all_data, col=target_col, invert=invert)
if merge_min:
    item = merge_minority(item)
item = item[['day', 'exc_num', 'uid', 'exc_times', 'difficulty','performance']]. \
    groupby(by=['day', 'exc_num', 'uid', 'exc_times', 'difficulty']).mean()

# print(item['performance'].describe())
# uid_avg_score_per_item = item.reset_index()[['day', 'exc_num', 'uid', 'performance']].
#     groupby(by=['day', 'exc_num', 'uid']).mean()
# uid_avg_score_per_item = uid_avg_score_per_item.reset_index()
# uid_avg_score_per_item.describe()
with open('../data/parameter/ability_mapper.p', 'wb') as f:
    pickle.dump(parameters, f)
with open('../data/parameter/ability_mapper2.p', 'wb') as f:
    pickle.dump(parameters, f, protocol=2)

item = item.reset_index()
item.head()

Unnamed: 0,day,exc_num,uid,exc_times,difficulty,performance
0,1,1.1,1,2,5,20.0
1,1,1.1,1,2,7,20.0
2,1,1.1,1,2,9,20.0
3,1,1.1,1,2,37,20.0
4,1,1.1,1,2,68,20.0


In [9]:
item.describe()

Unnamed: 0,day,exc_num,uid,exc_times,difficulty,performance
count,17464.0,17464.0,17464.0,17464.0,17464.0,17464.0
mean,3.078218,2.767997,5.478985,2.553195,148.566136,17.012251
std,1.403547,1.111026,3.133392,1.557697,88.818672,2.64225
min,1.0,1.1,1.0,1.0,0.0,3.0
25%,2.0,2.1,3.0,1.0,69.0,15.146658
50%,3.0,2.3,5.0,2.0,155.0,17.9
75%,4.0,4.1,8.0,4.0,227.0,19.112146
max,5.0,4.3,11.0,9.0,329.0,20.0


In [10]:
item.to_csv('../data/step2_expected_performance.csv', index=False)

#### unidemention testing (TODO)

In [11]:
# check pearson coefficient between different items

tmp = uid_avg_score_per_item.reset_index()
# groups = tmp[(tmp['day']==1) & (tmp['exc_num']!=1.1)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==2)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==2) & (tmp['exc_num']>2)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==3)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==3) & (tmp['exc_num']>3)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==4) & (tmp['exc_num']>4)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==5) & ((tmp['exc_num']>4) | (tmp['exc_num']<2))].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==5)].groupby(by=['exc_num'])
# groups = tmp[((tmp['day']==2) | (tmp['day']==3)) & (tmp['exc_num']==2.3)].groupby(by=['day', 'exc_num'])
# groups = tmp[((tmp['day']==2) | (tmp['day']==3)) ].groupby(by=['day', 'exc_num'])
groups = tmp[((tmp['day']==2) | (tmp['day']==1)) ].groupby(by=['day', 'exc_num'])
for index1, group1 in groups:
    set1 = group1['uid'].tolist()

    for index2, group2 in groups:
        if index1==index2:
            continue
        else:
            set2 = group2['uid'].tolist()
            common_uid = list(set(set1).intersection(set2))
            l1 = group1.set_index(['uid']).loc[common_uid, 'performance']
            l2 = group2.set_index(['uid']).loc[common_uid, 'performance']
#             print(group1.set_index(['uid']).loc[common_uid])
#             print(group2.set_index(['uid']).loc[common_uid])
            pearson, p_vlaue = pearsonr(l1, l2)
            print(index1, index2)
            print(pearson)

NameError: name 'uid_avg_score_per_item' is not defined

In [None]:
groups = tmp[(tmp['day']==1) & (tmp['exc_num']>1)].groupby(by=['exc_num'])
sns.set_style('whitegrid')
f, ax= plt.subplots(figsize = (14, 10))
for index, group in groups:
#     print(group)
    
    ax = sns.lineplot(x='uid', y='performance', data=group.reset_index(drop=False))
#     ax.set_title('exc_num is'+str(index))

#### item effectivity

In [None]:
exc_nums = tmp['exc_num'].unique()
for en in exc_nums:
    data = tmp[tmp['exc_num']==en]
    print(data)
#     sns.set_style('whitegrid')
#     f, ax= plt.subplots(figsize = (14, 10))
#     ax = sns.lineplot(x='uid', y=target_col, hue='day', data=data.reset_index())
#     ax.set_title('exc_num is'+str(en))