In [1]:
%reset -f

import pandas as pd
import os
import numpy as np
from six.moves import xrange
import math
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import pickle

In [2]:
def ability_level_mapper(data, groups=None, col='front', how='naive', n_level=19, invert=True, parameters=None):
    # the raw data is divided into groups according to its exc_num, ability levels are calculated respectively
    # how: 1 is mapping without any other processing, called 'naive'
    origin = data.copy()
    
    if parameters is not None:
        col, how, n_level, invert, v_max, v_min = parameters
        interval = (v_max- v_min)/n_level
        assert interval!=0, 'zero dividend'
        origin['performance'] = (origin[col]-v_min)/interval
        
        origin['performance'] = origin['performance'].astype(int)
        if invert:
            origin['performance'] = n_level-origin['performance']+1
        else:
            origin['performance'] = origin['performance']+1
        
        return origin, parameters
        
    if groups == None:
        v_max = origin[col].max()
        v_min = origin[col].min()
        
        interval = (v_max- v_min)/n_level
        assert interval!=0, 'zero dividend'
        origin['performance'] = (origin[col]-v_min)/interval
        
        origin['performance'] = origin['performance'].astype(int)
        if invert:
            origin['performance'] = n_level-origin['performance']+1
        else:
            origin['performance'] = origin['performance']+1
        
        parameters = (col, how, n_level, invert, v_max, v_min )
        return origin, parameters
    
    if how == 1 or how=='naive':
        tmp = origin[col]
        for index, group in groups:
            
            v_max = group[col].max()
            v_min = group[col].min()
            
            interval = (v_max-v_min)/n_level
            
            assert interval!=0, 'zero dividend'
            
            origin.loc[index, col] = (origin.loc[index, col]-v_min)/interval
        origin[col] = origin[col].astype(int)
        if invert:
            origin['performance'] = n_level-origin[col]+1
            
        else:
            origin['performance'] = origin['performance']+1
        origin[col] = tmp
        parameters = (col, how, n_level, invert, v_max, v_min )
        return origin, parameters
    
    
def merge_minority(data, by=['difficulty'], by2=['uid', 'day', 'exc_num', 'exc_times'],threshold=10, threshold2=3):
    cnt = 1
    while cnt>0:
        cnt = 0
        for index, group in data.groupby(by):
            d = index
            tmp = group.groupby(by2).mean()
            
            if len(group)<threshold or len(tmp)<threshold2:
                if d<data['difficulty'].max()/2:
                    data.loc[data.difficulty==d, 'difficulty'] +=1
                else:
                    data.loc[data.difficulty==d, 'difficulty'] -=1
                cnt += 1
                continue
            
        print(cnt)
    return data       

def rewrite_env_parameter(if_merge_min, sampling_rate):
    with open('../data/parameter/def_env.p', 'rb') as f:
        tmp = pickle.load(f)
        tmp = tmp + [if_merge_min] + [sampling_rate]
        
    with open('../data/parameter/def_env.p', 'wb') as f:
        pickle.dump(tmp, f)
        
    return tmp

In [3]:
all_data = pd.read_csv('../data/step1_clear_data.csv')
all_data.head()

Unnamed: 0,day,uid,exc_num,exc_times,force,difficulty,deviation,velocity
0,1.0,1.0,1.1,1.0,28.816152,1,0.005096,0.113868
1,1.0,1.0,1.1,1.0,26.999215,1,0.005096,0.113868
2,1.0,1.0,1.1,1.0,24.539013,1,0.005096,0.113868
3,1.0,1.0,1.1,1.0,23.562499,1,0.005096,0.113868
4,1.0,1.0,1.1,1.0,18.746454,1,0.005096,0.113868


In [4]:
# tmp = all_data.set_index(['day', 'exc_num', 'uid', target_col])

In [5]:
# all_data['vf'] = all_data['velocity']/(all_data['front']+0.1)

In [6]:
# env parameters
sampling_rate = 0.5
merge_min = False

target_col, diff_def, n_class, group, _, if_merge_min, sampleing_rate = rewrite_env_parameter(merge_min, sampling_rate)

### sampling

In [7]:
all_data = all_data.sample(frac=sampling_rate).reset_index(drop=True)
all_data.head()

Unnamed: 0,day,uid,exc_num,exc_times,force,difficulty,deviation,velocity
0,4.0,7.0,4.2,2.0,41.068457,14,0.116505,0.25732
1,4.0,1.0,4.2,1.0,25.5112,14,0.066315,0.184003
2,3.0,7.0,3.2,1.0,62.855231,2,0.061597,1.585861
3,3.0,6.0,3.2,4.0,86.963342,1,0.128024,0.608501
4,2.0,7.0,2.2,3.0,23.934141,4,0.059091,0.198898


### average data

In [8]:
# target_col = 'front'
invert = False

item, parameters = ability_level_mapper(all_data, col=target_col, invert=invert)
if merge_min:
    item = merge_minority(item)
    
item = item[['day', 'exc_num', 'uid', 'exc_times', 'difficulty','performance']]
# item = item[['day', 'exc_num', 'uid', 'exc_times', 'difficulty','performance']]. \
#     groupby(by=['day', 'exc_num', 'uid', 'exc_times', 'difficulty']).mean()

# print(item['performance'].describe())
# uid_avg_score_per_item = item.reset_index()[['day', 'exc_num', 'uid', 'performance']].
#     groupby(by=['day', 'exc_num', 'uid']).mean()
# uid_avg_score_per_item = uid_avg_score_per_item.reset_index()
# uid_avg_score_per_item.describe()
with open('../data/parameter/ability_mapper.p', 'wb') as f:
    pickle.dump(parameters, f)
with open('../data/parameter/ability_mapper2.p', 'wb') as f:
    pickle.dump(parameters, f, protocol=2)

item.head()

Unnamed: 0,day,exc_num,uid,exc_times,difficulty,performance
0,4.0,4.2,7.0,2.0,14,2
1,4.0,4.2,1.0,1.0,14,2
2,3.0,3.2,7.0,1.0,2,3
3,3.0,3.2,6.0,4.0,1,4
4,2.0,2.2,7.0,3.0,4,1


In [9]:
item.describe()

Unnamed: 0,day,exc_num,uid,exc_times,difficulty,performance
count,146726.0,146726.0,146726.0,146726.0,146726.0,146726.0
mean,3.318553,2.904681,5.386142,2.329342,5.458712,2.549051
std,1.167735,0.910343,3.096001,1.410088,5.486662,1.198359
min,1.0,1.1,1.0,1.0,1.0,1.0
25%,2.0,2.1,3.0,1.0,1.0,2.0
50%,3.0,3.1,5.0,2.0,2.0,2.0
75%,4.0,4.1,8.0,3.0,13.0,3.0
max,5.0,4.2,11.0,8.0,18.0,20.0


In [10]:
print(item.head())
item.to_csv('../data/step2_expected_performance.csv', index=False)

   day  exc_num  uid  exc_times  difficulty  performance
0  4.0      4.2  7.0        2.0          14            2
1  4.0      4.2  1.0        1.0          14            2
2  3.0      3.2  7.0        1.0           2            3
3  3.0      3.2  6.0        4.0           1            4
4  2.0      2.2  7.0        3.0           4            1


#### unidemention testing (TODO)

In [11]:
# check pearson coefficient between different items

tmp = uid_avg_score_per_item.reset_index()
# groups = tmp[(tmp['day']==1) & (tmp['exc_num']!=1.1)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==2)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==2) & (tmp['exc_num']>2)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==3)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==3) & (tmp['exc_num']>3)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==4) & (tmp['exc_num']>4)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==5) & ((tmp['exc_num']>4) | (tmp['exc_num']<2))].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==5)].groupby(by=['exc_num'])
# groups = tmp[((tmp['day']==2) | (tmp['day']==3)) & (tmp['exc_num']==2.3)].groupby(by=['day', 'exc_num'])
# groups = tmp[((tmp['day']==2) | (tmp['day']==3)) ].groupby(by=['day', 'exc_num'])
groups = tmp[((tmp['day']==2) | (tmp['day']==1)) ].groupby(by=['day', 'exc_num'])
for index1, group1 in groups:
    set1 = group1['uid'].tolist()

    for index2, group2 in groups:
        if index1==index2:
            continue
        else:
            set2 = group2['uid'].tolist()
            common_uid = list(set(set1).intersection(set2))
            l1 = group1.set_index(['uid']).loc[common_uid, 'performance']
            l2 = group2.set_index(['uid']).loc[common_uid, 'performance']
#             print(group1.set_index(['uid']).loc[common_uid])
#             print(group2.set_index(['uid']).loc[common_uid])
            pearson, p_vlaue = pearsonr(l1, l2)
            print(index1, index2)
            print(pearson)

NameError: name 'uid_avg_score_per_item' is not defined

In [None]:
groups = tmp[(tmp['day']==1) & (tmp['exc_num']>1)].groupby(by=['exc_num'])
sns.set_style('whitegrid')
f, ax= plt.subplots(figsize = (14, 10))
for index, group in groups:
#     print(group)
    
    ax = sns.lineplot(x='uid', y='performance', data=group.reset_index(drop=False))
#     ax.set_title('exc_num is'+str(index))

#### item effectivity

In [None]:
exc_nums = tmp['exc_num'].unique()
for en in exc_nums:
    data = tmp[tmp['exc_num']==en]
    print(data)
#     sns.set_style('whitegrid')
#     f, ax= plt.subplots(figsize = (14, 10))
#     ax = sns.lineplot(x='uid', y=target_col, hue='day', data=data.reset_index())
#     ax.set_title('exc_num is'+str(en))