In [1]:
%reset -f

import pandas as pd
import os
import numpy as np
from six.moves import xrange
import math
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import pickle
from scipy.cluster.vq import kmeans,vq, whiten

In [2]:
def ability_level_mapper(data, groups=None, col='front', how='naive', n_level=19, 
                         invert=True, parameters=None, divide_points=None, 
                         target_col_name='performance', balanced_scale=False, avg_perf=None):
    """
    balanced_scale: allow to have nagative grades
    """
    
    # the raw data is divided into groups according to its exc_num, ability levels are calculated respectively
    # how: 1 is mapping without any other processing, called 'naive'
    origin = data.copy()
    # if target col is imbalanced, here we seperate it in parts with different scale value
    if parameters is not None:
        col, how, n_level, invert, v_max, v_min, divide_points, balanced_scale, avg_perf = parameters
        
    if divide_points is not None:
        assert n_level==len(divide_points), 'false length of imbalanced_data'
        # elements in inbalaced_data are ordered increasely
        origin[target_col_name] = 0
        for i,v in enumerate(divide_points):
            if invert:
                origin.loc[(origin.performance==0) & (origin[col]<=v), target_col_name] = n_level-i+1
            else:
                origin.loc[(origin.performance==0) & (origin[col]<=v), target_col_name] = i+1
        if invert:
            origin.loc[origin.performance==0, target_col_name] = 1
        else:
            origin.loc[origin.performance==0, target_col_name] = n_level+1
            
        if balanced_scale and avg_perf is not None:
            origin[target_col_name] -= avg_perf

        parameters = (col, how, n_level, invert, None, None, divide_points, balanced_scale, avg_perf)
        return origin, parameters
        
    if how == 'cluster':
        origin = origin.reset_index()
        centroids,_ = kmeans(whiten(origin[col]), k_or_guess=n_level+1)
#         origin = origin.sort_values(by=[col])
        clx,_ = vq(whiten(origin[col]),centroids)
        
        origin[target_col_name] = clx+1
        tmp = origin[[target_col_name, col]]
        
        tmp = tmp[[target_col_name, col]].groupby([target_col_name]).mean()
        tmp['tmp'] = 1
        tmp = tmp.sort_values(by=col)
        tmp = tmp['tmp'].cumsum().reset_index()
        origin = origin.merge(tmp, how='left', on=[target_col_name])
        origin[target_col_name] = origin['tmp']
#         origin = origin.sort_values(by=['index'])
        origin = origin.set_index(['index'])
        divide_points = origin.groupby([target_col_name])[col].max().tolist()
        divide_points = divide_points[:-1]
        
        if balanced_scale:
            if avg_perf is None:
                avg_perf = origin[target_col_name].mean()
                avg_perf = int(avg_perf)
            origin[target_col_name] -= avg_perf
        else:
            avg_perf = None
        parameters = (col, how, n_level, invert, None, None, divide_points, balanced_scale, avg_perf)
        
        return origin, parameters
        
        
    if groups == None:
        v_max = origin[col].max()
        v_min = origin[col].min()
        
        interval = (v_max- v_min)/n_level
        assert interval!=0, 'zero dividend'
        origin[target_col_name] = (origin[col]-v_min)/interval

        origin[target_col_name] = origin[target_col_name].astype(int)
        if invert:
            origin[target_col_name] = n_level-origin[target_col_name]+1
        else:
            origin[target_col_name] = origin[target_col_name]+1

        if balanced_scale:
            if avg_perf is None:
                avg_perf = origin[target_col_name].mean()
                avg_perf = int(avg_perf)
            origin[target_col_name] -= avg_perf
        else:
            avg_perf = None
        parameters = (col, how, n_level, invert, None, None, divide_points, balanced_scale, avg_perf)
        return origin, parameters
    
    if how == 1 or how=='naive':
        tmp = origin[col]
        for index, group in groups:
            
            v_max = group[col].max()
            v_min = group[col].min()
            
            interval = (v_max-v_min)/n_level
            
            assert interval!=0, 'zero dividend'
            
            origin.loc[index, col] = (origin.loc[index, col]-v_min)/interval
        origin[col] = origin[col].astype(int)
        if invert:
            origin[target_col_name] = n_level-origin[col]+1
            
        else:
            origin[target_col_name] = origin[target_col_name]+1
        origin[col] = tmp
        
        if balanced_scale:
            if avg_perf is None:
                avg_perf = origin[target_col_name].mean()
                avg_perf = int(avg_perf)
            origin[target_col_name] -= avg_perf
        else:
            avg_perf = None
        parameters = (col, how, n_level, invert, None, None, divide_points, balanced_scale, avg_perf)
        
        return origin, parameters
    
    
    
    
def merge_minority(data, by=['difficulty'], by2=['uid', 'day', 'exc_num', 'exc_times'],threshold=10, threshold2=3):
    cnt = 1
    while cnt>0:
        cnt = 0
        for index, group in data.groupby(by):
            d = index
            tmp = group.groupby(by2).mean()
            
            if len(group)<threshold or len(tmp)<threshold2:
                if d<data['difficulty'].max()/2:
                    data.loc[data.difficulty==d, 'difficulty'] +=1
                else:
                    data.loc[data.difficulty==d, 'difficulty'] -=1
                cnt += 1
                continue
            
        print(cnt)
    return data       

def rewrite_env_parameter(if_merge_min, sampling_rate):
    with open('../data/parameter/def_env.p', 'rb') as f:
        tmp = pickle.load(f)
        tmp = tmp + [if_merge_min] + [sampling_rate]
        
    with open('../data/parameter/def_env.p', 'wb') as f:
        pickle.dump(tmp, f)
        
    return tmp

def find_segment(df, cols, values):
    tmp = df.copy()
    for col, value in zip(cols, values):
        tmp = tmp[tmp[col]==value]
        
    return tmp

In [3]:
all_data = pd.read_csv('../data/step1_clear_data.csv')
all_data.head()

Unnamed: 0,day,uid,exc_num,exc_times,inv_force,difficulty,deviation,velocity
0,1.0,1.0,1.1,1.0,484.201507,1,0.005096,0.113868
1,1.0,1.0,1.1,1.0,486.018444,1,0.005096,0.113868
2,1.0,1.0,1.1,1.0,488.478646,1,0.005096,0.113868
3,1.0,1.0,1.1,1.0,489.455159,1,0.005096,0.113868
4,1.0,1.0,1.1,1.0,494.271205,1,0.005096,0.113868


In [4]:
# tmp = all_data.set_index(['day', 'exc_num', 'uid', target_col])

In [5]:
# all_data['vf'] = all_data['velocity']/(all_data['front']+0.1)

In [6]:
# env parameters
sampling_rate = 0.5
merge_min = False

target_col, diff_def, n_class, group, _, if_merge_min, sampleing_rate = rewrite_env_parameter(merge_min, sampling_rate)

### sampling

In [7]:
all_data = all_data.sample(frac=sampling_rate).reset_index(drop=True)
all_data.head()

Unnamed: 0,day,uid,exc_num,exc_times,inv_force,difficulty,deviation,velocity
0,3.0,1.0,3.1,2.0,446.495343,2,0.012169,0.35672
1,4.0,11.0,3.1,1.0,443.03552,2,0.17407,0.910888
2,4.0,1.0,4.1,3.0,487.461856,15,0.024825,0.206453
3,2.0,6.0,2.2,3.0,472.825979,3,0.010569,0.506055
4,1.0,2.0,1.1,3.0,485.70666,1,0.002376,0.226097


### average data

In [8]:
# target_col = 'front'
invert = False
divide_points = None
n_level = 9
how = 'cluster'
# how = 'naive'

item, parameters = ability_level_mapper(all_data, col=target_col, invert=invert, n_level=n_level, how=how,
                                        divide_points=divide_points)
ability_level_mapper(all_data, parameters=parameters)
if merge_min:
    item = merge_minority(item)
    
item = item[['day', 'exc_num', 'uid', 'exc_times', 'difficulty','performance']]
# item = item[['day', 'exc_num', 'uid', 'exc_times', 'difficulty','performance']]. \
#     groupby(by=['day', 'exc_num', 'uid', 'exc_times', 'difficulty']).mean()

# print(item['performance'].describe())
# uid_avg_score_per_item = item.reset_index()[['day', 'exc_num', 'uid', 'performance']].
#     groupby(by=['day', 'exc_num', 'uid']).mean()
# uid_avg_score_per_item = uid_avg_score_per_item.reset_index()
# uid_avg_score_per_item.describe()
with open('../data/parameter/ability_mapper.p', 'wb') as f:
    pickle.dump(parameters, f)
with open('../data/parameter/ability_mapper2.p', 'wb') as f:
    pickle.dump(parameters, f, protocol=2)

item.head()


Unnamed: 0_level_0,day,exc_num,uid,exc_times,difficulty,performance
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.0,3.1,1.0,2.0,2,4
1,4.0,3.1,11.0,1.0,2,4
2,4.0,4.1,1.0,3.0,15,8
3,2.0,2.2,6.0,3.0,3,7
4,1.0,1.1,2.0,3.0,1,8


In [9]:
item.describe()

Unnamed: 0,day,exc_num,uid,exc_times,difficulty,performance
count,148854.0,148854.0,148854.0,148854.0,148854.0,148854.0
mean,3.32421,2.90876,5.38922,2.336988,5.485147,6.053032
std,1.167841,0.911718,3.095653,1.415064,5.503145,2.291468
min,1.0,1.1,1.0,1.0,1.0,1.0
25%,2.0,2.1,3.0,1.0,1.0,4.0
50%,3.0,3.1,5.0,2.0,2.0,6.0
75%,4.0,4.1,8.0,3.0,13.0,8.0
max,5.0,4.2,11.0,8.0,18.0,10.0


In [10]:
print(item.head())
item.to_csv('../data/step2_expected_performance.csv', index=False)

       day  exc_num   uid  exc_times  difficulty  performance
index                                                        
0      3.0      3.1   1.0        2.0           2            4
1      4.0      3.1  11.0        1.0           2            4
2      4.0      4.1   1.0        3.0          15            8
3      2.0      2.2   6.0        3.0           3            7
4      1.0      1.1   2.0        3.0           1            8


In [11]:
find_segment(item,['uid', 'day', 'exc_num', 'exc_times'], [1,1,1.1,1])

Unnamed: 0_level_0,day,exc_num,uid,exc_times,difficulty,performance
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
283,1.0,1.1,1.0,1.0,4,10
1212,1.0,1.1,1.0,1.0,4,10
1549,1.0,1.1,1.0,1.0,4,9
1611,1.0,1.1,1.0,1.0,4,10
1714,1.0,1.1,1.0,1.0,1,7
2807,1.0,1.1,1.0,1.0,1,9
2962,1.0,1.1,1.0,1.0,4,10
3661,1.0,1.1,1.0,1.0,1,10
6062,1.0,1.1,1.0,1.0,1,10
7930,1.0,1.1,1.0,1.0,4,9


#### unidemention testing (TODO)

In [12]:
# check pearson coefficient between different items

tmp = uid_avg_score_per_item.reset_index()
# groups = tmp[(tmp['day']==1) & (tmp['exc_num']!=1.1)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==2)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==2) & (tmp['exc_num']>2)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==3)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==3) & (tmp['exc_num']>3)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==4) & (tmp['exc_num']>4)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==5) & ((tmp['exc_num']>4) | (tmp['exc_num']<2))].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==5)].groupby(by=['exc_num'])
# groups = tmp[((tmp['day']==2) | (tmp['day']==3)) & (tmp['exc_num']==2.3)].groupby(by=['day', 'exc_num'])
# groups = tmp[((tmp['day']==2) | (tmp['day']==3)) ].groupby(by=['day', 'exc_num'])
groups = tmp[((tmp['day']==2) | (tmp['day']==1)) ].groupby(by=['day', 'exc_num'])
for index1, group1 in groups:
    set1 = group1['uid'].tolist()

    for index2, group2 in groups:
        if index1==index2:
            continue
        else:
            set2 = group2['uid'].tolist()
            common_uid = list(set(set1).intersection(set2))
            l1 = group1.set_index(['uid']).loc[common_uid, 'performance']
            l2 = group2.set_index(['uid']).loc[common_uid, 'performance']
#             print(group1.set_index(['uid']).loc[common_uid])
#             print(group2.set_index(['uid']).loc[common_uid])
            pearson, p_vlaue = pearsonr(l1, l2)
            print(index1, index2)
            print(pearson)

NameError: name 'uid_avg_score_per_item' is not defined

In [None]:
groups = tmp[(tmp['day']==1) & (tmp['exc_num']>1)].groupby(by=['exc_num'])
sns.set_style('whitegrid')
f, ax= plt.subplots(figsize = (14, 10))
for index, group in groups:
#     print(group)
    
    ax = sns.lineplot(x='uid', y='performance', data=group.reset_index(drop=False))
#     ax.set_title('exc_num is'+str(index))

#### item effectivity

In [None]:
exc_nums = tmp['exc_num'].unique()
for en in exc_nums:
    data = tmp[tmp['exc_num']==en]
    print(data)
#     sns.set_style('whitegrid')
#     f, ax= plt.subplots(figsize = (14, 10))
#     ax = sns.lineplot(x='uid', y=target_col, hue='day', data=data.reset_index())
#     ax.set_title('exc_num is'+str(en))