In [1]:
%reset -f

import pandas as pd
import os
import numpy as np
from six.moves import xrange
import math
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import pickle

In [2]:
def ability_level_mapper(data, groups=None, col='front', how='naive', n_level=19, 
                         invert=True, parameters=None, imbalanced_data=False, target_col_name='performance'):
    # the raw data is divided into groups according to its exc_num, ability levels are calculated respectively
    # how: 1 is mapping without any other processing, called 'naive'
    origin = data.copy()
    # if target col is imbalanced, here we seperate it in parts with different scale value
    if parameters is not None:
        col, how, n_level, invert, v_max, v_min, imbalanced_data = parameters
        
    if imbalanced_data is not False:
        
        if imbalanced_data is True:
            origin['cnt'] = 1
            origin = origin.sort_values(by=col)
            origin = origin.reset_index()
            origin['cnt'] = origin['cnt'].cumsum()
            while(1):
                total = len(origin)
                each = int(total/(n_level+1))
                imbalanced_data = []
                for i in xrange(n_level):
                    v = origin.loc[(i+1)*each, col]
                    if v in imbalanced_data:
                        n_level -= 1
                        continue
                    else:
                        imbalanced_data.append(v)
                origin = origin.drop(['cnt'], axis=1)
                break
                    
        if type(imbalanced_data) is list:
            assert n_level==len(imbalanced_data), 'false length of imbalanced_data'
            # elements in inbalaced_data are ordered increasely
            origin[target_col_name] = 0
            for i,v in enumerate(imbalanced_data):
                if invert:
                    origin.loc[(origin.performance==0) & (origin[col]<=v), target_col_name] = n_level-i+1
                else:
                    origin.loc[(origin.performance==0) & (origin[col]<=v), target_col_name] = i+1
            if invert:
                origin.loc[origin.performance==0, target_col_name] = 1
            else:
                origin.loc[origin.performance==0, target_col_name] = n_level+1
                
        parameters = (col, how, n_level, invert, None, None, imbalanced_data)
        if 'index' in origin.columns:
            origin = origin.sort_values(by=['index'])
            origin = origin.set_index(['index'])
        return origin, parameters
            
    if parameters is not None:
        col, how, n_level, invert, v_max, v_min, imbalanced_data = parameters
        interval = (v_max- v_min)/n_level
        assert interval!=0, 'zero dividend'
        origin[target_col_name] = (origin[col]-v_min)/interval
        
        origin[target_col_name] = origin[target_col_name].astype(int)
        if invert:
            origin[target_col_name] = n_level-origin[target_col_name]+1
        else:
            origin[target_col_name] = origin[target_col_name]+1
        
        return origin, parameters
        
    if groups == None:
        v_max = origin[col].max()
        v_min = origin[col].min()
        
        interval = (v_max- v_min)/n_level
        assert interval!=0, 'zero dividend'
        origin[target_col_name] = (origin[col]-v_min)/interval
        
        origin[target_col_name] = origin[target_col_name].astype(int)
        if invert:
            origin[target_col_name] = n_level-origin[target_col_name]+1
        else:
            origin[target_col_name] = origin[target_col_name]+1
        
        parameters = (col, how, n_level, invert, v_max, v_min, False)
        return origin, parameters
    
    if how == 1 or how=='naive':
        tmp = origin[col]
        for index, group in groups:
            
            v_max = group[col].max()
            v_min = group[col].min()
            
            interval = (v_max-v_min)/n_level
            
            assert interval!=0, 'zero dividend'
            
            origin.loc[index, col] = (origin.loc[index, col]-v_min)/interval
        origin[col] = origin[col].astype(int)
        if invert:
            origin[target_col_name] = n_level-origin[col]+1
            
        else:
            origin[target_col_name] = origin[target_col_name]+1
        origin[col] = tmp
        parameters = (col, how, n_level, invert, v_max, v_min, False )
        
        return origin, parameters
    
    
def merge_minority(data, by=['difficulty'], by2=['uid', 'day', 'exc_num', 'exc_times'],threshold=10, threshold2=3):
    cnt = 1
    while cnt>0:
        cnt = 0
        for index, group in data.groupby(by):
            d = index
            tmp = group.groupby(by2).mean()
            
            if len(group)<threshold or len(tmp)<threshold2:
                if d<data['difficulty'].max()/2:
                    data.loc[data.difficulty==d, 'difficulty'] +=1
                else:
                    data.loc[data.difficulty==d, 'difficulty'] -=1
                cnt += 1
                continue
            
        print(cnt)
    return data       

def rewrite_env_parameter(if_merge_min, sampling_rate):
    with open('../data/parameter/def_env.p', 'rb') as f:
        tmp = pickle.load(f)
        tmp = tmp + [if_merge_min] + [sampling_rate]
        
    with open('../data/parameter/def_env.p', 'wb') as f:
        pickle.dump(tmp, f)
        
    return tmp

In [3]:
all_data = pd.read_csv('../data/step1_clear_data.csv')
all_data.head()

Unnamed: 0,day,uid,exc_num,exc_times,inv_tt_freq,difficulty,deviation,velocity
0,1.0,1.0,1.1,1.0,63.0,1,0.005096,0.113868
1,1.0,1.0,1.1,1.0,63.0,1,0.005096,0.113868
2,1.0,1.0,1.1,1.0,63.0,1,0.005096,0.113868
3,1.0,1.0,1.1,1.0,63.0,1,0.005096,0.113868
4,1.0,1.0,1.1,1.0,63.0,1,0.005096,0.113868


In [4]:
# tmp = all_data.set_index(['day', 'exc_num', 'uid', target_col])

In [5]:
# all_data['vf'] = all_data['velocity']/(all_data['front']+0.1)

In [6]:
# env parameters
sampling_rate = 0.5
merge_min = False

target_col, diff_def, n_class, group, _, if_merge_min, sampleing_rate = rewrite_env_parameter(merge_min, sampling_rate)

### sampling

In [7]:
all_data = all_data.sample(frac=sampling_rate).reset_index(drop=True)
all_data.head()

Unnamed: 0,day,uid,exc_num,exc_times,inv_tt_freq,difficulty,deviation,velocity
0,5.0,5.0,4.2,1.0,41.0,14,0.014294,0.472863
1,2.0,11.0,2.2,1.0,50.0,1,0.58653,0.594613
2,5.0,1.0,1.5,2.0,48.0,7,0.027822,0.487587
3,5.0,5.0,4.2,2.0,42.0,14,0.015891,0.600369
4,3.0,8.0,3.2,1.0,54.0,2,0.057048,0.569676


### average data

In [8]:
# target_col = 'front'
invert = False
imbalanced_data = False
n_level = 19

item, parameters = ability_level_mapper(all_data, col=target_col, invert=invert, n_level=n_level, 
                                        imbalanced_data=imbalanced_data)
if merge_min:
    item = merge_minority(item)
    
item = item[['day', 'exc_num', 'uid', 'exc_times', 'difficulty','performance']]
# item = item[['day', 'exc_num', 'uid', 'exc_times', 'difficulty','performance']]. \
#     groupby(by=['day', 'exc_num', 'uid', 'exc_times', 'difficulty']).mean()

# print(item['performance'].describe())
# uid_avg_score_per_item = item.reset_index()[['day', 'exc_num', 'uid', 'performance']].
#     groupby(by=['day', 'exc_num', 'uid']).mean()
# uid_avg_score_per_item = uid_avg_score_per_item.reset_index()
# uid_avg_score_per_item.describe()
with open('../data/parameter/ability_mapper.p', 'wb') as f:
    pickle.dump(parameters, f)
with open('../data/parameter/ability_mapper2.p', 'wb') as f:
    pickle.dump(parameters, f, protocol=2)

item.head()

Unnamed: 0,day,exc_num,uid,exc_times,difficulty,performance
0,5.0,4.2,5.0,1.0,14,13
1,2.0,2.2,11.0,1.0,1,15
2,5.0,1.5,1.0,2.0,7,15
3,5.0,4.2,5.0,2.0,14,13
4,3.0,3.2,8.0,1.0,2,17


In [9]:
item.describe()

Unnamed: 0,day,exc_num,uid,exc_times,difficulty,performance
count,146723.0,146723.0,146723.0,146723.0,146723.0,146723.0
mean,3.327324,2.912502,5.399992,2.332818,5.498313,11.845287
std,1.16768,0.911896,3.10279,1.413823,5.509484,2.68368
min,1.0,1.1,1.0,1.0,1.0,1.0
25%,2.0,2.1,3.0,1.0,1.0,10.0
50%,3.0,3.1,5.0,2.0,2.0,12.0
75%,4.0,4.1,8.0,3.0,13.0,14.0
max,5.0,4.2,11.0,8.0,18.0,20.0


In [10]:
print(item.head())
item.to_csv('../data/step2_expected_performance.csv', index=False)

   day  exc_num   uid  exc_times  difficulty  performance
0  5.0      4.2   5.0        1.0          14           13
1  2.0      2.2  11.0        1.0           1           15
2  5.0      1.5   1.0        2.0           7           15
3  5.0      4.2   5.0        2.0          14           13
4  3.0      3.2   8.0        1.0           2           17


#### unidemention testing (TODO)

In [11]:
# check pearson coefficient between different items

tmp = uid_avg_score_per_item.reset_index()
# groups = tmp[(tmp['day']==1) & (tmp['exc_num']!=1.1)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==2)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==2) & (tmp['exc_num']>2)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==3)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==3) & (tmp['exc_num']>3)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==4) & (tmp['exc_num']>4)].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==5) & ((tmp['exc_num']>4) | (tmp['exc_num']<2))].groupby(by=['exc_num'])
# groups = tmp[(tmp['day']==5)].groupby(by=['exc_num'])
# groups = tmp[((tmp['day']==2) | (tmp['day']==3)) & (tmp['exc_num']==2.3)].groupby(by=['day', 'exc_num'])
# groups = tmp[((tmp['day']==2) | (tmp['day']==3)) ].groupby(by=['day', 'exc_num'])
groups = tmp[((tmp['day']==2) | (tmp['day']==1)) ].groupby(by=['day', 'exc_num'])
for index1, group1 in groups:
    set1 = group1['uid'].tolist()

    for index2, group2 in groups:
        if index1==index2:
            continue
        else:
            set2 = group2['uid'].tolist()
            common_uid = list(set(set1).intersection(set2))
            l1 = group1.set_index(['uid']).loc[common_uid, 'performance']
            l2 = group2.set_index(['uid']).loc[common_uid, 'performance']
#             print(group1.set_index(['uid']).loc[common_uid])
#             print(group2.set_index(['uid']).loc[common_uid])
            pearson, p_vlaue = pearsonr(l1, l2)
            print(index1, index2)
            print(pearson)

NameError: name 'uid_avg_score_per_item' is not defined

In [None]:
groups = tmp[(tmp['day']==1) & (tmp['exc_num']>1)].groupby(by=['exc_num'])
sns.set_style('whitegrid')
f, ax= plt.subplots(figsize = (14, 10))
for index, group in groups:
#     print(group)
    
    ax = sns.lineplot(x='uid', y='performance', data=group.reset_index(drop=False))
#     ax.set_title('exc_num is'+str(index))

#### item effectivity

In [None]:
exc_nums = tmp['exc_num'].unique()
for en in exc_nums:
    data = tmp[tmp['exc_num']==en]
    print(data)
#     sns.set_style('whitegrid')
#     f, ax= plt.subplots(figsize = (14, 10))
#     ax = sns.lineplot(x='uid', y=target_col, hue='day', data=data.reset_index())
#     ax.set_title('exc_num is'+str(en))