In [24]:
import pandas as pd
import numpy as np
from pprint import pprint
import copy

def learn_transition_reward_funtion(df, state_count, action_count):
    """Estimate tabular reward and transition models from logged transitions.

    ``df`` must provide the columns ``s`` (state), ``a`` (action), ``sp``
    (next state) and ``r`` (reward). State and action labels are treated as
    1-based: label ``k`` is stored at array position ``k - 1``.

    Returns a three-element list:
      * reward array of shape (state_count, action_count) holding the mean
        reward per (s, a); pairs that never occur stay 0,
      * transition array of shape (state_count, action_count, state_count)
        holding count(s, a, sp) / count(s, a); unseen triples stay 0,
      * the per-(s, a) summary DataFrame with columns
        s, a, r_sum, r_count, avg_r.
    """
    pair_keys = ['s', 'a']
    triple_keys = ['s', 'a', 'sp']

    # Per-(s, a) aggregates: non-null reward count and reward total.
    by_pair = df[["s", "a", "r"]].groupby(pair_keys)
    pair_counts = by_pair.count().add_suffix('_count').reset_index()
    reward_sums = by_pair.sum().add_suffix('_sum').reset_index()

    # Per-(s, a, sp) non-null reward count.
    triple_counts = (
        df[["s", "a", "sp", "r"]]
        .groupby(triple_keys)
        .count()
        .add_suffix('_count')
        .reset_index()
    )

    # Mean reward for every observed (s, a).
    R_s_a = pd.merge(reward_sums, pair_counts, how='left', on=pair_keys)
    R_s_a['avg_r'] = R_s_a['r_sum'] / R_s_a['r_count']

    # Empirical P(sp | s, a); after the merge r_count_x is the triple count
    # and r_count_y is the pair count.
    transitions = pd.merge(triple_counts, pair_counts, how='left', on=pair_keys)
    transitions['sp_pos'] = transitions['r_count_x'] / transitions['r_count_y']

    # Scatter the tables into dense arrays, shifting 1-based labels to 0-based.
    R_s_a_array = np.zeros((state_count, action_count))
    R_s_a_array[
        R_s_a['s'].values.astype(int) - 1,
        R_s_a['a'].values.astype(int) - 1,
    ] = R_s_a['avg_r'].values

    T_sp_s_a_array = np.zeros((state_count, action_count, state_count))
    T_sp_s_a_array[
        transitions['s'].values.astype(int) - 1,
        transitions['a'].values.astype(int) - 1,
        transitions['sp'].values.astype(int) - 1,
    ] = transitions['sp_pos'].values

    return [R_s_a_array, T_sp_s_a_array, R_s_a]


def learn_optimal_value_function_gauss_seidel(gamma,
                                              R_s_a_array,
                                              T_sp_s_a_array,
                                              state_count,
                                              action_count,
                                              num_iteration,
                                              avg_rewards):
    """Run Gauss-Seidel value iteration and return the state-value array.

    Parameters
    ----------
    gamma : discount factor applied to the expected next-state value.
    R_s_a_array : array indexed [state, action] with immediate rewards.
    T_sp_s_a_array : array indexed [state, action, next_state] with
        transition probabilities.
    state_count : length of the returned value array.
    action_count : number of actions evaluated per state.
    num_iteration : number of full sweeps to perform.
    avg_rewards : DataFrame with columns ``s`` (1-based state label) and
        ``avg_r``. Only states listed here are updated, swept in descending
        order of their best ``avg_r``; all other states keep value 0.

    Returns
    -------
    numpy array of length ``state_count``. Values are updated in place during
    each sweep, so later states in a sweep already see the new values of
    earlier ones.

    Side effects: prints the sweep number and the current maximum value after
    every sweep.
    """
    avg_rewards_by_state = (
        avg_rewards[["s", "avg_r"]]
        .groupby("s")
        .max()
        .add_suffix('_max')
        .sort_values("avg_r_max", ascending=False)
        .reset_index()
    )
    # Cast labels to int before shifting to 0-based positions: a float-typed
    # ``s`` column would otherwise yield float indices, which numpy rejects.
    sweep_order = [int(state) - 1 for state in avg_rewards_by_state["s"].tolist()]

    U_array = np.zeros(state_count)
    for k in range(0, num_iteration):
        print("K: " + str(k))
        for i in sweep_order:
            # Start from -inf rather than a finite sentinel so that strongly
            # negative action values are not clamped.
            action_max = -np.inf
            for j in range(0, action_count):
                r_immed = R_s_a_array[i, j]
                r_discounted = gamma * np.dot(T_sp_s_a_array[i, j, :], U_array)
                r_total = r_immed + r_discounted
                if r_total > action_max:
                    action_max = r_total
            U_array[i] = action_max
        U_max = np.max(U_array)
        print(U_max)
    return U_array

def learn_policy(state_count,action_count,R_s_a_array,T_sp_s_a_array,U_array,gamma):
    """Extract the greedy policy implied by a state-value array.

    For every state index ``i`` in ``range(state_count)`` the action maximising
    ``R[i, j] + gamma * T[i, j, :] . U`` is selected; ties go to the
    lowest-numbered action.

    Returns a float numpy array of length ``state_count`` holding 1-based
    action labels (position ``i`` is the action for state label ``i + 1``).

    Side effect: prints the resulting policy array.
    """
    policy_array = np.zeros(state_count)
    for i in range(0, state_count):
        max_index = 0
        # -inf instead of a finite sentinel: with a finite floor, a state whose
        # action values all lie below it would always be assigned action 1.
        max_value = -np.inf
        for j in range(0, action_count):
            temp_value = R_s_a_array[i, j] + gamma * np.dot(T_sp_s_a_array[i, j, :], U_array)
            if temp_value > max_value:
                max_index = j
                max_value = temp_value
        # Stored into a float array, so the label comes back as e.g. 7.0.
        policy_array[i] = max_index + 1
    print(policy_array)
    return policy_array

In [2]:
#### Read data
movie_df = pd.read_csv("final_labeled_data_v2.csv")
# Remove the 'Unnamed: 0' column (presumably a row index saved by an earlier
# to_csv -- confirm). The axis is passed by keyword because pandas 2.0 no
# longer accepts it positionally.
df = movie_df.drop('Unnamed: 0', axis=1)
user_list = movie_df[['user_id']].drop_duplicates()

#### Split users into train & test
# The split is done on distinct users, so all rows of one user land on the
# same side.
from sklearn.model_selection import train_test_split
# Fixed seed so the split, and every result derived from it, is reproducible.
SPLIT_SEED = 42
train_user, test_user = train_test_split(user_list, test_size=0.2, random_state=SPLIT_SEED)
train = df.merge(train_user, on=['user_id'], how='inner')
test = df.merge(test_user, on=['user_id'], how='inner')

In [31]:
# Persist both splits to CSV with default to_csv settings.
for _split_frame, _split_path in ((train, 'final_labeled_data_train.csv'),
                                  (test, 'final_labeled_data_test.csv')):
    _split_frame.to_csv(_split_path)

In [25]:
# Model dimensions (labels run 1..14 for both states and actions) and the
# discount factor used by value iteration and policy extraction.
state_count = action_count = 14
gamma = 0.95

# [reward array, transition array, per-(s, a) summary frame] estimated from
# the training split.
df_functions = learn_transition_reward_funtion(
    train,
    state_count,
    action_count,
)

In [26]:
# Unpack the estimated model: rewards, transitions, per-(s, a) summary.
R_s_a_array, T_sp_s_a_array, avg_rewards = df_functions

# 200 Gauss-Seidel sweeps, then the greedy policy for the resulting values.
U_array = learn_optimal_value_function_gauss_seidel(
    gamma=gamma,
    R_s_a_array=R_s_a_array,
    T_sp_s_a_array=T_sp_s_a_array,
    state_count=state_count,
    action_count=action_count,
    num_iteration=200,
    avg_rewards=avg_rewards,
)
policy = learn_policy(
    state_count=state_count,
    action_count=action_count,
    R_s_a_array=R_s_a_array,
    T_sp_s_a_array=T_sp_s_a_array,
    U_array=U_array,
    gamma=gamma,
)

K: 0
0.545454545455
K: 1
-0.096630626801
K: 2
-1.23285769607
K: 3
-2.5244089088
K: 4
-3.81839295005
K: 5
-5.05292600654
K: 6
-6.20679393947
K: 7
-7.27579030019
K: 8
-8.26239155718
K: 9
-9.17145412935
K: 10
-10.0084806038
K: 11
-10.7789458817
K: 12
-11.4880510289
K: 13
-12.1406466126
K: 14
-12.7412217377
K: 15
-13.2939175335
K: 16
-13.8025488642
K: 17
-14.2706279714
K: 18
-14.7013876696
K: 19
-15.0978032599
K: 20
-15.4626129169
K: 21
-15.7983365388
K: 22
-16.107293128
K: 23
-16.3916168002
K: 24
-16.6532715211
K: 25
-16.894064672
K: 26
-17.1156595326
K: 27
-17.3195867709
K: 28
-17.5072550155
K: 29
-17.6799605861
K: 30
-17.8388964473
K: 31
-17.9851604498
K: 32
-18.119762913
K: 33
-18.2436336043
K: 34
-18.3576281614
K: 35
-18.4625340023
K: 36
-18.5590757648
K: 37
-18.6479203117
K: 38
-18.7296813374
K: 39
-18.8049236067
K: 40
-18.8741668559
K: 41
-18.9378893833
K: 42
-18.9965313527
K: 43
-19.0504978338
K: 44
-19.1001616004
K: 45
-19.1458657054
K: 46
-19.1879258501
K: 47
-19.2266325653
K: 48

In [32]:
# Optional debug dumps of the estimated model (left disabled):
# print(R_s_a_array)
# print(T_sp_s_a_array)
# print(avg_rewards)
# Show the learned policy; position i holds the action label for state i + 1.
print(policy)

[  7.   2.   3.   4.   7.   5.   7.   8.   7.  10.   7.   7.  13.   7.]


In [33]:
# p1 = the label the learned policy assigns to each row's state s; it is
# compared against the observed next state sp in the evaluation cell.
# Built from `policy` itself rather than hand-copied numbers, so it stays
# correct when the policy changes after a re-run.
_policy_lookup = {state: int(choice) for state, choice in enumerate(policy, start=1)}
# States the policy does not cover keep p1 == s, as before.
test['p1'] = test['s'].map(_policy_lookup).fillna(test['s']).astype(test['s'].dtype)

In [36]:
# Share of positive-reward test rows whose observed next state equals p1.
r1 = test[test.r > 0]
_matches = (r1.sp == r1.p1).sum()
print(float(_matches) / r1.shape[0])

In [None]:
######## Archive #########
#df.loc[df['r'] == -5] = 0