## Truc Tran -- i6213546
### This file analyzes the results for the 1st, 2nd and 3rd research questions
### Users are divided randomly into 3 groups:
  Group A: The original Shadow Habtonomics Recommender
  
  Group B: The Multiply Hybrid Recommender
  
  Group C: The Convex Hybrid Recommender
  
  Significance levels used: alpha = 0.1 and 0.2

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

### Some defined functions

In [None]:
# Phase boundaries (ISO date strings) compared against "TimestampUtc" in count_block_session.
mark1 = "2022-04-05"
mark2 = "2022-04-26"
mark3 = "2022-05-17"
mark4 = "2022-05-24"
mark5 = "2022-06-15"
# Significance levels used by the hypothesis tests (alpha[0] also drives the normality checks).
alpha = [0.1, 0.2]
def take_group(_data:pd.DataFrame):
  """
    Split users into the three testing groups.

    The group is derived from the sum of the digits found in "UserId":
    sum % 3 == 0 -> group 1, == 1 -> group 2, == 2 -> group 3
    (rule proposed by the Back-end team).

    args: user dataframe with "UserId", "average_historic" and
          "count_testing_period" columns (the input is not modified)
    return: 3 dataframes (group 1, group 2, group 3), each re-indexed from 0 and
            holding only "average_historic" and "count_testing_period"
  """
  frame = _data.copy()

  # Sum of the digit characters of every user id; non-digit characters are ignored.
  digit_sums = []
  for uid in frame["UserId"]:
    digit_sums.append(sum(int(ch) for ch in uid if ch.isdigit()))

  frame["user_val"] = digit_sums
  # Digit sums are non-negative, so remainder 0/1/2 maps directly to group 1/2/3.
  frame["user_group"] = [total % 3 + 1 for total in digit_sums]

  wanted = ["average_historic", "count_testing_period"]

  def members(group_id):
    return frame.loc[frame["user_group"] == group_id, wanted].reset_index(drop=True)

  return members(1), members(2), members(3)


def count_block_session(_data:pd.DataFrame):
  """
    Count, per user, how many block events fall into each analysis phase.

    Only events with mark1 <= TimestampUtc <= mark5 are considered. Each phase
    is a half-open interval [start, end):
      block_count_1: from 05-04 to 26-04 (mark1 to mark2)
      block_count_2: from 26-04 to 17-05 (mark2 to mark3)
      count_testing_period: from 24-05 up to (not including) 15-06 (mark4 to mark5)
      average_historic: row-wise mean of block_count_1 and block_count_2
                        (missing phases are skipped); compared with the testing period
    A user without events in a phase gets NaN for that phase.

    args: event dataframe with "UserId", "TimestampUtc" and "BlockCode" columns
    return: dataframe with one row per user (in order of first appearance) and
            the columns UserId, block_count_1, block_count_2,
            count_testing_period, average_historic
  """
  in_window = _data[(mark1 <= _data["TimestampUtc"]) & (_data["TimestampUtc"] <= mark5)]
  stamps = in_window["TimestampUtc"]

  def phase_count(start, end, column):
    # Number of non-null BlockCode entries per user within [start, end).
    events = in_window[(start <= stamps) & (stamps < end)]
    return events.groupby(["UserId"])["BlockCode"].count().rename(column).reset_index()

  phase_counts = (
    phase_count(mark1, mark2, "block_count_1"),
    phase_count(mark2, mark3, "block_count_2"),
    phase_count(mark4, mark5, "count_testing_period"),
  )

  # Merge the per-user counts onto the list of distinct users instead of onto
  # every event row: same result, without building and de-duplicating an
  # event-sized table.
  result = in_window[["UserId"]].drop_duplicates()
  for counts in phase_counts:
    result = result.merge(counts, on="UserId", how="left")
  result = result.reset_index(drop=True)
  result["average_historic"] = result[["block_count_1", "block_count_2"]].mean(skipna=True, axis=1)
  return result

def cohend(a, b, ind=True) -> float:
  """
    method to calculate the Cohen's d effect size of two groups
    args: 2 1-D numerical sequences a, b (lists, numpy arrays, pandas Series, ...)
          boolean ind: True if a and b are independent samples,
                       False if they are paired (must then have equal length)
    return: Cohen's d effect size between a and b
            independent: (mean(a) - mean(b)) / pooled standard deviation
            paired:      mean(a - b) / standard deviation of (a - b)
            The result is nan/inf when the standard deviation is 0.
  """
  # Convert once so both branches accept any sequence; the paired branch
  # subtracts element-wise by position, which plain lists do not support.
  a = np.asarray(a, dtype=float)
  b = np.asarray(b, dtype=float)

  if ind:
    n1, n2 = len(a), len(b)
    var1, var2 = np.var(a, ddof=1), np.var(b, ddof=1)
    # Pooled standard deviation, weighting each sample variance by its degrees of freedom.
    pooled = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
    return (np.mean(a) - np.mean(b)) / pooled

  difference = a - b
  return np.mean(difference) / np.std(difference, ddof=1)

## RESEARCH QUESTION 1: does the number of finished blocks improve? (block_finished event) (historic vs testing)
  One-tailed paired test

In [None]:
# Only take the blocks which were completed.
blockcomplete = spark.read.format("delta").table("norm_events_block_finished").toPandas()
blockcomplete = blockcomplete[blockcomplete["CompletionState"] == 1].reset_index(drop=True)
data = count_block_session(blockcomplete)
data_A, data_B, data_C = take_group(data)
# The paired test needs both measurements, so drop users missing either of them.
_data_A, _data_B, _data_C = data_A.dropna(), data_B.dropna(), data_C.dropna()

# Remove outliers: keep only rows whose |z-score| is below 2 in every column.
_data_A = _data_A[(np.abs(stats.zscore(_data_A)) < 2).all(axis=1)]
_data_B = _data_B[(np.abs(stats.zscore(_data_B)) < 2).all(axis=1)]
_data_C = _data_C[(np.abs(stats.zscore(_data_C)) < 2).all(axis=1)]

# Plot data
for group_title, group_frame in (("Group A", _data_A), ("Group B", _data_B), ("Group C", _data_C)):
  group_frame.boxplot()
  plt.ylabel("number of blocks")
  plt.title(group_title)
  plt.show()

# Row 0 is the historic average, row 1 is the testing period.
A = np.array([np.array(_data_A['average_historic']), np.array(_data_A['count_testing_period'])])
B = np.array([np.array(_data_B['average_historic']), np.array(_data_B['count_testing_period'])])
C = np.array([np.array(_data_C['average_historic']), np.array(_data_C['count_testing_period'])])

# Test normality of the paired differences (testing - historic).
# Small dataset, so use the Shapiro-Wilk test here.
a_normal = stats.shapiro(A[1] - A[0])[1]
b_normal = stats.shapiro(B[1] - B[0])[1]
c_normal = stats.shapiro(C[1] - C[0])[1]


def _paired_pvalue(group_name, samples, normality_pvalue):
  """
    One-sided paired test of H1: testing period > historic average.
    Uses the Wilcoxon signed-rank test when normality of the differences is
    rejected at alpha[0], the paired t test otherwise.
    args: group_name: label used in the printed message
          samples: array with row 0 = historic, row 1 = testing
          normality_pvalue: p value of the normality test on the differences
    return: p value of the selected test
  """
  historic, testing = samples[0], samples[1]
  # With alternative="greater" scipy tests "first sample > second sample", so
  # the testing period has to be passed first to match the hypothesis.
  if normality_pvalue <= alpha[0]:
    print("Difference pairs of group " + group_name + " is not normally distributed")
    return stats.wilcoxon(testing, historic, alternative="greater")[1]
  print("Difference pairs of group " + group_name + " is normally distributed")
  return stats.ttest_rel(testing, historic, alternative="greater")[1]


# Do the test
a_pvalue = _paired_pvalue("A", A, a_normal)
b_pvalue = _paired_pvalue("B", B, b_normal)
c_pvalue = _paired_pvalue("C", C, c_normal)

print("p value for the test of group A: ", a_pvalue)
print("p value for the test of group B: ", b_pvalue)
print("p value for the test of group C: ", c_pvalue)

for level in alpha:
  print("With alpha = ", level)
  for group_name, group_pvalue in (("B", b_pvalue), ("C", c_pvalue)):
    if group_pvalue <= level:
      print("   Reject the null hypothesis for group " + group_name + ", mean value of testing period is greater than that of average historic period")
    else:
      print("   Cannot reject the null hypothesis for group " + group_name)

# Effect size of (testing - historic), so a positive value means an improvement,
# in line with the direction of the tests above.
print("effect size between testing and recent period of group B: ", cohend(B[1], B[0], False))
print("effect size between testing and recent period of group C: ", cohend(C[1], C[0], False))

## RESEARCH QUESTION 2: make better suggestions? (block_started and block_finished event) (A vs B and A vs C)
### unpaired t test 
### First part: On block started event
### Second part: On block finish event
  1 tailed unpaired t test

In [None]:
# Test on block started event
blockstart = spark.read.format("delta").table("norm_events_block_started").toPandas()
data = count_block_session(_data=blockstart)
data_A, data_B, data_C = take_group(data)

# Keep only "count_testing_period" and filter out NaN values.
_data_A = data_A["count_testing_period"].dropna()
_data_B = data_B["count_testing_period"].dropna()
_data_C = data_C["count_testing_period"].dropna()

# Remove outliers: keep values whose |z-score| is at most 2.
_data_A = _data_A[np.abs(stats.zscore(_data_A)) <= 2]
_data_B = _data_B[np.abs(stats.zscore(_data_B)) <= 2]
_data_C = _data_C[np.abs(stats.zscore(_data_C)) <= 2]

# Plain arrays for the unpaired tests.
A = np.array(_data_A)
B = np.array(_data_B)
C = np.array(_data_C)

# Plot data
plt.boxplot(x = [A, B, C], labels = ["group A", "group B", "group C"])
plt.ylabel("number of blocks")
plt.grid(True)
plt.title("count testing period - block_started")
plt.show()

# Test normality of each group.
anormal = stats.normaltest(A)[1]
bnormal = stats.normaltest(B)[1]
cnormal = stats.normaltest(C)[1]
for group_name, normal_p in (("A", anormal), ("B", bnormal), ("C", cnormal)):
  verdict = "is not normally distributed" if normal_p <= alpha[0] else "is normally distributed"
  print("data from group " + group_name + " " + verdict)

# Do the test: t test when both groups look normal, Mann-Whitney U otherwise.
a_looks_normal = anormal > alpha[0]
b_vs_a = stats.ttest_ind if (a_looks_normal and bnormal > alpha[0]) else stats.mannwhitneyu
b_a_pvalue = b_vs_a(B, A, alternative="greater")[1]
c_vs_a = stats.ttest_ind if (a_looks_normal and cnormal > alpha[0]) else stats.mannwhitneyu
c_a_pvalue = c_vs_a(C, A, alternative="greater")[1]

print("p value for the test between group B and group A: ", b_a_pvalue)
print("p value for the test between group C and group A: ", c_a_pvalue)

for level in alpha:
  print("With alpha = ", level)
  for group_name, group_pvalue in (("B", b_a_pvalue), ("C", c_a_pvalue)):
    if group_pvalue <= level:
      print("   Reject the null hypothesis for the test between group A and " + group_name + ", number of started blocks in group " + group_name + " is greater")
    else:
      print("   Cannot reject the null hypothesis for the test between group A and " + group_name)

for group_name, group_values in (("B", B), ("C", C)):
  print("effect size between group " + group_name + " and A: ", cohend(group_values, A, ind=True))

In [None]:
# Test on block finished event
blockcomplete = spark.read.format("delta").table("norm_events_block_finished").toPandas()
# Only keep the blocks which were completed.
blockcomplete = blockcomplete[blockcomplete["CompletionState"] == 1].reset_index(drop=True)
data = count_block_session(blockcomplete)
data_A, data_B, data_C = take_group(data)

# Keep only "count_testing_period" and filter out NaN values.
_data_A = data_A["count_testing_period"].dropna()
_data_B = data_B["count_testing_period"].dropna()
_data_C = data_C["count_testing_period"].dropna()

# Remove outliers: keep values whose |z-score| is at most 2.
_data_A = _data_A[np.abs(stats.zscore(_data_A)) <= 2]
_data_B = _data_B[np.abs(stats.zscore(_data_B)) <= 2]
_data_C = _data_C[np.abs(stats.zscore(_data_C)) <= 2]

# Plain arrays for the unpaired tests.
A, B, C = np.array(_data_A), np.array(_data_B), np.array(_data_C)

# Plot data
plt.boxplot(x = [A, B, C], labels = ["group A", "group B", "group C"])
plt.ylabel("number of blocks")
plt.grid(True)
plt.title("count testing period - block_finished")
plt.show()

# Test normality of each group. The same "<= alpha[0]" rule is used for all
# three groups so the printed verdict always matches the test selected below.
anormal, bnormal, cnormal = stats.normaltest(A)[1], stats.normaltest(B)[1], stats.normaltest(C)[1]
for group_name, normal_p in (("A", anormal), ("B", bnormal), ("C", cnormal)):
  if normal_p <= alpha[0]:
    print("data from group " + group_name + " is not normally distributed")
  else:
    print("data from group " + group_name + " is normally distributed")

# Do the test: t test when both groups look normal, Mann-Whitney U otherwise.
if anormal > alpha[0] and bnormal > alpha[0]:
  b_a_pvalue = stats.ttest_ind(B, A, alternative="greater")[1]
else:
  b_a_pvalue = stats.mannwhitneyu(B, A, alternative="greater")[1]

if anormal > alpha[0] and cnormal > alpha[0]:
  c_a_pvalue = stats.ttest_ind(C, A, alternative="greater")[1]
else:
  c_a_pvalue = stats.mannwhitneyu(C, A, alternative="greater")[1]

print("p value for the test between group B and group A: ", b_a_pvalue)
print("p value for the test between group C and group A: ", c_a_pvalue)

for level in alpha:
  print("With alpha = ", level)
  for group_name, group_pvalue in (("B", b_a_pvalue), ("C", c_a_pvalue)):
    if group_pvalue <= level:
      # This cell analyses finished blocks, so the conclusion must say "finished".
      print("   Reject the null hypothesis for the test between group A and " + group_name + ", number of finished blocks in group " + group_name + " is greater")
    else:
      print("   cannot reject the null hypothesis for the test between group A and " + group_name)

print("effect size between group B and A: ", cohend(B, A, ind=True))
print("effect size between group C and A: ", cohend(C, A, ind=True))

## RESEARCH QUESTION 3: satisfaction level (from survey)
### first part: one sample t test
### second part: unpaired t test

In [None]:
# Read the survey results.
survey = pd.read_csv("survey_result.csv")
satisfied_level = np.array(survey["satisfied"])

# First part: measure the satisfaction level of all users against 5.5 and 6.0.
print("First part:")
pnormal = stats.normaltest(satisfied_level)[1]

thresholds = (5.5, 6.0)
if pnormal > alpha[0]:
  print("data is approximately normally distributed")
  # One-sample t test: is the mean satisfaction greater than the threshold?
  p_value_55, p_value_60 = [
    stats.ttest_1samp(satisfied_level, threshold, alternative="greater")[1]
    for threshold in thresholds
  ]
else:
  print("data is not normally distributed")
  # Wilcoxon signed-rank test on the differences to the threshold.
  p_value_55, p_value_60 = [
    stats.wilcoxon(satisfied_level - threshold, alternative="greater")[1]
    for threshold in thresholds
  ]

print("5.5: ", p_value_55)
print("6: ", p_value_60)
for level in alpha:
  print("With alpha = ", level)
  for threshold_label, threshold_pvalue in (("5.5", p_value_55), ("6.0", p_value_60)):
    if threshold_pvalue <= level:
      print("   At " + threshold_label + ": Reject the null hypothesis. In average users are satisfied with the recommendations")
    else:
      print("   At " + threshold_label + ": Cannot reject the null hypothesis")

# Second part: compare A vs B and A vs C.
# (A, B and C are taken to be approximately normally distributed, so the
# unpaired t test is used without a normality check.)
print("Second part:")
satisfied_by_group = survey.groupby("group")["satisfied"]
A = np.array(survey.loc[survey["group"] == 0, "satisfied"])
B = np.array(survey.loc[survey["group"] == 1, "satisfied"])
C = np.array(survey.loc[survey["group"] == 2, "satisfied"])
plt.boxplot(x=[satisfied_level, A, B, C], labels=["all users", "group A", "group B", "group C"])
plt.ylabel("level")
plt.title("Satisfaction level of users in the testing period")
plt.show()

b_a_pvalue = stats.ttest_ind(B, A, alternative="greater")[1]
c_a_pvalue = stats.ttest_ind(C, A, alternative="greater")[1]
print("pvalue B vs A: ", b_a_pvalue)
print("pvalue C vs A: ", c_a_pvalue)

for level in alpha:
  print("With alpha = ", level)
  for group_name, group_pvalue in (("B", b_a_pvalue), ("C", c_a_pvalue)):
    if group_pvalue <= level:
      print("   Reject the null hypothesis for the test between group A and " + group_name + ", users in group " + group_name + " feel more satsfied")
    else:
      print("   cannot reject the null hypothesis for the test between group A and " + group_name)

for group_name, group_values in (("B", B), ("C", C)):
  print("effect size between group " + group_name + " and A: ", cohend(group_values, A, ind=True))