In [9]:
import math
import pandas as pd
from functools import reduce
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from scipy.stats import pearsonr
from numpy import cov
# generate related variables
from numpy import mean
from numpy import std

#### Rational Listener Interpretation
$P_L(w|m) \propto P(w) \times P_s(m|w)$ <br/>
(Both of them are normalized)


     ziji: 张伟说小明总把自己弄糊涂 = {w1,w2}
     taziji: 张伟说小明总把他自己弄糊涂 = {w2}
     ta: 张伟说小明总把他弄糊涂 = {w1,w3}
     Zhangwei says Xiaoming always makes self/himself/him confused.
  <br/>
 
     W: worlds
         w1: Speaker
         w2: Clause Subject
         w3: Others
     
     M: messages {ziji, ta, taziji}

In [10]:
# Load the per-sentence judgement table; the first CSV column becomes the index.
# Later cells read the columns senID, type, experiment, condition, selection and Prob.
data = pd.read_csv("final_data_bySenID.csv", index_col=[0])
# NOTE: a bare expression only renders when it is the last statement of a cell,
# so this line displays nothing here.
data

# Label sets used throughout the notebook. `c3` is read as a global inside
# process_data; the plotting functions take parameters with the same names
# (c1, c2, c3), which shadow these lists inside those functions.
c1 = ['co-argument','possessor'] #one for each condition
c2 = ['Speaker', 'Clause Subject']
c3 = ['ta', 'taziji', 'ziji']

In [11]:
# get normalized predicted posterior

def process_data(data):
    """Merge prior, likelihood and posterior rows and add the normalized model posterior.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format table with at least the columns ``senID``, ``type``,
        ``experiment`` ('prior' / 'likelihood' / 'posterior'), ``condition``,
        ``selection`` and ``Prob``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (senID, c1, c2, c3) combination present in all three
        experiments, sorted by ``senID`` and ``c3``, with the columns
        ``senID, c1, c2, c3, posterior, prior, likelihood, estimate_posterior``.
        ``estimate_posterior`` is ``prior * likelihood`` rescaled so that it sums
        to 1 within each (senID, c3) group; a group whose total is 0 gets 0.
    """
    # Synthetic likelihood rows: the world 'Others' is given probability 1 for
    # the message 'ta' on each of the sentences 1..30.
    others = pd.DataFrame({
        'senID': list(range(1, 31)),
        'experiment': 'likelihood',
        'condition': 'Others',
        'selection': 'ta',
        'Prob': 1,
    })
    # NOTE(review): `< 15` labels sentences 1-14 as co-argument and 15-30 as
    # possessor (14 vs 16 items) -- confirm this split against the stimulus list.
    others['type'] = np.where(others['senID'] < 15, 'co-argument', 'possessor')
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the equivalent, supported call.
    data = pd.concat([data, others], ignore_index=True)

    # Posterior rows: `selection` is the chosen world (c2), `condition` the message (c3).
    df1 = data[data.experiment == 'posterior'][['senID', 'type', 'selection', 'condition', 'Prob']].rename(
        columns={'Prob': 'posterior', 'selection': 'c2', 'condition': 'c3', 'type': 'c1'})
    # Prior rows: `selection` is the world (c2); there is no message dimension.
    df2 = data[data.experiment == 'prior'][['senID', 'type', 'selection', 'Prob']].rename(
        columns={'Prob': 'prior', 'selection': 'c2', 'type': 'c1'})
    # Likelihood rows: roles are swapped -- `selection` is the message (c3),
    # `condition` the world (c2).
    df3 = data[data.experiment == 'likelihood'][['senID', 'type', 'selection', 'condition', 'Prob']].rename(
        columns={'Prob': 'likelihood', 'selection': 'c3', 'condition': 'c2', 'type': 'c1'})

    # Inner joins on the shared key columns (senID, c1, c2[, c3]).
    df = pd.merge(pd.merge(df1, df2), df3)

    # Unnormalized Bayesian prediction P(w) * P(m | w).
    df['estimate_posterior'] = df['prior'] * df['likelihood']
    df = df.sort_values(by=['senID', 'c3'])

    # Normalize within every (senID, message) group actually present in the
    # data, rather than relying on a global message list and a fixed ID range.
    # skipna=False keeps a NaN in a group contagious, as a plain sum() would.
    group_total = df.groupby(['senID', 'c3'])['estimate_posterior'].transform(
        lambda s: s.sum(skipna=False))
    df['estimate_posterior'] = (df['estimate_posterior'] / group_total).where(group_total != 0, 0.0)

    return df


In [12]:
# Build the merged prior / likelihood / posterior table with the normalized
# model prediction in the `estimate_posterior` column.
df = process_data(data)
# Preview the first six rows of the result.
df.head(6)

  data = data.append(others, ignore_index = True)


Unnamed: 0,senID,c1,c2,c3,posterior,prior,likelihood,estimate_posterior
0,1,co-argument,Speaker,ta,0.882353,0.75,0.5,0.696133
3,1,co-argument,Clause Subject,ta,0.098039,0.107143,0.194444,0.038674
6,1,co-argument,Others,ta,0.019608,0.142857,1.0,0.265193
1,1,co-argument,Speaker,taziji,0.27907,0.75,0.107143,0.574468
4,1,co-argument,Clause Subject,taziji,0.697674,0.107143,0.555556,0.425532
2,1,co-argument,Speaker,ziji,0.682927,0.75,0.392857,0.916667


In [14]:
def plot_Posterior(df,c1,c2,c3):
    '''
    Scatter the observed posterior against the normalized model prediction for
    one (c1, c2, c3) cell, draw a least-squares line, save the figure and print
    summary statistics.

    df(dataframe): normalized table with the columns senID, c1, c2, c3,
                   posterior and estimate_posterior
    c1(str): co-argument or possessor
    c2(str): speaker or clause subject or others
    c3(str): ta or ziji or taziji

    Side effects: writes '<c3>_<c1>_<c2>.png' to the working directory, shows
    the figure and prints the statistics. Returns None.

    Raises ValueError if no row of df matches (c1, c2, c3).
    '''
    selected = df[(df.c1 == c1) & (df.c2 == c2) & (df.c3 == c3)]
    if selected.empty:
        # Fail early with a clear message instead of crashing inside np.polyfit
        # after an empty figure has already been created.
        raise ValueError(
            "no rows for c1={!r}, c2={!r}, c3={!r}".format(c1, c2, c3))

    # Index by sentence ID so each point can be labelled with its sentence.
    selected = selected.set_index(selected['senID'])

    x = selected['posterior']
    y = selected['estimate_posterior']

    fig, ax = plt.subplots(figsize=(8,8))
    ax.scatter(x=x, y=y, c='DarkBlue')
    ax.set_title("Listener Interpretation: P ( {} | {} ) in {} condition".format(c2,c3,c1))
    ax.set_xlabel('Experimental Data') #x label
    ax.set_ylabel('Bayesian Model Prediction') #y label

    # Label every point with its sentence ID.
    for sen_id, row in selected.iterrows():
        ax.annotate(str(sen_id), (row['posterior'], row['estimate_posterior']))

    # Plot regression line (np.polyfit returns the slope first, then the intercept).
    b, a = np.polyfit(x, y, deg=1)
    ax.plot(x, a + b * x)

    fig.savefig('{}_{}_{}.png'.format(c3, c1,c2))
    plt.show()

    # summarize
    print('x: mean=%.3f stdv=%.3f' % (mean(x), std(x)))
    print('y: mean=%.3f stdv=%.3f' % (mean(y), std(y)))

    res = stats.linregress(x, y)
    pval = res.pvalue
    print("Statistics:", res)
    print(f"R-squared: {res.rvalue**2:.6f}")

    corr, _ = pearsonr(x, y)
    print('Pearsons correlation: %.3f' % corr)

    covariance = cov(x, y)
    print("covariance is: ", covariance)

    # linregress tests the null hypothesis that the slope is zero. A large
    # p-value means the null cannot be rejected; it does not show it is true.
    if pval<0.05:
        print("pval = {}, reject null hypothesis".format(pval))
    else:
        print("pval = {}, fail to reject null hypothesis".format(pval))


In [6]:
def estimatePosterior(c1, c2, c3):
    '''
    Plot the observed posterior against the unnormalized prediction
    prior * likelihood for one (c1, c2, c3) cell and print the regression
    statistics. Reads the module-level dataframe `data`. Returns None.

    c1(str): co-argument or possessor
    c2(str): speaker or clause subject or others
    c3(str): ta or ziji or taziji

    NOTE(review): assumes prior rows carry condition == 'prior'. Likelihood rows
    for 'Others' are only synthesized inside process_data, so c2 = 'Others'
    presumably has no likelihood column here -- confirm before using it.
    '''
    prior_rows = (data.condition == 'prior') & (data.selection == c2)
    likelihood_rows = (data.condition == c2) & (data.selection == c3)
    posterior_rows = (data.condition == c3) & (data.selection == c2)
    # `&` binds tighter than `|`, so the type filter needs explicit parentheses
    # to apply to all three kinds of row and not only to the prior rows.
    df = data[(data.type == c1) & (prior_rows | likelihood_rows | posterior_rows)]
    df = df.pivot(index='senID', columns = 'experiment', values = "Prob")[['prior', 'likelihood', 'posterior']]

    # Unnormalized Bayesian prediction P(w) * P(m | w).
    df['estimate_posterior'] = df['prior'] * df['likelihood']

    # Drop sentences with no prediction; a missing observed posterior becomes 0.
    df = df.dropna(subset = ['estimate_posterior'])
    df = df.fillna(0)

    x = df['posterior']
    y = df['estimate_posterior']
    fig, ax = plt.subplots(figsize=(8,5))
    ax.scatter(x=x,y=y, c='DarkBlue')
    ax.set_title("Listener Interpretation: P ( {} | {} ) in {} condition".format(c2,c3,c1))
    ax.set_xlabel('Experimental Data') #x label
    ax.set_ylabel('Bayesian Model Prediction') #y label

    # Label every point with its sentence ID (the pivot index).
    for sen_id, row in df.iterrows():
        ax.annotate(str(sen_id), (row['posterior'], row['estimate_posterior']))

    # Plot regression line (np.polyfit returns the slope first, then the intercept).
    b, a = np.polyfit(x, y, deg=1)
    ax.plot(x, a + b * x)
    plt.show()

    res = stats.linregress(x, y)
    pval = res.pvalue
    print("Statistics:", res)
    print(f"R-squared: {res.rvalue**2:.6f}")

    # A large p-value means the zero-slope null cannot be rejected; it does not
    # show the null is true.
    if pval<0.05:
        print("pval = {}, reject null hypothesis".format(pval))
    else:
        print("pval = {}, fail to reject null hypothesis".format(pval))

    

In [7]:
def estimatePosterior_log(c1, c2, c3):
    '''
    Plot the observed posterior against the log-space prediction
    log(prior) + log(likelihood) for one (c1, c2, c3) cell and print the
    regression statistics. Reads the module-level dataframe `data`. Returns None.

    c1(str): co-argument or possessor
    c2(str): speaker or clause subject or others
    c3(str): ta or ziji or taziji

    Sentences whose prior or likelihood is 0 have a log score of -inf and are
    excluded, because the regression cannot be fitted on non-finite values.

    NOTE(review): assumes prior rows carry condition == 'prior' -- confirm.
    '''
    prior_rows = (data.condition == 'prior') & (data.selection == c2)
    likelihood_rows = (data.condition == c2) & (data.selection == c3)
    posterior_rows = (data.condition == c3) & (data.selection == c2)
    # `&` binds tighter than `|`, so the type filter needs explicit parentheses
    # to apply to all three kinds of row and not only to the prior rows.
    df = data[(data.type == c1) & (prior_rows | likelihood_rows | posterior_rows)]
    df = df.pivot(index='senID', columns = 'experiment', values = "Prob")[['prior', 'likelihood', 'posterior']]

    log_score = np.log(df['prior']) + np.log(df['likelihood'])
    # log(0) is -inf, which dropna would keep; map it to NaN so it is dropped below.
    df['estimate_posterior'] = log_score.replace([np.inf, -np.inf], np.nan)

    # Drop sentences with no usable prediction; a missing observed posterior becomes 0.
    df = df.dropna(subset = ['estimate_posterior'])
    df = df.fillna(0)

    x = df['posterior']
    y = df['estimate_posterior']
    fig, ax = plt.subplots(figsize=(8,5))
    ax.scatter(x=x,y=y, c='DarkBlue')
    ax.set_title("Rational Listener: P ( {} | {} ) in {} condition".format(c2,c3,c1))
    ax.set_xlabel('observed posterior') #x label
    ax.set_ylabel('predicted posterior') #y label

    # Label every point with its sentence ID, using the same (x, y) order as
    # the scatter so each label sits on its own point.
    for sen_id, row in df.iterrows():
        ax.annotate(str(sen_id), (row['posterior'], row['estimate_posterior']))

    # Plot regression line (np.polyfit returns the slope first, then the intercept).
    b, a = np.polyfit(x, y, deg=1)
    ax.plot(x, a + b * x)
    plt.show()

    res = stats.linregress(x, y)
    pval = res.pvalue
    print("Statistics for {},{},{}:".format(c1,c2,c3), res)
    print(f"R-squared: {res.rvalue**2:.6f}")

    # A large p-value means the zero-slope null cannot be rejected; it does not
    # show the null is true.
    if pval<0.05:
        print("pval = {}, reject null hypothesis".format(pval))
    else:
        print("pval = {}, fail to reject null hypothesis".format(pval))
    
    


In [8]:
# c1 = 'possessor'
# c2 = 'Speaker'
# c3 = 'ziji'
# plot_Posterior(df,c1,c2,c3)