In [1]:
import pickle
from collections import Counter
import numpy as np
from scipy import stats
from DataObjs import Corpus, success_transform, confidence_transform

In [2]:
with open('Data/data_regression.p') as handle:
    raw_data = pickle.load(handle)
    
# length, lex, form -> all, p, e

# data point = (sccss_str, conf_str, ([], [], []), ([], [], []), ([], [], []))     
    
# nonoise
data_en = Corpus(raw_data[1], succs_map=success_transform, conf_map=confidence_transform)
# FTT
data_de = Corpus(raw_data[0], succs_map=success_transform, conf_map=confidence_transform) 

data_all = Corpus(dict(raw_data[0], **raw_data[1]), succs_map=success_transform, conf_map=confidence_transform)

print len(data_en)
print len(data_de)
print len(data_all)
print float(len(data_de))/len(data_all)

49
115
164
0.701219512195


In [3]:
from math import factorial
from collections import Counter

theoretical_data = data_de
observed_data = data_en
cur_var = Corpus.get_confs

obs_n = len(observed_data)
theo_probs = [v/float(len(theoretical_data)) for k, v in sorted(Counter(theoretical_data.get_confs()).iteritems())]
obs_probs = [v/float(len(observed_data)) for k, v in sorted(Counter(observed_data.get_confs()).iteritems())]

cats = sorted(set(cur_var(theoretical_data)).union(set(cur_var(observed_data))))
theo_counts = Counter(cur_var(theoretical_data))
theo_exp_counts = [theo_counts[c]/float(len(cur_var(theoretical_data))) if c in theo_counts else 0 for c in cats]
theo_exp_counts = [obs_n*exp_c for exp_c in theo_exp_counts]
obs_counts = Counter(cur_var(observed_data))
obs_counts = [obs_counts[c] if c in obs_counts else 0 for c in cats]

print theo_exp_counts
print obs_counts

# https://en.wikipedia.org/wiki/Multinomial_test
def p(probs, counts):
    """Probability of observing `counts` under a multinomial distribution.

    probs  -- per-category probabilities.
    counts -- per-category non-negative integer counts; their sum is the
              sample size.

    Probabilities and counts are paired positionally with zip(), so both
    sequences must list the same categories in the same order (a length
    mismatch is silently truncated by zip).

    Returns a float. The value is accumulated in log space so that large
    samples do not overflow the factorial/float product, and so that integer
    probabilities are never integer-divided.
    """
    from math import exp, lgamma, log  # local import keeps the cell self-contained

    # log(n!) == lgamma(n + 1)
    log_mass = lgamma(sum(counts) + 1)
    for prob, count in zip(probs, counts):
        if count == 0:
            # prob ** 0 / 0! == 1, including for prob == 0
            continue
        if prob <= 0:
            # a category with zero probability was observed: impossible outcome
            return 0.0
        log_mass += count * log(prob) - lgamma(count + 1)
    return exp(log_mass)

z = [b for a, b in sorted(Counter(observed_data.get_confs()).items())]
possibles = [(i, j, k) for i in range(obs_n+1) for j in range(obs_n+1) 
                         for k in range(obs_n+1) if i + j + k == obs_n]
assert tuple(z) in possibles

P0 = p(theo_probs, z)
print 'probability of sample under H0', P0
print 'probability of sample under emp', p(obs_probs, z)
Psig = 0
for possible in possibles:
    P = p(theo_probs, possible)
    #print("{}: {}".format(possible, P))
    if P <= P0:
        Psig += P
print 'Significance', Psig

print theo_counts
print obs_counts

print stats.power_divergence(f_obs=obs_counts, f_exp=theo_exp_counts, lambda_='log-likelihood')

[4.6869565217391305, 12.782608695652174, 9.8, 21.730434782608697]
[0, 3, 2, 44]
probability of sample under H0 1.99790782162e-28
probability of sample under emp 0.0639820092446
Significance 3.06043414173e-27
Counter({4: 51, 2: 30, 3: 23, 1: 11})
[0, 3, 2, 44]
Power_divergenceResult(statistic=47.028092770382315, pvalue=3.4281114742444467e-10)


In [4]:
# measure frequencies
succs_counts = Counter(data_de.get_succs())

conf_counts = Counter(data_de.get_confs())

succ_confs = list(zip(data_de.get_succs(), data_de.get_confs()))

joint_counts = Counter(succ_confs)

n = float(sum(joint_counts.values()))
print 'n', n

n 115.0


In [5]:
# success statistics

print succs_counts
print 'Probs:', {k: v/n for k, v in succs_counts.iteritems()}
print 'entropy (%):', -sum([float(c)/n * np.log2(float(c)/n) for c in succs_counts.values()])/ np.log2(len(succs_counts.keys()))

Counter({0: 52, 1: 47, -1: 16})
Probs: {0: 0.45217391304347826, 1: 0.40869565217391307, -1: 0.1391304347826087}
entropy (%): 0.909322374784


In [6]:
# confidence statistics

print conf_counts
print 'Probs:', {k: v/n for k, v in conf_counts.iteritems()}
print 'entropy (%):', -sum([float(c)/n * np.log2(float(c)/n) for c in conf_counts.values()])/np.log2(len(conf_counts.keys()))

Counter({4: 51, 2: 30, 3: 23, 1: 11})
Probs: {1: 0.09565217391304348, 2: 0.2608695652173913, 3: 0.2, 4: 0.4434782608695652}
entropy (%): 0.907109881546


In [7]:
# contingencies
joint_table = []
print '\n\n1\t2\t3\t4'
print '---------------------------------\n'
for succ in [-1, 0, 1]:
    cur_row = []
    for conf in [1, 2, 3, 4]:
        cur_p = joint_counts[(succ, conf)]
        cur_row.append(cur_p)
        print cur_p, '\t',
    joint_table.append(cur_row)
    print '|', succ, '\n'

    
print '\n\n1\t2\t3\t4'
print '---------------------------------\n'
for succ in [-1, 0, 1]:
    for conf in [1, 2, 3, 4]:
        cur_p = round(joint_counts[(succ, conf)]/float(n), 3)
        print cur_p, '\t',
    print '|', succ, '\n'
    
H_succ = -sum([float(c)/n * np.log2(float(c)/n) for c in succs_counts.values()])
H_conf = -sum([float(c)/n * np.log2(float(c)/n) for c in conf_counts.values()])
H_joint = -sum([float(c)/n * np.log2(float(c)/n) for c in joint_counts.values()])

print '\n\ndependence (scaled I): ', (H_succ + H_conf - H_joint)/H_joint
print '\nsqrt(chi2/n):', (stats.chi2_contingency(joint_table)[0]/float(n))**0.5
print '\tp-value:', stats.chi2_contingency(joint_table)[1]
print '\tdof:', stats.chi2_contingency(joint_table)[2]



1	2	3	4
---------------------------------

5 	2 	1 	8 	| -1 

1 	28 	22 	1 	| 0 

5 	0 	0 	42 	| 1 



1	2	3	4
---------------------------------

0.043 	0.017 	0.009 	0.07 	| -1 

0.009 	0.243 	0.191 	0.009 	| 0 

0.043 	0.0 	0.0 	0.365 	| 1 



dependence (scaled I):  0.342716909302

sqrt(chi2/n): 0.961860204756
	p-value: 1.15789241521e-20
	dof: 6


In [8]:
# conditional probabilities

succ_marg_probs = {k: v/n for k, v in succs_counts.iteritems()}
conf_marg_probs = {k: v/n for k, v in conf_counts.iteritems()}
joint_probs = {k: v/n for k, v in joint_counts.iteritems()}

joint_table = []

print 'P(confidence_lvl | success_lvl)'
print '\n1\t2\t3\t4'
print '---------------------------------\n'
for succ in [-1, 0, 1]:
    cur_row = []
    for conf in [1, 2, 3, 4]:
        joint_p = joint_probs[(succ, conf)] if (succ, conf) in joint_probs else 0.0
        cur_p = round(joint_p/succ_marg_probs[succ], 4)
        cur_row.append(cur_p)
        print cur_p, '\t',
    joint_table.append(cur_row)
    print '|', succ, '\n'
    
print 'entropies', [stats.entropy(row) for row in joint_table], '(max = ', np.log(4), ')'

joint_table = []
    
print '\n\nP(success_lvl | confidence_lvl)'
print '\n-1\t0\t1'
print '-------------------------\n'
for conf in [1, 2, 3, 4]:
    cur_row = []
    for succ in [-1, 0, 1]:
        joint_p = joint_probs[(succ, conf)] if (succ, conf) in joint_probs else 0.0
        cur_p = round(joint_p/conf_marg_probs[conf], 4)
        cur_row.append(cur_p)
        print cur_p, '\t',
    joint_table.append(cur_row)
    print '|', conf, '\n'

print 'entropies', [stats.entropy(row) for row in joint_table], '(max = ', np.log(3), ')'

P(confidence_lvl | success_lvl)

1	2	3	4
---------------------------------

0.3125 	0.125 	0.0625 	0.5 	| -1 

0.0192 	0.5385 	0.4231 	0.0192 	| 0 

0.1064 	0.0 	0.0 	0.8936 	| 1 

entropies [1.1432752061942137, 0.84903147657617706, 0.33892183540531451] (max =  1.38629436112 )


P(success_lvl | confidence_lvl)

-1	0	1
-------------------------

0.4545 	0.0909 	0.4545 	| 1 

0.0667 	0.9333 	0.0 	| 2 

0.0435 	0.9565 	0.0 	| 3 

0.1569 	0.0196 	0.8235 	| 4 

entropies [0.93476989785827924, 0.2450179864450992, 0.17891210361100607, 0.52759031766068332] (max =  1.09861228867 )
