In [1]:
import refexps
from os import listdir, path
from itertools import combinations
from collections import defaultdict
import numpy as np
import nltk
from scipy import stats
import compound_splitter

In [3]:
def compute_values_move(refexps_dir, success_dir, compound = True):
    """Compute move-level lexical alignment scores for every dialogue in a directory.

    Within each move, the references to one object are visited in order. A
    word counts as repeated when it already occurred in an earlier reference
    to the same object inside that move. The alignment score is the number of
    repeated words divided by the number of words that had a chance to be a
    repetition, i.e. all words except those of the opening reference.

    Args:
        refexps_dir: directory whose entries are loaded with
            refexps.load_data. Entries whose name starts with 'FTT' are
            processed as 'german', all others as 'english'.
        success_dir: not used by this function; kept so existing calls
            keep working.
        compound: forwarded unchanged to refexps.lex_material
            (presumably toggles German compound splitting -- confirm
            against refexps).

    Returns:
        defaultdict(list) holding one score per (move, object) pair under
        'Alignment', and one score per speaker under 'Alignment ' + speaker.
        Pairs whose denominator is zero contribute nothing.

    Side effects:
        Prints mean, standard deviation, max and min of every score list.
    """
    splits = compound_splitter.load_dict('de_lower.dict')
    move_alignment = defaultdict(list)
    for dialogue in listdir(refexps_dir):
        # The language is derived from the file name prefix only.
        language = 'german' if dialogue.startswith('FTT') else 'english'
        # Load the referring-expression data and regroup it by move.
        dialogue = refexps.load_data(path.join(refexps_dir, dialogue))
        for move in refexps.move_level(dialogue):
            refs = move[1]
            # Each key of refs is an object; its value is the ordered
            # sequence of references made to that object in this move.
            for obj in refs:
                refobj = refs[obj]
                # Similarity of lexical content: per-object counters.
                total_words = 0
                total_words_speaker = defaultdict(int)
                sameword_n = 0
                sameword_n_speaker = defaultdict(int)
                speakers = set()
                prev_words = set()
                # Reset per object. Without this an object with no
                # references raised NameError (first object) or silently
                # reused the previous object's values and produced a
                # spurious score.
                starter = None
                first_words = 0
                for ref in refobj:
                    speaker = ref[1]
                    words = refexps.lex_material(ref, language, splits, compound)
                    if prev_words:
                        repeated = sum(1 for w in words if w in prev_words)
                        sameword_n += repeated
                        sameword_n_speaker[speaker] += repeated
                    else:
                        # Nothing has been said about this object yet, so
                        # this reference opens the chain and cannot align.
                        starter = speaker
                        first_words = len(words)
                    speakers.add(speaker)
                    total_words_speaker[speaker] += len(words)
                    total_words += len(words)
                    prev_words.update(words)
                if (total_words - first_words) != 0:
                    alignment = float(sameword_n)/float(total_words - first_words)
                    move_alignment['Alignment'].append(alignment)
                for s in speakers:
                    # The opening speaker's first words are excluded from
                    # that speaker's denominator as well.
                    eligible_words = total_words_speaker[s]
                    if s == starter:
                        eligible_words -= first_words
                    if eligible_words != 0:
                        alignment = float(sameword_n_speaker[s])/float(eligible_words)
                        move_alignment['Alignment '+s].append(alignment)
    # Parenthesised single-argument print produces the same output whether
    # print is a statement or a function.
    for value in move_alignment:
        dataset = move_alignment[value]
        print('\t'+ value + ': ')
        print('\tMean: '+ str(np.mean(dataset)) +'\tStandard deviation: '+ str(np.std(dataset)))
        print('\tMax value: ' + str(max(dataset)) + '\tMin value: '+ str(min(dataset)))
    return move_alignment

In [4]:
# Corpus locations: one RefExp/Success directory pair per language subset.
# The Success directories are passed along, but compute_values_move does not
# read them.
de_dataset_dir, de_success_dir = 'En_De_Dataset/De/RefExp', 'En_De_Dataset/De/Success'
en_dataset_dir, en_success_dir = 'En_De_Dataset/En/RefExp', 'En_De_Dataset/En/Success'
all_dataset_dir, all_success_dir = 'En_De_Dataset/All/RefExp', 'En_De_Dataset/All/Success'

# Move-level alignment for German, English and the combined corpus; each call
# prints its own summary, separated here by blank lines.
results_de = compute_values_move(de_dataset_dir, de_success_dir)
print('\n\n')
results_en = compute_values_move(en_dataset_dir, en_success_dir)
print('\n\n')
results_all = compute_values_move(all_dataset_dir, all_success_dir)

	Alignment p-utts: 
	Mean: 0.414674074211	Standard deviation: 0.357164221722
	Max value: 1.0	Min value: 0.0
	Alignment: 
	Mean: 0.41830445152	Standard deviation: 0.333837777053
	Max value: 1.0	Min value: 0.0
	Alignment e-utts: 
	Mean: 0.445911104565	Standard deviation: 0.393730910318
	Max value: 1.0	Min value: 0.0



	Alignment p-utts: 
	Mean: 0.463352753783	Standard deviation: 0.32780561905
	Max value: 1.0	Min value: 0.0
	Alignment: 
	Mean: 0.467999489262	Standard deviation: 0.325626997383
	Max value: 1.0	Min value: 0.0
	Alignment e-utts: 
	Mean: 0.513328748216	Standard deviation: 0.374278394634
	Max value: 1.0	Min value: 0.0



	Alignment p-utts: 
	Mean: 0.430436694263	Standard deviation: 0.348673763927
	Max value: 1.0	Min value: 0.0
	Alignment: 
	Mean: 0.43469762787	Standard deviation: 0.331975000742
	Max value: 1.0	Min value: 0.0
	Alignment e-utts: 
	Mean: 0.46740231848	Standard deviation: 0.388906933838
	Max value: 1.0	Min value: 0.0


In [5]:
# German move-level alignment again, this time with compound=False, to see
# whether that setting changes the scores.
results_de_nocs = compute_values_move(de_dataset_dir, de_success_dir, compound=False)
print('\n\n')

# Two-sided Mann-Whitney U test per measure; only measures with p < 0.05 are
# reported, together with both means.
for var in results_de:
    scores_cs = results_de[var]
    scores_nocs = results_de_nocs[var]
    mw_result = stats.mannwhitneyu(scores_cs, scores_nocs, alternative='two-sided')
    p_value = mw_result[1]
    if p_value < 0.05:
        print(var)
        print(str(mw_result) + '\tSignificantly different')
        print(str(np.mean(scores_cs)) + ' ' + str(np.mean(scores_nocs)))

	Alignment p-utts: 
	Mean: 0.401525555651	Standard deviation: 0.353283224575
	Max value: 1.0	Min value: 0.0
	Alignment: 
	Mean: 0.409648027501	Standard deviation: 0.33252809117
	Max value: 1.0	Min value: 0.0
	Alignment e-utts: 
	Mean: 0.446232791425	Standard deviation: 0.397685455074
	Max value: 1.0	Min value: 0.0





In [7]:
# Significance tests on the move-level alignment scores. All comparisons are
# two-sided Mann-Whitney U tests and only results with p < 0.05 are printed.
# NOTE: results_de, results_en, results_all and results_de_nocs are rebound by
# the game-level cells further down, so this cell has to run before them.

# German vs. English, once with the default German scores and once with the
# German scores computed with compound=False.
for heading, german_scores in (
        ('Differences between languages', results_de),
        ('Differences between languages without compound splitter', results_de_nocs)):
    print(heading)
    for var in results_all:
        mw_result = stats.mannwhitneyu(german_scores[var], results_en[var], alternative='two-sided')
        if mw_result[1] < 0.05:
            print(var)
            print(str(mw_result)+'\tSignificantly different')
    print('\n')

# 'Alignment e-utts' vs. 'Alignment p-utts' within each corpus subset.
print('Differences between players')
for label, scores in (
        ('Players move alignment de', results_de),
        ('Players move alignment en', results_en),
        ('Players move alignment all', results_all)):
    mw_result = stats.mannwhitneyu(scores['Alignment e-utts'], scores['Alignment p-utts'], alternative='two-sided')
    if mw_result[1] < 0.05:
        print(label)
        print(str(mw_result)+'\tSignificantly different')

Differences between languages
Alignment
MannwhitneyuResult(statistic=22717.5, pvalue=0.037662977390869275)	Significantly different


Differences between languages without compound splitter
Alignment p-utts
MannwhitneyuResult(statistic=16974.5, pvalue=0.041222634030500603)	Significantly different
Alignment
MannwhitneyuResult(statistic=22321.0, pvalue=0.018355923827195027)	Significantly different


Differences between players


In [25]:
def compute_values_game(refexps_dir, success_dir, compound = True):
    """Compute game-level lexical alignment scores for every dialogue in a directory.

    For each object, all references returned by refexps.gamerun_level are
    visited in order. A word counts as repeated when it already occurred in
    an earlier reference to the same object. The alignment score is the
    number of repeated words divided by the number of words that had a
    chance to be a repetition, i.e. all words except those of the opening
    reference.

    Args:
        refexps_dir: directory whose entries are loaded with
            refexps.load_data. Entries whose name starts with 'FTT' are
            processed as 'german', all others as 'english'.
        success_dir: not used by this function; kept so existing calls
            keep working.
        compound: forwarded unchanged to refexps.lex_material
            (presumably toggles German compound splitting -- confirm
            against refexps).

    Returns:
        defaultdict(list) holding one score per (dialogue, object) pair
        under 'Alignment', and one score per speaker under
        'Alignment ' + speaker. Pairs whose denominator is zero contribute
        nothing.

    Side effects:
        Prints mean, standard deviation, max and min of every score list.
    """
    splits = compound_splitter.load_dict('de_lower.dict')
    game_alignment = defaultdict(list)
    for dialogue in listdir(refexps_dir):
        # The language is derived from the file name prefix only.
        language = 'german' if dialogue.startswith('FTT') else 'english'
        dialogue = refexps.load_data(path.join(refexps_dir, dialogue))
        dialogue_game = refexps.gamerun_level(dialogue)
        # Each key is an object; its value is the ordered sequence of
        # references made to that object.
        for obj in dialogue_game:
            refobj = dialogue_game[obj]
            total_words = 0
            total_words_speaker = defaultdict(int)
            sameword_n = 0
            sameword_n_speaker = defaultdict(int)
            speakers = set()
            prev_words = set()
            # Reset per object. Without this an object with no references
            # raised NameError (first object) or silently reused the
            # previous object's values and produced a spurious score.
            starter = None
            first_words = 0
            for ref in refobj:
                speaker = ref[1]
                words = refexps.lex_material(ref, language, splits, compound)
                if prev_words:
                    repeated = sum(1 for w in words if w in prev_words)
                    sameword_n += repeated
                    sameword_n_speaker[speaker] += repeated
                else:
                    # Nothing has been said about this object yet, so this
                    # reference opens the chain and cannot align.
                    starter = speaker
                    first_words = len(words)
                speakers.add(speaker)
                total_words_speaker[speaker] += len(words)
                total_words += len(words)
                prev_words.update(words)
            if (total_words - first_words) != 0:
                alignment = float(sameword_n)/float(total_words - first_words)
                game_alignment['Alignment'].append(alignment)
            for s in speakers:
                # The opening speaker's first words are excluded from that
                # speaker's denominator as well.
                eligible_words = total_words_speaker[s]
                if s == starter:
                    eligible_words -= first_words
                if eligible_words != 0:
                    alignment = float(sameword_n_speaker[s])/float(eligible_words)
                    game_alignment['Alignment '+s].append(alignment)
    # Parenthesised single-argument print produces the same output whether
    # print is a statement or a function.
    for value in game_alignment:
        dataset = game_alignment[value]
        print('\t'+ value + ': ')
        print('\tMean: '+ str(np.mean(dataset)) +'\tStandard deviation: '+ str(np.std(dataset)))
        print('\tMax value: ' + str(max(dataset)) + '\tMin value: '+ str(min(dataset)))
    return game_alignment

In [26]:

# Game-level alignment for German, English and the combined corpus.
# NOTE: this rebinds results_de / results_en / results_all, which held the
# move-level results up to this point.
results_de = compute_values_game(refexps_dir=de_dataset_dir, success_dir=de_success_dir)
print('\n\n')
results_en = compute_values_game(refexps_dir=en_dataset_dir, success_dir=en_success_dir)
print('\n\n')
results_all = compute_values_game(refexps_dir=all_dataset_dir, success_dir=all_success_dir)

	Alignment p-utts: 
	Mean: 0.387927594512	Standard deviation: 0.289652044255
	Max value: 1.0	Min value: 0.0
	Alignment: 
	Mean: 0.390326706449	Standard deviation: 0.27498319437
	Max value: 1.0	Min value: 0.0
	Alignment e-utts: 
	Mean: 0.455576703235	Standard deviation: 0.33840588434
	Max value: 1.0	Min value: 0.0



	Alignment p-utts: 
	Mean: 0.454685957366	Standard deviation: 0.280814821686
	Max value: 1.0	Min value: 0.0
	Alignment: 
	Mean: 0.447240777605	Standard deviation: 0.273633548843
	Max value: 1.0	Min value: 0.0
	Alignment e-utts: 
	Mean: 0.489036808602	Standard deviation: 0.317244153296
	Max value: 1.0	Min value: 0.0



	Alignment p-utts: 
	Mean: 0.413452850897	Standard deviation: 0.288137502177
	Max value: 1.0	Min value: 0.0
	Alignment: 
	Mean: 0.413367448246	Standard deviation: 0.275855885221
	Max value: 1.0	Min value: 0.0
	Alignment e-utts: 
	Mean: 0.468076167883	Standard deviation: 0.331055079382
	Max value: 1.0	Min value: 0.0


In [27]:
# German game-level alignment again, this time with compound=False, to see
# whether that setting changes the scores. Rebinds results_de_nocs, which held
# the move-level variant before.
results_de_nocs = compute_values_game(de_dataset_dir, de_success_dir, compound=False)
print('\n\n')

# Two-sided Mann-Whitney U test per measure; only measures with p < 0.05 are
# reported, together with both means.
for var in results_de:
    scores_cs = results_de[var]
    scores_nocs = results_de_nocs[var]
    mw_result = stats.mannwhitneyu(scores_cs, scores_nocs, alternative='two-sided')
    p_value = mw_result[1]
    if p_value < 0.05:
        print(var)
        print(str(mw_result) + '\tSignificantly different')
        print(str(np.mean(scores_cs)) + ' ' + str(np.mean(scores_nocs)))

	Alignment p-utts: 
	Mean: 0.376335396505	Standard deviation: 0.296288221042
	Max value: 1.0	Min value: 0.0
	Alignment: 
	Mean: 0.382724993793	Standard deviation: 0.278859354538
	Max value: 1.0	Min value: 0.0
	Alignment e-utts: 
	Mean: 0.453323741071	Standard deviation: 0.342783603488
	Max value: 1.0	Min value: 0.0





In [28]:
# Significance tests on the game-level alignment scores. All comparisons are
# two-sided Mann-Whitney U tests and only results with p < 0.05 are printed.
# Relies on results_de, results_en, results_all and results_de_nocs holding the
# game-level results assigned in the two preceding cells.

# German vs. English, once with the default German scores and once with the
# German scores computed with compound=False.
for heading, german_scores in (
        ('Differences between languages', results_de),
        ('Differences between languages without compound splitter', results_de_nocs)):
    print(heading)
    for var in results_all:
        mw_result = stats.mannwhitneyu(german_scores[var], results_en[var], alternative='two-sided')
        if mw_result[1] < 0.05:
            print(var)
            print(str(mw_result)+'\tSignificantly different')
    print('\n')


# 'Alignment e-utts' vs. 'Alignment p-utts' within each corpus subset.
print('Differences between players')
for label, scores in (
        ('Players game alignment de', results_de),
        ('Players game alignment en', results_en),
        ('Players game alignment all', results_all)):
    mw_result = stats.mannwhitneyu(scores['Alignment e-utts'], scores['Alignment p-utts'], alternative='two-sided')
    if mw_result[1] < 0.05:
        print(label)
        print(str(mw_result)+'\tSignificantly different')
                                                                        

Differences between languages
Alignment p-utts
MannwhitneyuResult(statistic=9360.0, pvalue=0.023353231652392621)	Significantly different
Alignment
MannwhitneyuResult(statistic=11360.5, pvalue=0.030849047428130813)	Significantly different


Differences between languages without compound splitter
Alignment p-utts
MannwhitneyuResult(statistic=9190.0, pvalue=0.012463278702654667)	Significantly different
Alignment
MannwhitneyuResult(statistic=11199.5, pvalue=0.018760945101500399)	Significantly different


Differences between players
