# Imports

In [1]:
import sys
sys.path.append('../')
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)

import spacy

from spacy.lang.en import English
from spacy.attrs import POS
# Load the 'en_core_web_lg' spaCy pipeline; `nlp` is used below to parse every utterance.
nlp = spacy.load('en_core_web_lg')

In [2]:
from nlp_utils import get_feats, lemmatize_doc
from gensim.models.tfidfmodel import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full

TODO: build a better filter (e.g. based on word frequency) instead of the hand-picked word list below.

In [3]:
def stop(t):
    """Decide whether token ``t`` should be dropped before analysis.

    A token is dropped when its ``is_stop`` flag is truthy, or when its lemma
    is one of a few common words that don't carry meaning in this context
    but aren't on the stop-word list.
    """
    stop_flag = t.is_stop
    if stop_flag:
        return stop_flag
    uninformative_lemmas = ('look', 'like', 'body', 'person', 'man', 'guy')
    return t.lemma_ in uninformative_lemmas


# pre-process text by lemmatizing

In [4]:
# Utterance data for the study analysed below. To run on the single-speaker
# data instead, swap in the commented-out path (and do the same for the
# output path where the results are saved).
#d_raw = pd.read_csv('../../data/single_speaker/combined.csv', encoding='latin-1')
d_raw = pd.read_csv('../../data/study2a/combined.csv', encoding='latin-1')

In [5]:
# Sanity check: print every utterance that is not a string. Anything printed
# here needs cleaning before the utterances are passed to `nlp`.
for text in d_raw['utterance']:
    if not isinstance(text, str):
        print(text)

In [6]:
# Parse each utterance with the spaCy pipeline, then keep only the tokens
# that `stop` does not reject.
d_raw['text'] = list(map(nlp, d_raw['utterance']))
d_raw['non_stop_text'] = [
    [tok for tok in doc if not stop(tok)]
    for doc in d_raw['text']
]

In [7]:
# Apply `lemmatize_doc` (from nlp_utils) to each filtered token list.
# Judging by the preview output, it yields the lemmas with punctuation removed.
d_raw['lemmas'] = list(map(lemmatize_doc, d_raw['non_stop_text']))

In [8]:
# Preview the first rows, including the derived text / non_stop_text / lemmas columns.
d_raw.head()

Unnamed: 0,gameId,targetNum,repNum,trialNum,numPlayers,playerId,target,role,utterance,text,non_stop_text,lemmas
0,2yTRgeWp8WH65rQQq,0,0,0,6,6orNN6e8XB9vh277r,/experiment/tangram_L.png,speaker,sideways facing to the right with a point stic...,"(sideways, facing, to, the, right, with, a, po...","[sideways, facing, right, point, sticking, str...","[sideways, face, right, point, stick, straight..."
1,2yTRgeWp8WH65rQQq,0,0,0,6,ij5MEBTXzQKLmLSqb,/experiment/tangram_L.png,listener,is there a zig zag at the bottom? does it look...,"(is, there, a, zig, zag, at, the, bottom, ?, d...","[zig, zag, ?, cliff, ?]","[zig, zag, cliff]"
2,2yTRgeWp8WH65rQQq,0,0,0,6,qYBRmoNdiWY9aFSwW,/experiment/tangram_L.png,listener,are the feet flat on the ground? skinny or fat?,"(are, the, feet, flat, on, the, ground, ?, ski...","[feet, flat, ground, ?, skinny, fat, ?]","[foot, flat, ground, skinny, fat]"
3,2yTRgeWp8WH65rQQq,0,0,0,6,WaYui7sch79z2hjfN,/experiment/tangram_L.png,listener,is the bottom zigzag shape or square? is the t...,"(is, the, bottom, zigzag, shape, or, square, ?...","[zigzag, shape, square, ?, half, square, left,...","[zigzag, shape, square, half, square, left, po..."
4,2yTRgeWp8WH65rQQq,0,0,0,6,xHbcBCmnhjZonGDnw,/experiment/tangram_L.png,listener,do they have a hump on their back,"(do, they, have, a, hump, on, their, back)",[hump],[hump]


In [9]:
# Work on a copy so the parsed frame in d_raw is left as it was.
d = d_raw.copy()
# Unique game ids and targets, in order of first appearance.
# Series.unique() is used directly because Series.ravel() is deprecated in
# recent pandas releases.
gameidList = d['gameId'].unique().tolist()
tangramList = d['target'].unique().tolist()

### Look at where conventions were introduced

In [10]:
rows = []

# Parts of speech treated as content words (should reconsider if we care about this).
content_pos = ("NOUN", "ADJ", "VERB")

# Lemmas used for each (gameId, repNum, target), pooled over every utterance
# about that target in that repetition. Rows of every role are included,
# not only the speaker's -- NOTE(review): confirm this is intended.
# Built once, so the loops below do not re-query the frame for every word.
# A set is used instead of np.array(..., dtype=object).flatten(): when the
# matching rows have lemma lists of different lengths, that array is a 1-D
# array of list objects and the membership test is silently always False.
# Assumes each 'lemmas' entry is an iterable of lemma strings.
lemmas_by_round = {
    (game_id, int(rep_num), target): {lemma for lemmas in group['lemmas'] for lemma in lemmas}
    for (game_id, rep_num, target), group in d.groupby(['gameId', 'repNum', 'target'])
}

# For each game, look at referring expressions produced by director on later round
for (game_id, later_rep), speaker_df in d.query('role == "speaker"').groupby(['gameId', 'repNum']):
    for _, row in speaker_df.sort_values('target').iterrows():
        target = row['target']
        # Sorted, de-duplicated content-word lemmas of this speaker utterance.
        content_words = sorted({
            t.lemma_ for t in row['text']
            if t.pos_ in content_pos and not stop(t)
        })
        for word in content_words:
            # Compare against every earlier repetition of the same game and target.
            for earlier_rep in range(0, later_rep):
                earlier_lemmas = lemmas_by_round.get((game_id, earlier_rep, target), set())
                match = word in earlier_lemmas
                rows.append([game_id, target,
                             earlier_rep, later_rep, word, match])

In [11]:
# One row per (later-round content word, earlier round) pair collected in `rows`.
words_df = pd.DataFrame(
    data=rows,
    columns=['gameId', 'target', 'earlier_rep', 'later_rep', 'word', 'match'],
)

In [12]:
# Save the word-match table next to the input data, without the index column.
# The commented-out line is the output path for the single-speaker data.
words_df.to_csv('../../data/study2a/word_matches.csv', index=False)
#words_df.to_csv('../../data/single_speaker/word_matches.csv', index=False)

We can also look at the inverse: the probability that a word used on the current round still appears on the final round.

In [13]:
# rows = []

# # For each game, look at referring expressions produced by director on final round
# for name, rep_df in d.query('role == "speaker"').groupby(['gameId', 'repNum']) :
#     rep_df = rep_df.sort_values('target').reset_index()
#     final_df = d.query('repNum == 5 and role == "speaker" and gameId == "{}"'.format(name[0])).sort_values('target').reset_index()
    
#     # For each word used with each tangram, check whether it occurred in each earlier round
#     for i, row in rep_df.iterrows() :
#         target = row['target']
#         content_words = [t.lemma_ for t in row.text 
#                          if t.pos_ in ["NOUN", "ADJ", 'VERB'] 
#                          and not stop(t)]
#         print('content', content_words)
#         query_str = 'target == "{}"'.format(target)
#         print(np.array(list(final_df.query(query_str)['lemmas'])).flatten())
#         for j, word in enumerate(content_words) :
#             final_match = word in np.array(list(final_df.query(query_str)['lemmas'])).flatten()
#             rows.append([row['gameId'], row['repNum'], row['target'], row['playerId'], word, final_match])

In [14]:
# words_df = pd.DataFrame(rows,
#     columns = ['gameId', 'repNum', 'target', 'playerId', 'word', 'final_match']
# )
# words_df.to_csv('../../data/study1/inverse_word_matches.csv')