# Imports

In [1]:
import sys
sys.path.append('../')
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)

import spacy
import contextualSpellCheck

from spacy.lang.en import English
from spacy.attrs import POS
nlp = spacy.load('en_core_web_lg')

In [2]:
from nlp_utils import get_feats, lemmatize_doc
from gensim.models.tfidfmodel import TfidfModel
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full

**TODO:** replace the hand-picked stop-word list below with a better filter, e.g. one based on word frequency.

In [3]:
def stop(t):
    """Tell whether token ``t`` should be left out of the content words.

    A token is excluded when it is flagged as a stop word (``t.is_stop``)
    or when its lemma (``t.lemma_``) is one of a few hand-listed lemmas.
    The lemma comparison is an exact, case-sensitive match.
    """
    listed_lemmas = ['person', 'tangrams', 'tangram', 'look', 'like', 'ok']
    flagged = t.is_stop
    if flagged:
        # Stop-word flag wins; the lemma is not consulted in this case.
        return flagged
    return t.lemma_ in listed_lemmas


# Pre-process text by lemmatizing

In [4]:
# Read the combined study-1 CSV; the file is decoded as latin-1.
d_raw = pd.read_csv(
    '../../data/study1/combined.csv',
    encoding='latin-1',
)

In [5]:
# Parse every utterance with the spaCy pipeline. nlp.pipe processes the texts
# as a batched stream, which is spaCy's recommended way to handle many texts
# and produces the same Doc objects as calling nlp(text) row by row.
# (Once a spellchecker is added to the pipeline, this step will apply it too.)
d_raw['text'] = list(nlp.pipe(d_raw['utterance']))
# Keep, for each parsed utterance, only the tokens that stop() does not reject.
d_raw['non_stop_text'] = [[token for token in doc if not stop(token)] for doc in d_raw['text']]

In [6]:
# Run the project helper lemmatize_doc over each row's filtered token list.
d_raw['lemmas'] = list(map(lemmatize_doc, d_raw['non_stop_text']))

In [7]:
# Preview the first five rows, including the derived text / non_stop_text / lemmas columns.
d_raw.head(n=5)

Unnamed: 0,gameId,targetNum,repNum,trialNum,numPlayers,playerId,target,role,countCorrect,utterance,text,non_stop_text,lemmas
0,2928FF7GcrkeFiDkh,0,0,0,2,okkXLvTuqNbsZkBZ2,/experiment/tangram_F.png,speaker,1,Sitting on the ground facing left,"(Sitting, on, the, ground, facing, left)","[Sitting, ground, facing, left]","[sit, ground, face, left]"
1,2928FF7GcrkeFiDkh,0,1,12,2,5L7PZWTLhnYMGQE8b,/experiment/tangram_B.png,speaker,1,"this guy is kneeling, facing right with arms r...","(this, guy, is, kneeling, ,, facing, right, wi...","[guy, kneeling, ,, facing, right, arms, raised...","[guy, kneel, face, right, arm, raise, arm, tri..."
2,2928FF7GcrkeFiDkh,0,2,24,2,okkXLvTuqNbsZkBZ2,/experiment/tangram_G.png,speaker,1,backwards c arms to the right,"(backwards, c, arms, to, the, right)","[backwards, c, arms, right]","[backwards, c, arm, right]"
3,2928FF7GcrkeFiDkh,0,3,36,2,5L7PZWTLhnYMGQE8b,/experiment/tangram_E.png,speaker,1,bunny ears,"(bunny, ears)","[bunny, ears]","[bunny, ear]"
4,2928FF7GcrkeFiDkh,0,4,48,2,okkXLvTuqNbsZkBZ2,/experiment/tangram_A.png,speaker,1,karate kid / plague doctor,"(karate, kid, /, plague, doctor)","[karate, kid, /, plague, doctor]","[karate, kid, plague, doctor]"


In [8]:
# Work on a copy so that d_raw itself is not modified by later cells.
d = d_raw.copy()
# Distinct game ids and targets, in order of first appearance.
# Series.unique() is used instead of pd.unique(series.ravel()): Series.ravel()
# is deprecated in recent pandas releases, and the resulting values are the same.
gameidList = d['gameId'].unique().tolist()
tangramList = d['target'].unique().tolist()

### Look at where conventions were introduced

In [9]:
rows = []

# Pool the lemmas of all rows by (gameId, repNum, target) once, up front.
# Previously the earlier-round rows were re-queried from `d` for every word and
# their lemma lists were flattened through a numpy object array. When a trial
# had several rows whose lemma lists differed in length, that array remained an
# array of lists, so `word in ...` was always False. A plain set of lemmas
# gives the intended membership test in every case and avoids the repeated queries.
# Assumes every entry of d['lemmas'] is an iterable of strings -- TODO confirm
# against lemmatize_doc.
# NOTE(review): as before, the earlier-round lookup is not restricted to
# role == "speaker" -- confirm this is intended.
lemmas_by_trial = {
    key: {lemma for lemmas in group['lemmas'] for lemma in lemmas}
    for key, group in d.groupby(['gameId', 'repNum', 'target'])
}

# For each game, look at referring expressions produced by the speaker on a later round
for name, df in d.query('role == "speaker"').groupby(['gameId', 'repNum']):
    for _, row in df.sort_values('target').iterrows():
        later_rep = row['repNum']
        target = row['target']
        # Unique lemmas of the utterance, minus anything stop() rejects.
        # (Restricting to NOUN / ADJ / VERB tokens was considered here; reconsider if needed.)
        content_words = np.unique([t.lemma_ for t in row['text'] if not stop(t)])
        for word in content_words:
            # Compare against every repetition that precedes this one in the same game.
            for earlier_rep in range(0, later_rep):
                earlier_lemmas = lemmas_by_trial.get((name[0], earlier_rep, target), ())
                match = word in earlier_lemmas
                rows.append([row['gameId'], target,
                             earlier_rep, later_rep, word, match])

In [10]:
# One record per (word, earlier repetition) comparison collected in `rows`.
words_df = pd.DataFrame(
    data=rows,
    columns=['gameId', 'target', 'earlier_rep', 'later_rep', 'word', 'match'],
)

In [11]:
# Write the word-match table to disk without the index column.
words_df.to_csv(
    '../../data/study1/content/word_matches.csv',
    index=False,
)

We can also look at the inverse: the probability that a word used on the current round still appears on the final round.

In [12]:
# rows = []

# # For each game, look at referring expressions produced by director on final round
# for name, rep_df in d.query('role == "speaker"').groupby(['gameId', 'repNum']) :
#     rep_df = rep_df.sort_values('target').reset_index()
#     final_df = d.query('repNum == 5 and role == "speaker" and gameId == "{}"'.format(name[0])).sort_values('target').reset_index()
    
#     # For each word used with each tangram, check whether it also occurred in the final round
#     for i, row in rep_df.iterrows() :
#         target = row['target']
#         content_words = [t.lemma_ for t in row.text 
#                          if t.pos_ in ["NOUN", "ADJ", 'VERB'] 
#                          and not stop(t)]
#         print('content', content_words)
#         query_str = 'target == "{}"'.format(target)
#         print(np.array(list(final_df.query(query_str)['lemmas'])).flatten())
#         for j, word in enumerate(content_words) :
#             final_match = word in np.array(list(final_df.query(query_str)['lemmas'])).flatten()
#             rows.append([row['gameId'], row['repNum'], row['target'], row['playerId'], word, final_match])

In [13]:
# words_df = pd.DataFrame(rows,
#     columns = ['gameId', 'repNum', 'target', 'playerId', 'word', 'final_match']
# )
# words_df.to_csv('../../data/study1/inverse_word_matches.csv')