In [1]:
import csv
from gensim.models import KeyedVectors
import numpy as np
import pickle
from spatial import get_grid, find_similarity
from utils import contain_punctuation

# Raise the csv module's maximum field size to 500 * 1024 * 1024 so very large
# fields can be parsed; the call returns the limit that was in effect before.
csv.field_size_limit(500 * 1024 * 1024)

131072

## Loading and Utility Functions

In [2]:
# Load the word vectors stored in word2vec format at ../../wiki/wiki.it.vec.
# The resulting `word2vec` object is used as a module-level global by
# get_semantic_neighbors and get_semantic_similarity.
# NOTE(review): presumably Italian Wikipedia vectors, judging by the path - confirm.
word2vec = KeyedVectors.load_word2vec_format('../../wiki/wiki.it.vec')

In [3]:
# Minimum similarity (strictly greater than) for a word to be reported by
# get_semantic_neighbors.
semantic_thresh = 0.50
# Minimum similarity (strictly greater than) for a word to be reported by
# get_lexical_neighbors.
lexical_thresh = 0.95

In [4]:
def get_words(language='en', words=None, length=None, isogram=False, max_len=None):
    """Return a word list, optionally loaded from disk and then filtered.

    Parameters
    ----------
    language : str
        Selects the pickled word list to load when `words` is None.
        'en' and 'it' are supported. Ignored when `words` is supplied.
    words : iterable of str, optional
        Words to filter. When None, the list for `language` is unpickled.
    length : int, optional
        Keep only words with exactly this many characters.
    isogram : bool
        When true, keep only words in which no character repeats.
    max_len : int, optional
        Keep only words strictly shorter than `max_len` (exclusive bound).

    Returns
    -------
    The filtered words as a list. If no filter is requested, the loaded or
    supplied `words` object is returned unchanged.

    Raises
    ------
    ValueError
        If `words` is None and `language` is not a supported language.
    """
    word_files = {
        'en': '../../data/english/words_en.pickle',
        'it': '../../data/italian/words_it.pickle',
    }

    if words is None:
        if language not in word_files:
            # Previously an unknown language left `words` as None, which was
            # either returned silently or failed later with a TypeError.
            raise ValueError(
                'Unsupported language {!r}; expected one of {}'.format(
                    language, sorted(word_files)))

        # NOTE(review): pickle.load can execute arbitrary code; only use it
        # on word-list files that come from a trusted source.
        with open(word_files[language], 'rb') as f:
            words = pickle.load(f)

    if length is not None:
        words = [w for w in words if len(w) == length]

    if isogram:
        words = [w for w in words if len(set(w)) == len(w)]

    if max_len is not None:
        words = [w for w in words if len(w) < max_len]

    return words

def filter_words(template):
    """Return the entries of the module-level `words` whose first two
    characters equal `template`.
    """
    matches = []
    for candidate in words:
        prefix = candidate[:2]
        if prefix == template:
            matches.append(candidate)
    return matches

def get_semantic_neighbors(word):
    """Return the semantic neighbors of `word` as a single string.

    Queries the module-level `word2vec` model for up to 1,000,000 entries
    similar to `word` and keeps those whose similarity is strictly greater
    than the module-level `semantic_thresh`.

    Parameters
    ----------
    word : str
        The word to look up in the model.

    Returns
    -------
    str
        Space-separated 'neighbor:similarity' entries, in the order returned
        by the model. Empty string when no neighbor passes the threshold.
    """
    neighbors = word2vec.similar_by_word(word, topn=1000000)

    # Joining the kept entries puts a separator only between entries that
    # were actually emitted, and avoids repeated string concatenation.
    return ' '.join(
        neighbor + ':' + str(similarity)
        for neighbor, similarity in neighbors
        if similarity > semantic_thresh
    )

def get_lexical_neighbors(target, words):
    """Return the lexical neighbors of `target` among `words` as a string.

    Each candidate is compared with `target` through `find_similarity`; a
    candidate is kept when its similarity is strictly greater than the
    module-level `lexical_thresh`.

    Parameters
    ----------
    target : str
        The word whose neighbors are wanted.
    words : iterable of str
        Candidate words to compare against `target`.

    Returns
    -------
    str
        Space-separated 'word:similarity' entries in the order of `words`.
        Empty string when no candidate passes the threshold.
    """
    entries = []

    for candidate in words:
        sim = find_similarity(candidate, target)

        if sim > lexical_thresh:
            entries.append(candidate + ':' + str(sim))

    # The separator used to be keyed to the candidate's index, so the result
    # began with a stray space whenever the first candidate was not a
    # neighbor. Joining only the kept entries avoids that.
    return ' '.join(entries)

def get_lexical_similarity(w1, w2):
    """Return the lexical similarity of `w1` and `w2`.

    Thin wrapper that forwards both words, in order, to `find_similarity`.
    """
    similarity = find_similarity(w1, w2)
    return similarity

def get_semantic_similarity(w1, w2):
    """Return the semantic similarity of `w1` and `w2`.

    Thin wrapper that forwards both words, in order, to the `similarity`
    method of the module-level `word2vec` model.
    """
    similarity = word2vec.similarity(w1, w2)
    return similarity

In [5]:
# Load the Italian word list, keeping only words shorter than 26 characters
# (get_words treats max_len as an exclusive bound). filter_words reads this
# module-level `words` list.
words = get_words(language='it', max_len=26)

## Load experiment data (primes, targets and RTs)

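<b>Read data from the original experiment CSV, compute the lexical and semantic distances, and add eight new columns to the data: the two distances, their sum and product, the target and prime lengths, and the target and prime frequencies.</b>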

<b> TODO: Think of a concrete density metric which can then be added as a new column.</b>

In [31]:
# Map each word to the count listed next to it in freq-it.txt. The count is
# stored as the string read from the file, not converted to a number.
freqs = {}

# newline='' is what the csv module recommends for files passed to csv.reader.
with open('../../data/italian/freq-it.txt', 'r', newline='') as f:
    reader = csv.reader(f, delimiter='\t')

    for row in reader:
        # csv.reader yields an empty list for a blank line; skip it rather
        # than failing on row[0].
        if not row:
            continue

        # The first tab-separated field holds the word and its count,
        # separated by single spaces; split it once and reuse the pieces.
        fields = row[0].split(' ')
        word, count = fields[0], fields[1]
        freqs[word] = count

In [None]:
# NOTE(review): `metrics` is initialised here but is not used by any other
# cell visible in this notebook - confirm whether it is still needed.
metrics = []



In [44]:
# Build the enriched rows: each input row (target in column 0, prime in
# column 1) gets eight extra values appended - lexical distance, semantic
# distance, their sum, their product, target length, prime length, target
# frequency and prime frequency.
data = []
# Rows that could not be processed, kept so they can be inspected afterwards.
skipped_rows = []

# newline='' is what the csv module recommends for files passed to csv.reader.
with open('../../data/italian/rt.csv', 'r', newline='') as f:
    reader = csv.reader(f)

    for row in reader:
        try:
            target, prime = row[0], row[1]

            # Distances are defined as 1 - similarity.
            l_dist = 1 - get_lexical_similarity(target, prime)
            s_dist = 1 - get_semantic_similarity(target, prime)

            # Build a new list instead of extending `row` in place.
            new_row = row + [l_dist, s_dist, l_dist + s_dist, l_dist * s_dist,
                             len(target), len(prime), freqs[target], freqs[prime]]
        except Exception:
            # Best effort: a row that cannot be processed is skipped, as
            # before. Catching Exception instead of using a bare except lets
            # KeyboardInterrupt and SystemExit stop the cell.
            skipped_rows.append(row)
            continue

        data.append(new_row)

print('Processed {} rows, skipped {} rows'.format(len(data), len(skipped_rows)))

In [47]:
# Write the enriched rows to rt_new.csv with a header row.
# newline='' is what the csv module requires for files passed to csv.writer;
# without it, platforms that translate '\n' on write (e.g. Windows) produce
# an extra blank line after every row.
with open('../../data/italian/rt_new.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    header = ['Target', 'Prime', 'Tick(s)', 'RT', 'LD', 'SD', 'LD+SD', 'LD*SD',
              'Target_Length', 'Prime_Length', 'Target_Frequency', 'Prime_Frequency']

    writer.writerow(header)
    writer.writerows(data)