In [1]:
import csv
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from sklearn.decomposition import PCA
from spatial import get_grid, find_similarity
from utils import contain_punctuation

# Plot appearance: ggplot style sheet, serif fonts, and usetex text rendering
# (usetex=True hands text layout to LaTeX, so a LaTeX installation is expected).
plt.style.use('ggplot')
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

# Raise the csv module's per-field size limit to 500 MiB.
# NOTE(review): presumably needed because each word's whole neighbor list is
# stored in a single CSV field — confirm.
csv.field_size_limit(500 * 1024 * 1024)

# IPython magic: show matplotlib figures inline in the notebook output.
%matplotlib inline

## Loading and Utility Functions

In [6]:
# Load the embedding model from the word2vec-format file 'wiki/wiki.it.vec'.
# NOTE(review): judging by the file name these are Italian Wikipedia vectors — confirm.
word2vec = KeyedVectors.load_word2vec_format('wiki/wiki.it.vec')

In [7]:
# Similarity cut-offs: a neighbor is kept only when its score is strictly
# greater than the threshold.
# Used by get_semantic_neighbors on the word2vec similarity.
semantic_thresh = 0.50
# Used by get_lexical_neighbors on the find_similarity score.
lexical_thresh = 0.95

In [3]:
def get_words(language='en', words=None, length=None, isogram=False, max_len=None):
    """Return a word list, loading it from disk if needed, then filter it.

    Parameters
    ----------
    language : str
        Selects the pickled list to load when ``words`` is None: 'en' reads
        'data/words_en.pickle', 'it' reads 'data/words_it.pickle'. Any other
        value loads nothing.
    words : list of str, optional
        Word list to filter. When given, nothing is read from disk.
    length : int, optional
        Keep only words with exactly this many characters.
    isogram : bool
        When True, keep only words in which no character repeats.
    max_len : int, optional
        Keep only words strictly shorter than this many characters.

    Returns
    -------
    list of str
        The filtered list. When no filter is active the input list object is
        returned unchanged (``None`` if nothing was supplied or loaded).
    """
    if words is None:
        # Decide which pickle, if any, provides the default vocabulary.
        if language == 'en':
            pickle_path = 'data/words_en.pickle'
        elif language == 'it':
            pickle_path = 'data/words_it.pickle'
        else:
            pickle_path = None

        if pickle_path is not None:
            with open(pickle_path, 'rb') as handle:
                words = pickle.load(handle)

    # Each filter is applied in turn and only when requested.
    if length is not None:
        words = list(filter(lambda item: len(item) == length, words))

    if isogram == True:
        words = list(filter(lambda item: len(item) == len(set(item)), words))

    if max_len is not None:
        words = list(filter(lambda item: len(item) < max_len, words))

    return words

def filter_words(template):
    """Return the entries of the notebook-level ``words`` list that start with ``template``.

    Only the first two characters of each word are compared, so ``template``
    is expected to be a two-character prefix.
    """
    matches = []
    for candidate in words:
        if candidate[:2] == template:
            matches.append(candidate)
    return matches

def get_semantic_neighbors(word):
    """Serialize the semantic neighbors of ``word`` as a single string.

    Up to 1,000,000 entries are requested from ``word2vec.similar_by_word``;
    those whose similarity is strictly greater than ``semantic_thresh`` are
    written as ``name:similarity`` items.

    Parameters
    ----------
    word : str
        Word to look up in the ``word2vec`` model.

    Returns
    -------
    str
        The retained items concatenated; every item except the one at
        position 0 of the result list is preceded by a single space.
    """
    ranked = word2vec.similar_by_word(word, topn=1000000)

    chunks = []
    for position, entry in enumerate(ranked):
        if entry[1] > semantic_thresh:
            # The separator depends on the entry's position in the ranked
            # list, not on whether anything was emitted before it.
            separator = ' ' if position != 0 else ''
            chunks.append(separator + entry[0] + ':' + str(entry[1]))

    return ''.join(chunks)

def get_lexical_neighbors(target, words):
    """Serialize the lexical neighbors of ``target`` found among ``words``.

    Every candidate is scored with ``find_similarity(candidate, target)`` and
    kept when the score is strictly greater than ``lexical_thresh``.

    Parameters
    ----------
    target : str
        Word whose neighbors are wanted.
    words : iterable of str
        Candidate words, scanned in order.

    Returns
    -------
    str
        ``name:score`` items concatenated. An item is preceded by a space
        unless it is the very first candidate in ``words``, so the result
        starts with a space whenever the first candidate is not retained.
    """
    chunks = []

    for position, candidate in enumerate(words):
        score = find_similarity(candidate, target)

        if score > lexical_thresh:
            # Separator is tied to the candidate's index in ``words``.
            separator = ' ' if position != 0 else ''
            chunks.append(separator + candidate + ':' + str(score))

    return ''.join(chunks)

In [4]:
# Candidate vocabulary used by filter_words: the Italian word list restricted
# to words shorter than 26 characters (get_words treats max_len as exclusive).
# Alternative kept for reference — take the vocabulary from the embedding model:
#words = [word for word in word2vec.vocab if not contain_punctuation(word)]
words = get_words(language='it', max_len=26)

## Load experiment data (primes, targets and RTs)

In [5]:
# Read every row of the experiment file (columns as written by save_txt_to_csv:
# target, prime, tick, condition, RT). newline='' is the mode the csv module
# expects for files handed to csv.reader.
with open('data/experiment_data.csv', 'r', newline='') as f:
    data = list(csv.reader(f))

# Keep only the rows whose third column is '1'.
targets = [d[0] for d in data if d[2] == '1']
primes = [d[1] for d in data if d[2] == '1']

# All words to process: targets first, then primes. Concatenating builds a new
# list; the previous `desired = targets` + `desired.extend(primes)` aliased the
# two names and silently appended every prime to `targets` as well.
desired = targets + primes

## Save semantic neighbors of words

In [33]:
# Write one CSV row per word: the word and its serialized semantic neighbors.
# newline='' is required for files handed to csv.writer; without it, platforms
# that translate line endings emit a blank line after every row.
# NOTE(review): the slice start 556 looks like a manual resume point from an
# earlier, interrupted run — confirm before re-running from scratch.
with open('data/semantic_neighbors1.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['word', 'neighbors'])

    for i, d in enumerate(desired[556:]):
        print(i, d)  # progress trace
        writer.writerow([d, get_semantic_neighbors(d)])

0 visino
1 zainetto


## Lexical Neighbors of words

In [20]:
# Write one CSV row per word: the word and its serialized lexical neighbors.
# Candidates are limited to vocabulary words sharing the word's first two
# letters (filter_words), which keeps the pairwise comparison affordable.
# newline='' is required for files handed to csv.writer; without it, platforms
# that translate line endings emit a blank line after every row.
# NOTE(review): the slice start 532 looks like a manual resume point from an
# earlier, interrupted run — confirm before re-running from scratch.
with open('data/lexical_neighbors1.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['word', 'neighbors'])

    for i, d in enumerate(desired[532:]):
        print(i, d)  # progress trace
        writer.writerow([d, get_lexical_neighbors(d, filter_words(d[:2]))])

0 sassata
1 scarpina
2 scimmietta
3 servile
4 soldino
5 stilista
6 tastiera
7 tavolata
8 taxista
9 tazzina
10 tendone
11 terrestre
12 tigrotto
13 torello
14 torretta
15 tortina
16 trombetta
17 unghietta
18 vasetto
19 veliero
20 vestitino
21 vetrata
22 vigilanza
23 villona
24 visino
25 zainetto


### Read the lexical neighbor list and find the distance of the top neighbors on the semantic map

In [16]:
# How many top lexical neighbors to keep per target (None keeps all of them).
TOP_ITEMS = None

final_words = []
final_neighbors = []

with open('data/lexical_neighbors.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row

    for row in reader:
        # Ignore blank or truncated lines instead of failing on row[0] / row[1].
        if len(row) < 2:
            continue

        target = row[0]

        # The field holds "name:score" items separated by single spaces and may
        # start with a space. Dropping empty pieces (rather than always
        # discarding the first piece) handles both cases without losing a real
        # neighbor when there is no leading space.
        neighbors = [n for n in row[1].split(' ') if n]
        # rsplit keeps the score intact even if a name itself contains ':'.
        names = [n.rsplit(':', 1)[0] for n in neighbors]
        scores = [float(n.rsplit(':', 1)[1]) for n in neighbors]

        # Rank the neighbors by descending lexical score.
        order = np.flip(np.argsort(scores), axis=0)
        if TOP_ITEMS is not None:
            order = order[:TOP_ITEMS]
        names = [names[o] for o in order]

        try:
            similarities = [word2vec.similarity(target, n) for n in names]
        except KeyError:
            # The target or one of its neighbors is missing from the embedding
            # vocabulary: skip this target. Any other error now propagates.
            continue

        # Output item format: name:lexical_score:semantic_similarity:total_neighbor_count
        neighbors = ' '.join(
            neighbors[o] + ':' + str(s) + ':' + str(len(scores))
            for o, s in zip(order, similarities)
        )

        final_words.append(target)
        final_neighbors.append(neighbors)

print(len(final_words))
rows = zip(final_words, final_neighbors)

# One CSV per target word; newline='' avoids blank lines between rows on
# platforms that translate line endings.
for row in rows:
    with open('data/neighbors/' + row[0] + '.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['word', 'neighbors'])
        writer.writerow(row)

553


### Read semantic neighbors csv and create csv for each word

In [21]:
header = ['word', 'neighbors']

with open('data/semantic_neighbors.csv', 'r', newline='') as f:
    # Skip blank lines (row[0] would fail on them) and the header row, which
    # would otherwise be written out as a bogus 'word.csv' file.
    rows = [row for row in csv.reader(f) if row and row != header]

# One CSV per word: the header followed by that word's row. newline='' avoids
# blank lines between rows on platforms that translate line endings.
for row in rows:
    with open('data/semantic_neighbors/' + row[0] + '.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerow(row)

In [None]:
def get_words(language='en', words=None, length=None, isogram=True, max_len=None):
    """Return a word list, loading it from disk if needed, then filter it.

    NOTE(review): this redefines the ``get_words`` from an earlier cell. It
    now accepts the same arguments (``language='it'`` and ``max_len``) so that
    existing calls keep working after this cell runs; the remaining difference
    is that ``isogram`` defaults to True here.

    Parameters
    ----------
    language : str
        Selects the pickled list to load when ``words`` is None: 'en' reads
        'data/words_en.pickle', 'it' reads 'data/words_it.pickle'.
    words : list of str, optional
        Word list to filter. When given, nothing is read from disk.
    length : int, optional
        Keep only words with exactly this many characters.
    isogram : bool
        When True (the default), keep only words with no repeated character.
    max_len : int, optional
        Keep only words strictly shorter than this many characters.

    Returns
    -------
    list of str
        The filtered word list.
    """
    if words is None and language == 'en':
        with open('data/words_en.pickle', 'rb') as f:
            words = pickle.load(f)

    if words is None and language == 'it':
        with open('data/words_it.pickle', 'rb') as f:
            words = pickle.load(f)

    if length is not None:
        words = [w for w in words if len(w) == length]

    if isogram == True:
        words = [w for w in words if len(set(w)) == len(w)]

    if max_len is not None:
        # Exclusive bound, matching the earlier definition.
        words = [w for w in words if len(w) < max_len]

    return words

def get_semantic_vectors(words):
    """Look up each word in the ``word2vec`` model and return the vectors as one NumPy array.

    The rows follow the order of ``words``.
    """
    embeddings = []
    for token in words:
        embeddings.append(word2vec[token])
    return np.array(embeddings)

def get_lexical_vectors(words, dimensions):
    """Project the grid representation of each word onto its leading principal components.

    Parameters
    ----------
    words : iterable of str
        Words to encode; each is turned into a grid with
        ``get_grid(w, repeat=1, normalize=True)``.
    dimensions : int
        Passed to PCA as ``n_components``.

    Returns
    -------
    numpy.ndarray
        The PCA-transformed grids, one row per word.
    """
    grids = np.array([get_grid(w, repeat=1, normalize=True) for w in words])
    # fit_transform fits the model itself, so the separate fit() call that used
    # to precede it only ran the decomposition a second time.
    return PCA(n_components=dimensions).fit_transform(grids)

def save_txt_to_csv():
    """Convert 'data/data.txt' into 'data/experiment_data.csv'.

    The text file is read with pandas (tab-delimited). Each value of its first
    column is split on spaces and the first five non-empty fields are written
    as one CSV row: target, prime, tick, condition, RT. No header row is
    written.
    """
    # Maybe required later.
    data = pd.read_csv('data/data.txt', delimiter='\t')

    rows = []
    # Iterate over the column's values directly rather than looking each one
    # up by integer label, which only works with a default 0..n-1 index.
    for line in data[data.columns[0]]:
        fields = [r for r in line.split(' ') if r != '']
        # [1:-1] drops the first and last character of a field
        # (presumably surrounding quotes or brackets — TODO confirm).
        rows.append((
            fields[0],        # target
            fields[1][1:-1],  # prime
            fields[2][1:-1],  # tick
            fields[3][1:-1],  # condition
            fields[4],        # RT
        ))

    # newline='' is required for files handed to csv.writer; without it,
    # platforms that translate line endings emit a blank line after every row.
    with open('data/experiment_data.csv', 'w', newline='') as f:
        csv.writer(f).writerows(rows)