In [6]:
import pandas as pd
import spacy
import random
import os

# Load the spaCy English pipeline once at module level; check_pos() reads this
# global `nlp` when it tags words.
nlp = spacy.load("en_core_web_sm")

In [2]:
def check_pos(words):
    """Return the text of every token that spaCy tags as ``NN``.

    Each entry of ``words`` is converted with ``str()`` and piped through the
    module-level ``nlp`` pipeline. Matching token texts are collected in the
    order they are encountered; duplicates are kept.

    Args:
        words: Iterable of values to tag.

    Returns:
        list[str]: Texts of all tokens whose ``tag_`` is exactly ``"NN"``.
    """
    as_text = (str(entry) for entry in words)
    return [
        tok.text
        for parsed in nlp.pipe(as_text)
        for tok in parsed
        if tok.tag_ == "NN"
    ]

def get_nouns(xl_file):
    """Bucket the nouns of a concreteness spreadsheet by their rating.

    The spreadsheet must provide a ``Word`` column and a ``Conc.M`` column.
    A word is kept only if its value equals the text of some token that
    ``check_pos`` tagged as a noun.

    Buckets (by ``Conc.M``):
        * ``"abstract"``: rating <= 2
        * ``"middle"``:   2 < rating <= 4
        * ``"concrete"``: rating > 4

    A rating of exactly 4 goes to ``"middle"``; previously it matched no
    branch and the word was silently dropped. Rows whose rating is NaN match
    no bucket and are skipped.

    Args:
        xl_file: Path (or file-like object) accepted by ``pandas.read_excel``.

    Returns:
        dict[str, list[tuple]]: ``{"concrete": [...], "middle": [...],
        "abstract": [...]}`` where each item is a ``(word, rating)`` tuple.
    """
    df = pd.read_excel(xl_file)
    words = df['Word'].to_list()
    ratings = df['Conc.M'].to_list()

    # A set makes each membership test O(1) instead of scanning a list once
    # per spreadsheet row.
    noun_texts = set(check_pos(words))

    dictionary_of_ratings = {"concrete": [], "middle": [], "abstract": []}
    for word, rating in zip(words, ratings):
        if word not in noun_texts:
            continue
        if rating <= 2:
            bucket = "abstract"
        elif rating <= 4:
            bucket = "middle"
        elif rating > 4:
            bucket = "concrete"
        else:
            # Every comparison is False for NaN: no bucket applies.
            continue
        dictionary_of_ratings[bucket].append((word, rating))
    return dictionary_of_ratings

# Bucket the nouns found in brysbaert_dataset.xlsx by concreteness rating
# (keys: "concrete", "middle", "abstract"); consumed by get_random_nouns below.
dictionary_of_ratings = get_nouns("brysbaert_dataset.xlsx")

In [3]:
def create_data(wordlist, tag):
    """Convert ``(word, rating)`` pairs into record dicts labelled with ``tag``.

    Args:
        wordlist: Sequence of pairs; item 0 is the word, item 1 its rating.
        tag: Label stored under the ``"tag"`` key of every record.

    Returns:
        list[dict]: One ``{"word", "rating", "tag"}`` dict per input pair,
        in input order.
    """
    return [
        {"word": pair[0], "rating": pair[1], "tag": tag}
        for pair in wordlist
    ]

def get_random_nouns(dictionary, sample_size=100, seed=None):
    """Draw a random sample of words from each concreteness bucket.

    Args:
        dictionary: Mapping with the keys ``"concrete"``, ``"middle"`` and
            ``"abstract"``, each holding a list of ``(word, rating)`` pairs.
        sample_size: Number of words drawn (without replacement) per bucket.
            Defaults to 100.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the draw is reproducible; when ``None`` the module-level
            ``random`` generator is used.

    Returns:
        list[list[dict]]: Three record lists in the order
        ``[concrete, middle, abstract]``, each built by ``create_data``.

    Raises:
        KeyError: If one of the three buckets is missing.
        ValueError: If a bucket holds fewer than ``sample_size`` entries
            (raised by ``random.sample``).
    """
    rng = random if seed is None else random.Random(seed)

    new_dictionaries = []
    # Fixed order: callers index the result positionally (0=concrete,
    # 1=middle, 2=abstract).
    for tag in ("concrete", "middle", "abstract"):
        sampled = rng.sample(dictionary[tag], sample_size)
        new_dictionaries.append(create_data(sampled, tag))
    return new_dictionaries

# Draw the per-bucket random samples; `data` is a list of three record lists
# in the order [concrete, middle, abstract].
data = get_random_nouns(dictionary_of_ratings)

In [4]:
def create_data_table(tag, dictionary_list):
    """Write the records to ``<tag>_dataset.csv`` in the working directory.

    Only the ``word``, ``rating`` and ``tag`` fields are written, in that
    column order and without the DataFrame index.

    Args:
        tag: Prefix of the output file name (``tag + "_dataset.csv"``).
        dictionary_list: List of record dicts, e.g. from ``create_data``.

    Returns:
        None. The CSV file is the only result.
    """
    column_names = ["word", "rating", "tag"]

    df = pd.DataFrame(dictionary_list, columns=column_names)
    filepath = tag + "_dataset.csv"
    # to_csv returns None when given a path, and the file was previously read
    # back into a variable that was never used; both are dropped.
    df.to_csv(filepath, index=False)

#create_data_table("concrete", data[0])
#create_data_table("middle", data[1])
#create_data_table("abstract", data[2])

In [5]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss

  from .autonotebook import tqdm as notebook_tqdm


In [59]:
def get_concrete_words(file):
    """Collect every word rated above 4 from a concreteness spreadsheet.

    Args:
        file: Path to a spreadsheet whose file name contains ``xlsx`` and
            which provides the columns ``Word`` and ``Conc.M``.

    Returns:
        dict: ``{'word': [...]}`` listing, in file order, each ``Word`` whose
        ``Conc.M`` rating is strictly greater than 4.

    Raises:
        ValueError: If the file name does not contain ``xlsx``. Previously
            this case fell off the end of the function and returned ``None``
            implicitly, which only failed later in the caller.
    """
    file_name = os.path.basename(file)
    if 'xlsx' not in file_name:
        raise ValueError(
            f"get_concrete_words expects an .xlsx file, got {file_name!r}"
        )

    df = pd.read_excel(file)
    rated = zip(df['Word'].to_list(), df['Conc.M'].to_list())
    return {'word': [word for word, rating in rated if rating > 4]}

In [60]:
# Collect every word rated above 4 in the spreadsheet as {'word': [...]}.
concrete_words = get_concrete_words('brysbaert_dataset.xlsx')
#df = pd.DataFrame(concrete_words)
#df.to_csv('all_concrete_words.csv', index=False)

In [61]:
# Embed every entry of the 'word' column of all_concrete_words.csv.
# NOTE(review): in this notebook the CSV is only written by the commented-out
# lines of the previous cell, so it must already exist on disk.
df = pd.read_csv('all_concrete_words.csv')
text = df['word']
encoder = SentenceTransformer("paraphrase-mpnet-base-v2")
# NOTE(review): presumably a 2-D array with one embedding row per word; the
# next cell reads vectors.shape[1] as the embedding dimension.
vectors = encoder.encode(text)

In [62]:
# Build a flat L2 index over the word embeddings.
vector_dimension = vectors.shape[1]
index = faiss.IndexFlatL2(vector_dimension)
# The vectors are L2-normalised (the call's return value is not used, so it
# works in place) before being added; on unit-length vectors, ranking by L2
# distance gives the same order as ranking by cosine similarity.
faiss.normalize_L2(vectors)
index.add(vectors)

In [63]:
import numpy as np
# Embed a single query term and normalise it the same way as the indexed
# vectors.
search_text = 'alzheimer'
search_vector = encoder.encode(search_text)
# Wrap the embedding in an outer list so the array gets a leading row axis
# before it is normalised and searched.
_vector = np.array([search_vector])
faiss.normalize_L2(_vector)

In [64]:
# Retrieve the k nearest indexed vectors for the query: `distances` and `ann`
# (row positions in the index) each hold one row per query vector.
k = 3
distances, ann = index.search(_vector, k=k)

In [65]:
# Join the search hits back onto the word table and list the neighbour words.
results = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})
merge = pd.merge(results, df, left_on='ann', right_index=True)
# NOTE(review): this bare expression is not the last line of the cell, so the
# notebook displays nothing for it.
merge
words = text
# `words` is the 'word' Series; each hit is looked up by its index label.
neighbors = [words[idx] for idx in ann[0]]
print(neighbors)

['brain', 'grandmother', 'grandma']


In [66]:
def get_nearest_neighbors(abstract_file, concrete_file='all_concrete_words.csv', k=3):
    """Find the nearest concrete words for every word in ``abstract_file``.

    Both word lists are embedded with the ``paraphrase-mpnet-base-v2``
    sentence-transformer, L2-normalised, and compared through a flat FAISS
    L2 index built over the concrete words.

    Args:
        abstract_file: CSV with a ``word`` column holding the query words.
        concrete_file: CSV with a ``word`` column holding the candidate
            neighbours. Defaults to ``'all_concrete_words.csv'``.
        k: Number of neighbours requested per query word. Defaults to 3.

    Returns:
        list[dict]: One ``{'word': <query>, 'neighbours': [<str>, ...]}`` per
        query word, in file order. A list is shorter than ``k`` only when the
        index holds fewer than ``k`` words.
    """
    # Read both inputs up front so a missing file fails before the model is
    # loaded and the concrete list is embedded.
    concrete_words = pd.read_csv(concrete_file)['word'].to_list()
    words = pd.read_csv(abstract_file)['word'].to_list()
    if not words:
        return []

    encoder = SentenceTransformer("paraphrase-mpnet-base-v2")
    vectors = encoder.encode(concrete_words)

    index = faiss.IndexFlatL2(vectors.shape[1])
    faiss.normalize_L2(vectors)
    index.add(vectors)

    # Embed and search all query words in one batch instead of one
    # encode/search round trip per word.
    query_vectors = encoder.encode(words)
    faiss.normalize_L2(query_vectors)
    _, ann = index.search(query_vectors, k)

    return [
        {
            'word': word,
            # FAISS pads missing results with -1; on a plain list that would
            # silently pick the last word, so those slots are dropped.
            "neighbours": [concrete_words[idx] for idx in hits if idx >= 0],
        }
        for word, hits in zip(words, ann)
    ]

In [67]:
# Look up concrete neighbours for the words of the middle and abstract sample
# files. Each result is a list of {'word', 'neighbours'} dicts.
middle_neighbors = get_nearest_neighbors('middle_dataset.csv')
abstract_neighbors = get_nearest_neighbors('abstract_dataset.csv')

In [68]:
def update_table(existing_file, dictionary):
    """Outer-merge new columns into a CSV on ``word`` and overwrite the file.

    Args:
        existing_file: Path of the CSV to read and then rewrite in place.
        dictionary: Data accepted by ``pandas.DataFrame`` (for example a list
            of dicts) that includes a ``word`` column.

    Returns:
        None. ``existing_file`` is rewritten without the index column.
    """
    combined = pd.read_csv(existing_file).merge(
        pd.DataFrame(dictionary),
        on='word',
        how='outer',
    )
    combined.to_csv(existing_file, index=False)


In [69]:
#update_table('middle_dataset.csv', middle_neighbors)
#update_table('abstract_dataset.csv', abstract_neighbors)

In [10]:
#!pip install flickrapi
import flickrapi
import requests 

In [11]:
# Flickr credentials are read from the environment so that no secret is stored
# in the notebook. SECURITY: a key/secret pair used to be hard-coded here; it
# must be treated as leaked and revoked on Flickr.
flickr_keys = {'key': os.environ.get('FLICKR_API_KEY'),
               'secret': os.environ.get('FLICKR_API_SECRET')}
if not all(flickr_keys.values()):
    raise RuntimeError(
        "Set the FLICKR_API_KEY and FLICKR_API_SECRET environment variables "
        "before running this cell."
    )

key, secret = flickr_keys['key'], flickr_keys['secret']

# 'parsed-json' is requested because get_photos() indexes the search response
# like nested dicts/lists (objects['photos']['photo']).
flickr = flickrapi.FlickrAPI(key, secret, format='parsed-json')
# Fetch Flickr image URLs for the concrete words.
def get_photos(datafile):
    """Search Flickr for each word in ``datafile`` and store the photo URLs.

    Reads the ``word`` column of the CSV, runs one Flickr search per word
    through the module-level ``flickr`` client, and keeps the ``url_c`` value
    of every result that has one. The URL lists are written back to the same
    CSV in a ``photos`` column.

    Note: a later cell defines another function with this same name; once
    that cell has run, ``get_photos`` refers to the later definition.

    Args:
        datafile: Path of the CSV to read and then overwrite in place.

    Returns:
        list[list[str]]: One list of URLs per row, in row order.
    """
    table = pd.read_csv(datafile)
    list_of_urls = []
    for term in table['word'].to_list():
        response = flickr.photos.search(
            text=term,          # search term
            per_page=12,        # number of results per page
            extras='url_c',     # ask for the 'url_c' field on each result
            privacy_filter=1,   # public photos
            safe_search=1,      # safe content only
            sort='relevance',
        )
        # Results that carry no 'url_c' value are skipped.
        candidates = (photo.get('url_c') for photo in response['photos']['photo'])
        list_of_urls.append([link for link in candidates if link is not None])
    table['photos'] = list_of_urls
    table.to_csv(datafile, index=False)
    return list_of_urls

In [12]:
# Query Flickr for the concrete words. Besides returning the URL lists, this
# rewrites concrete_dataset.csv with a 'photos' column.
urls = get_photos('concrete_dataset.csv')

In [13]:
import ast
def download_image(datafile, directory, timeout=30):
    """Download the images listed in ``datafile`` and store their file names.

    The ``photos`` column of the CSV is parsed with ``ast.literal_eval`` into
    one list per row. Every URL in it is downloaded into ``directory`` under
    the last path segment of the URL. The column is then replaced by the
    lists of local file names and the CSV is overwritten in place.

    Changes from the earlier behaviour:
        * Entries that are not ``http(s)`` URLs are taken to be file names
          written by a previous run and are kept unchanged, so the function
          can be re-run on its own output.
        * Requests use a timeout, and failed downloads (connection errors or
          HTTP error statuses) are reported and skipped instead of an error
          page being written to disk as an image.
        * The stored name is taken from the URL directly, so it no longer
          depends on the platform's path separator.

    Args:
        datafile: Path of the CSV to read and then overwrite.
        directory: Target directory; created if it does not exist.
        timeout: Seconds passed to ``requests.get``. Defaults to 30.

    Returns:
        None.
    """
    os.makedirs(directory, exist_ok=True)

    df = pd.read_csv(datafile)
    df['photos'] = df['photos'].apply(ast.literal_eval)

    filenames = []
    for url_list in df['photos'].to_list():
        image_names = []
        for url in url_list:
            if not url.startswith(('http://', 'https://')):
                # Already a local file name from an earlier run.
                image_names.append(url)
                continue

            image_name = url.split('/')[-1]
            try:
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()
            except requests.RequestException as exc:
                print(f"Skipping {url}: {exc}")
                continue

            with open(os.path.join(directory, image_name), "wb") as outputfile:
                outputfile.write(response.content)
            image_names.append(image_name)
        filenames.append(image_names)

    df['photos'] = filenames
    df.to_csv(datafile, index=False)

In [14]:
# Download the photos listed in concrete_dataset.csv into the "images"
# directory; the CSV's 'photos' column is rewritten with the file names.
download_image('concrete_dataset.csv', "images")

In [15]:
import ast
# Fetch Flickr image URLs for the abstract and middle words, searching by their concrete neighbours.
def get_photos(datafile):
    """Search Flickr for each row's neighbour words and store the photo URLs.

    The ``neighbours`` column of the CSV is parsed with ``ast.literal_eval``
    into one list of words per row. Each neighbour is searched through the
    module-level ``flickr`` client, and the ``url_c`` values found for all of
    a row's neighbours are gathered into a single flat list. The lists are
    written back to the same CSV in a ``photos`` column.

    Note: this definition replaces the earlier ``get_photos`` (which searches
    the ``word`` column) from the moment this cell runs.

    Args:
        datafile: Path of the CSV to read and then overwrite in place.

    Returns:
        list[list[str]]: One flat list of URLs per row, in row order.
    """
    table = pd.read_csv(datafile)
    neighbour_lists = table['neighbours'].apply(ast.literal_eval)

    list_of_urls = []
    for neighbour_list in neighbour_lists:
        collected = []
        for term in neighbour_list:
            response = flickr.photos.search(
                text=term,          # search term
                per_page=4,         # number of results per page
                extras='url_c',     # ask for the 'url_c' field on each result
                privacy_filter=1,   # public photos
                safe_search=1,      # safe content only
                sort='relevance',
            )
            # Results that carry no 'url_c' value are skipped.
            candidates = (photo.get('url_c') for photo in response['photos']['photo'])
            collected.extend(link for link in candidates if link is not None)
        list_of_urls.append(collected)

    table['photos'] = list_of_urls
    table.to_csv(datafile, index=False)
    return list_of_urls

In [16]:
# Uses the neighbour-based get_photos defined in the previous cell. Each call
# also rewrites its CSV with a 'photos' column.
middle_photos = get_photos('middle_dataset.csv')
abstract_photos = get_photos('abstract_dataset.csv')

In [17]:
# Download the photos listed for the middle words into the "images" directory.
download_image('middle_dataset.csv', "images")

In [None]:
# Download the photos listed for the abstract words into the "images" directory.
download_image('abstract_dataset.csv', "images")

In [None]:
# Stack the three per-category tables into a single CSV.
conc = pd.read_csv('concrete_dataset.csv')
mid = pd.read_csv('middle_dataset.csv')
# Named `abs_df` rather than `abs`: binding `abs` at notebook level would
# shadow the built-in abs() for every cell run afterwards.
abs_df = pd.read_csv('abstract_dataset.csv')

# ignore_index=True renumbers the rows 0..n-1 across the three tables.
merged_data = pd.concat([conc, mid, abs_df], ignore_index=True)
merged_data.to_csv('merged_data.csv', index=False)