<a href="https://colab.research.google.com/github/tnusser/SensorimotorClassifier/blob/master/SensorimotorClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install bert-embedding



In [2]:
import csv
import pandas as pd
import numpy as np
import ast

import gzip
import shutil

from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from bert_embedding import BertEmbedding

In [75]:
def parse_to_dict(file_path):
    """
    Creates hashmap with word as key and concept vector as value
    :param file_path: path to the conceptnet dictionary file
    :return: hashmap of word and vectors
    """
    concept_hash = {}
    with open(file_path, encoding="utf8") as f:
        text = f.readlines()[1:]
        for line in text:
            first_item = line.split(" ").__getitem__(0)
            concept_hash[first_item] = line
    f.close()
    return concept_hash

def find_word(embedding, word, dictionary, mode=None):
    """
    Finds embedding vector for a word in the conceptnet hashmap
    :param word: input word to analyze
    :param concept_hash: hashmap of word and conceptnet vector
    :return: returns the appropriate vector or none if its not in the hashmap
    """
    if embedding == "conceptnet":
      if word in dictionary.keys():
          vector = dictionary[word].split(" ")[1:]
          vector = [float(i) for i in vector]
      else:
          vector = []
    if embedding == "bert":
      bert_vec = bert_embedding([word])[0][1]
      if mode == "add":
        vector = np.asarray([sum(x) for x in zip(*bert_vec)])
    return vector

def from_np_array(array_string):
    """
    Converts string array from imported csv to an actual
    numpy array
    :array_string input string which can be represented as np array
    """
    array_string = ','.join(array_string.replace('[ ', '[').split())
    return np.array(ast.literal_eval(array_string))

In [4]:
# Sensorimotor Dataset
!wget -O "data.csv" "https://osf.io/48wsc/download"

# ConceptNet Word Embeddings
!wget https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz

--2020-08-11 13:20:37--  https://osf.io/48wsc/download
Resolving osf.io (osf.io)... 35.190.84.173
Connecting to osf.io (osf.io)|35.190.84.173|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://files.de-1.osf.io/v1/resources/rwhs6/providers/osfstorage/5cc2d6441906ec0017056ba8?action=download&direct&version=1 [following]
--2020-08-11 13:20:38--  https://files.de-1.osf.io/v1/resources/rwhs6/providers/osfstorage/5cc2d6441906ec0017056ba8?action=download&direct&version=1
Resolving files.de-1.osf.io (files.de-1.osf.io)... 35.186.249.111
Connecting to files.de-1.osf.io (files.de-1.osf.io)|35.186.249.111|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17196336 (16M) [text/csv]
Saving to: ‘data.csv’


2020-08-11 13:20:46 (4.61 MB/s) - ‘data.csv’ saved [17196336/17196336]

--2020-08-11 13:20:47--  https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz
Resolving conceptnet.s3.amazonaws.com (conceptnet

In [5]:
with gzip.open("numberbatch-en-19.08.txt.gz",'rb') as f_in:
    with open('numberbatch-en.txt','wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
concept_hash = parse_to_dict("numberbatch-en.txt")

In [6]:
df = pd.read_csv("data.csv", usecols=["Word", "Auditory.mean", "Gustatory.mean", "Haptic.mean", "Interoceptive.mean", "Olfactory.mean", "Visual.mean"])
df.columns = ["word", "auditory", "gustatory", "haptic", "interoceptive", "olfactory", "visual"]
df["word"] = df["word"].str.lower()
df['word'] = df['word'].str.replace(' ','_')
df.head()

Unnamed: 0,word,auditory,gustatory,haptic,interoceptive,olfactory,visual
0,a,2.214286,0.0,0.428571,0.0,0.0,2.428571
1,a_cappella,4.333333,0.0,0.222222,0.722222,0.0,1.666667
2,aardvark,1.625,0.5625,1.625,0.0625,1.25,4.125
3,aback,1.294118,0.058824,0.294118,1.352941,0.0,2.823529
4,abacus,1.555556,0.166667,3.722222,0.277778,0.111111,3.944444


In [50]:
vecs = []
df["max_val"] = df.iloc[:,1:7].idxmax(axis=1)
for index, row in df.iterrows():
    word_vec = find_word(embedding="conceptnet", word=row['word'], dictionary=concept_hash)
    if word_vec == None:
        df.drop(index, inplace=True)
    else:
        vecs.append(word_vec)
df["vec"] = vecs
df.head()

Unnamed: 0,word,auditory,gustatory,haptic,interoceptive,olfactory,visual,max_val,vec
0,a,2.214286,0.0,0.428571,0.0,0.0,2.428571,visual,"[-0.1011, -0.0806, -0.0092, 0.0901, -0.0323, -..."
2,aardvark,1.625,0.5625,1.625,0.0625,1.25,4.125,visual,"[0.0341, 0.0697, 0.0826, -0.0504, -0.1586, 0.0..."
3,aback,1.294118,0.058824,0.294118,1.352941,0.0,2.823529,visual,"[-0.0821, -0.0935, 0.0306, -0.0153, 0.0239, -0..."
4,abacus,1.555556,0.166667,3.722222,0.277778,0.111111,3.944444,visual,"[-0.0015, 0.0511, -0.0005, 0.0978, -0.1432, -0..."
5,abandon,0.941176,0.117647,0.294118,2.117647,0.058824,2.176471,visual,"[-0.1269, -0.1875, -0.0127, -0.0012, 0.1389, 0..."


In [51]:
bert_embedding = BertEmbedding()


In [84]:
df_bert = pd.read_csv("data.csv", usecols=["Word", "Auditory.mean", "Gustatory.mean", "Haptic.mean", "Interoceptive.mean", "Olfactory.mean", "Visual.mean"])
df_bert.columns = ["word", "auditory", "gustatory", "haptic", "interoceptive", "olfactory", "visual"]
vecs = []
df_bert["max_val"] = df_bert.iloc[:,1:7].idxmax(axis=1)
for index, row in df_bert.iterrows():
    word_vec = find_word(embedding="bert", word=row["word"], dictionary=None, mode="add")
    if len(word_vec) < 1:
        df_bert.drop(index, inplace=True)
    else:
        vecs.append(word_vec)
    if index % 1000 == 0:
      print(index)
    if index == 100:
      break;
df_bert_sample = df_bert[:101]
df_bert_sample["vec"] = vecs
df_bert_sample.head()
df_bert_sample.to_csv("output")

0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [85]:
from google.colab import files
files.download('output') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [101]:
df_temp = pd.read_csv("output")
df_temp.vec = df_temp.vec.apply(lambda x: from_np_array(x))
df_temp.vec[0].shape

(768,)

In [8]:
 X_train, X_test, y_train, y_test = train_test_split(np.stack(df.vec, axis=0), df.max_val, test_size=0.2, random_state=42)

In [9]:
gnb = GaussianNB()
neigh = KNeighborsClassifier(n_neighbors=3)
clf = GradientBoostingClassifier(random_state=0, verbose=1)
lgr = LogisticRegression(random_state=0, verbose=1)
dtc = DecisionTreeClassifier(random_state=1)

In [10]:
from sklearn.metrics import precision_recall_fscore_support as score
print(classification_report(y_test, y_pred))

NameError: ignored

In [None]:
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

In [None]:
import matplotlib.pyplot as plt
df.groupby(by="max_val").count().word

In [None]:
y_pred = lgr.fit(X_train, y_train).predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
f1_score(y_test, y_pred, average="weighted")

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge