In [14]:
import csv
import gzip
import shutil

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [8]:
!wget "https://osf.io/48wsc/download"

--2020-08-07 14:27:46--  https://osf.io/48wsc/download
Resolving osf.io (osf.io)... 35.190.84.173
Connecting to osf.io (osf.io)|35.190.84.173|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://files.de-1.osf.io/v1/resources/rwhs6/providers/osfstorage/5cc2d6441906ec0017056ba8?action=download&direct&version=1 [following]
--2020-08-07 14:27:46--  https://files.de-1.osf.io/v1/resources/rwhs6/providers/osfstorage/5cc2d6441906ec0017056ba8?action=download&direct&version=1
Resolving files.de-1.osf.io (files.de-1.osf.io)... 35.186.249.111
Connecting to files.de-1.osf.io (files.de-1.osf.io)|35.186.249.111|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17196336 (16M) [text/csv]
Saving to: ‘download.2’


2020-08-07 14:27:48 (65.1 MB/s) - ‘download.2’ saved [17196336/17196336]



In [9]:
# Source column -> short name used by the rest of the notebook.
norm_columns = {
    "Word": "word",
    "Auditory.mean": "auditory",
    "Gustatory.mean": "gustatory",
    "Haptic.mean": "haptic",
    "Interoceptive.mean": "interoceptive",
    "Olfactory.mean": "olfactory",
    "Visual.mean": "visual",
}

# Rename by *name*, not by position: read_csv returns the selected columns in
# file order regardless of how `usecols` is ordered, so assigning a positional
# list to df.columns would silently mislabel columns if the file order differed.
df = pd.read_csv("download", usecols=list(norm_columns)).rename(columns=norm_columns)

# Pin the column order so later positional/label slices see word first,
# followed by the six rating columns.
df = df[list(norm_columns.values())]

# Normalise words to the lookup form used for the embedding keys:
# lower-case, with spaces replaced by underscores (literal replace, no regex).
df["word"] = df["word"].str.lower().str.replace(" ", "_", regex=False)
df.head()

Unnamed: 0,word,auditory,gustatory,haptic,interoceptive,olfactory,visual
0,a,2.214286,0.0,0.428571,0.0,0.0,2.428571
1,a_cappella,4.333333,0.0,0.222222,0.722222,0.0,1.666667
2,aardvark,1.625,0.5625,1.625,0.0625,1.25,4.125
3,aback,1.294118,0.058824,0.294118,1.352941,0.0,2.823529
4,abacus,1.555556,0.166667,3.722222,0.277778,0.111111,3.944444


In [10]:
def parse_to_dict(file_path):
    """
    Creates hashmap with word as key and the raw text line as value.

    The first line of the file is skipped. Every remaining line is keyed by
    the text before its first space, and the stored value is the complete,
    unmodified line (trailing newline included); ``find_word`` is what later
    splits that line into floats. If a key occurs more than once, the last
    occurrence wins.

    :param file_path: path to the conceptnet dictionary file
    :return: hashmap of word -> raw line from the file
    """
    concept_hash = {}
    # The context manager closes the file; no explicit close() is needed.
    with open(file_path, encoding="utf8") as f:
        # Skip the first line without failing on an empty file.
        next(f, None)
        # Stream line by line instead of materialising the whole file in a list.
        for line in f:
            first_item = line.split(" ", 1)[0]
            concept_hash[first_item] = line
    return concept_hash

def find_word(word, concept_hash):
    """
    Finds conceptnet vector for a word in the conceptnet hashmap

    The stored value is expected to be a text line of the form
    ``"<word> <number> <number> ..."``; everything after the first space is
    parsed as floats. Tokens are split on any whitespace, so a trailing
    space or newline at the end of the line does not break parsing.

    :param word: input word to analyze
    :param concept_hash: hashmap of word and raw conceptnet line
    :return: list of floats for the word, or None if it is not in the hashmap
    :raises ValueError: if a token after the word is not a valid float
    """
    # Single lookup instead of a membership test followed by indexing.
    line = concept_hash.get(word)
    if line is None:
        return None
    # Drop the leading key; what remains (possibly empty) is the vector text.
    _, _, vector_text = line.partition(" ")
    return [float(token) for token in vector_text.split()]

In [11]:
file = !wget https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz
  
import gzip
import shutil
with gzip.open("numberbatch-en-17.06.txt.gz",'rb') as f_in:
    with open('numberbatch-en.txt','wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
concept_hash = parse_to_dict("numberbatch-en.txt")

In [12]:
# Label every word with the name of its highest-rated column.
modalities = ["auditory", "gustatory", "haptic", "interoceptive", "olfactory", "visual"]
df["max_val"] = df[modalities].idxmax(axis=1)

# Look up every word once; entries are None for words without an embedding.
looked_up = [find_word(word, concept_hash) for word in df["word"]]
has_vec = [vec is not None for vec in looked_up]

# Keep only the rows that have an embedding, in one pass. This replaces
# dropping rows one at a time from the frame while iterating over it, and
# keeps the original index labels of the surviving rows.
df = df.loc[has_vec].copy()

# `vecs` holds the found vectors, aligned row-for-row with the filtered frame.
vecs = [vec for vec in looked_up if vec is not None]
df["vec"] = vecs
df.head()

Unnamed: 0,word,auditory,gustatory,haptic,interoceptive,olfactory,visual,max_val,vec
0,a,2.214286,0.0,0.428571,0.0,0.0,2.428571,visual,"[0.1649, 0.1359, 0.0912, -0.1026, -0.0617, 0.0..."
2,aardvark,1.625,0.5625,1.625,0.0625,1.25,4.125,visual,"[-0.0285, -0.0678, 0.0822, 0.0536, 0.0536, 0.1..."
3,aback,1.294118,0.058824,0.294118,1.352941,0.0,2.823529,visual,"[0.0945, 0.0931, 0.0303, 0.0072, 0.0287, -0.00..."
4,abacus,1.555556,0.166667,3.722222,0.277778,0.111111,3.944444,visual,"[0.0149, -0.0317, 0.0713, -0.0536, -0.0918, 0...."
5,abandon,0.941176,0.117647,0.294118,2.117647,0.058824,2.176471,visual,"[0.1708, 0.1615, -0.0372, 0.0039, 0.0269, -0.1..."


In [15]:
# Positional hold-out split: the first 30000 rows train, the remainder test.
# NOTE(review): the rows appear to be in alphabetical order (see the head()
# output above), so the test set would be the alphabetical tail of the
# vocabulary -- confirm this is intended rather than a shuffled split.
split_at = 30000
train, test = df.iloc[:split_at], df.iloc[split_at:]

# Stack the per-row vectors into 2-D feature matrices; labels are the
# max_val column of the matching rows.
X_train, y_train = np.stack(train["vec"].tolist()), train["max_val"]
X_test, y_test = np.stack(test["vec"].tolist()), test["max_val"]

In [38]:
# Instantiate the candidate classifiers (construction only; nothing is
# fitted in this cell).
dtc = DecisionTreeClassifier(random_state=1)
lgr = LogisticRegression(verbose=1, random_state=0)
clf = GradientBoostingClassifier(verbose=1, random_state=0)
neigh = KNeighborsClassifier(n_neighbors=3)
gnb = GaussianNB()

In [39]:
# Fit the decision tree on the training split, then predict the held-out rows.
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

In [40]:
# Accuracy of the predictions against the held-out labels.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6841191066997518

In [47]:
# F1 of the same predictions, using average="weighted".
f1_score(y_true=y_test, y_pred=y_pred, average="weighted")

0.687473361346805