# Three Words Summary

Import Packages

In [1]:
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer

import gensim
from gensim.models import Word2Vec

import csv

import numpy as np

Read File and Set Up

In [2]:
punc_tokenizer = RegexpTokenizer(r'\w+')

words = []
with open("Cascadia_2018_form_340_responses.csv") as src_file:
    csv_reader = csv.reader(src_file, delimiter=',')
    for row in csv_reader:
        content = row[4]
        words.append(content)

words = words[1:]

Filter out the adjectives and nouns

In [3]:
def parse_token(str):
    return punc_tokenizer.tokenize(str.lower())

parsed_words = []
for w in words:
    temp = parse_token(w)
    parsed_words.extend(temp)

adj_words = []
n_words = []
for w in parsed_words:
    token = word_tokenize(w)
    _, tag = nltk.pos_tag(token)[0]
    if tag == "JJ":
        adj_words.append(token)
    elif tag == "NN":
        n_words.append(token)

Constructing word vector banks for adjectives and nouns

In [4]:
f = open('glove.6B.100d.txt', 'r')
lines = f.read().split('\n')
words_vectors_bank = {}
for l in lines:
    vectors = l.split()
    w = vectors[0]
    v = [float(i) for i in vectors[1:]]
    words_vectors_bank[w] = v

d = 100
adj_vec_matrix = np.zeros(d)
n_vec_matrix = np.zeros(d)
for w in adj_words:
    if w[0] in words_vectors_bank:
        adj_vec_matrix = np.vstack((adj_vec_matrix, words_vectors_bank[w[0]]))
adj_vec_matrix = adj_vec_matrix[1:]
for w in n_words:
    if w[0] in words_vectors_bank:
        n_vec_matrix = np.vstack((n_vec_matrix, words_vectors_bank[w[0]]))
n_vec_matrix = n_vec_matrix[1:]

print("Done generating vector matrices!")

def get_pos_vec_bank(p_tag):
    ret = {}
    for w in words_vectors_bank:
        token = word_tokenize(w)
        _, tag = nltk.pos_tag(token)[0]
        if tag == p_tag:
            ret[w] = words_vectors_bank[w]
    return ret

adj_word_vectors_bank = get_pos_vec_bank("JJ")
n_word_vectors_bank = get_pos_vec_bank("NN")

print("Done generating word vector banks for adj and noun!")

Done generating vector matrices!
Done generating word vector banks for adj and noun!


Finding K-means for adjectives and nouns

In [5]:
def get_Kmeans_matrix(m, k):
    N, d = np.shape(m)
    ret = np.random.randn(k, d)
    for i in range(500):
        sum = np.zeros((k, d))
        cnt = np.zeros(k)
        for j in range(N):
            data = m[j]
            diff = np.sum(np.square(ret - data), axis=1).reshape((1, k))
            sum[np.argmin(diff)] = sum[np.argmin(diff)] + data
            cnt[np.argmin(diff)] += 1
        for j in range(k):
            if cnt[j] != 0:
                sum[j] = sum[j] / cnt[j]
        ret = sum
    return ret

k_adj = 5
adj_kmeans_matrix = get_Kmeans_matrix(adj_vec_matrix, k_adj)
k_n = 5
n_kmeans_matrix = get_Kmeans_matrix(n_vec_matrix, k_n)

print("Done generating K-means matrices!")

Done generating K-means matrices!


Output the summarized words: finding 2 words that are most cosine-similar to each k-mean vector

In [6]:
def output_cos_words(word_vectors_bank, kmeans_matrix, k, num):
    for j in range(k):
        cosine_similarity_w1 = {}
        v1 = kmeans_matrix[j]
        for w in word_vectors_bank:
            v = word_vectors_bank[w]
            cos1 = np.dot(v, v1) / np.sqrt(np.sum(np.square(v)) * np.sum(np.square(v1)))
            cosine_similarity_w1[cos1] = w

        sorted_cos1 = sorted(cosine_similarity_w1, reverse=True)[:num]
        for i in range(0, num):
            print("   " + str(cosine_similarity_w1[sorted_cos1[i]]) + ' ' + str(sorted_cos1[i]))

num_adj = 2
num_n = 2
print(str(k_adj*num_adj) + " summarized adjectives are:")
output_cos_words(adj_word_vectors_bank, adj_kmeans_matrix, k_adj, num_adj)
print(str(k_n*num_n) + " summarized nouns are:")
output_cos_words(n_word_vectors_bank, n_kmeans_matrix, k_n, num_n)

10 summarized adjectives are:
   local 0.8856689514515084
   regional 0.8712964016473018
   inclusive 0.8099856756452893
   sensible 0.7573036691801338
   same 0.8400302926235728
   own 0.8126935754984307
   hippy 0.892779887924643
   spiritualistic 0.70272526837797
   sustainable 0.8751905637343818
   ecological 0.8643748437565358
10 summarized nouns are:
   freedom 0.847512151594694
   democracy 0.8103868936803786
   bioregional 0.8143149498762233
   bioregion 0.7678613207133711
   environmentalism 0.708959283848486
   socialism 0.6790859349680515
   way 0.8942560284709925
   time 0.8513820338878982
   secession 0.9833103918913542
   secede 0.6970632262229792
