# Task 2: Word Clustering

In [9]:
# useful imports and others
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
import random
# directories containing the corpus text files; every file found in corpusB is
# read as one document further down (corpusC is not used in the cells shown here)
corpusB_path = "./corpusB/"
corpusC_path = "./corpusC/"

In [10]:
# read target words list (one word per line)
with open("./target-words.txt", "r") as f:
    words = f.readlines()

# Strip *all* surrounding whitespace and drop blank lines: stripping only "\n"
# would keep trailing spaces and turn an empty line into an empty target word,
# neither of which can ever match a token in the corpus.
target_words = [line.strip() for line in words if line.strip()]

print(target_words)

['abstraction', 'actually', 'add', 'address', 'answer', 'argument', 'arguments', 'back', 'call', 'car', 'case', 'cdr', 'computer', 'course', 'dictionary', 'different', 'evaluator', 'function', 'general', 'got', 'idea', 'kind', 'lambda', 'machine', 'mean', 'object', 'operator', 'order', 'pair', 'part', 'particular', 'pattern', 'place', 'problem', 'process', 'product', 'program', 'reason', 'register', 'result', 'set', 'simple', 'structure', 'system', 'they', 'together', 'using', 'variable', 'why', 'zero']


In [11]:
# read corpus words: one string per file in corpusB
corpus = []
# sorted() gives a stable document order; os.listdir() order is arbitrary
for file_name in sorted(os.listdir(corpusB_path)):
    file_path = os.path.join(corpusB_path, file_name)
    if not os.path.isfile(file_path):
        # skip sub-directories (e.g. checkpoint folders) instead of failing on open()
        continue
    with open(file_path, "r") as f:
        # Read the whole file and collapse every run of whitespace (including
        # newlines) to a single space. Simply deleting "\n" would glue the last
        # word of a line to the first word of the next one.
        content = " ".join(f.read().split())
    corpus.append(content)

# print(corpus)



In [12]:
def target_vocab_matrix(corpus, target_words):
    """Build a (target word x vocabulary term) co-occurrence matrix.

    Every document in ``corpus`` is converted to bag-of-words counts with a
    default ``CountVectorizer``. Entry ``[r, c]`` of the result is the dot
    product, taken over documents, of the count column of the r-th retained
    target word and the count column of vocabulary term ``c``.

    Args:
        corpus: iterable of document strings.
        target_words: iterable of words whose rows should be built.

    Returns:
        2-D ``numpy.ndarray`` with one row per target word that occurs in the
        fitted vocabulary (in ``target_words`` order) and one column per
        vocabulary term (in the vectorizer's column order).

    Note:
        Target words that are missing from the vocabulary are skipped, so row
        ``r`` corresponds to ``target_words[r]`` only when every target word
        occurs in ``corpus``. Callers that index the result by position in
        ``target_words`` rely on that condition.
    """
    # build doc-term matrix (kept sparse: shape (n_docs, n_vocab))
    vectorizer = CountVectorizer()
    doc_term = vectorizer.fit_transform(corpus)
    word_to_column = vectorizer.vocabulary_

    # columns of the target words that actually occur in the corpus
    target_columns = [
        word_to_column[word] for word in target_words if word in word_to_column
    ]
    if not target_columns:
        return np.zeros((0, doc_term.shape[1]), dtype=doc_term.dtype)

    # (n_targets, n_docs) @ (n_docs, n_vocab): a single sparse product yields the
    # same dot products as looping over every (target, vocabulary term) pair,
    # without densifying the doc-term matrix or calling get_feature_names(),
    # which newer scikit-learn releases no longer provide.
    target_counts = doc_term[:, target_columns]
    term_term_matrix = (target_counts.T @ doc_term).toarray()
    return term_term_matrix

In [13]:
def cluster_target_words(corpus, target_words, num_clusters, random_state=None):
    """Cluster target words with k-means on their co-occurrence rows.

    Args:
        corpus: iterable of document strings passed to ``target_vocab_matrix``.
        target_words: words to cluster.
        num_clusters: number of k-means clusters.
        random_state: forwarded to ``KMeans``. The default ``None`` keeps the
            previous behaviour (a different random initialisation per call);
            pass an int to make a run reproducible.

    Returns:
        Array of cluster labels, one per row of the matrix returned by
        ``target_vocab_matrix(corpus, target_words)``.
    """
    target_vocab = target_vocab_matrix(corpus, target_words)
    # n_init is pinned to 10 (the long-standing default) so results do not
    # silently shift with the library's newer "auto" default.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_state)
    return kmeans.fit(target_vocab).labels_

In [14]:
# build reversed target words: each word is immediately followed by its
# character-reversed form, i.e. [w0, w0[::-1], w1, w1[::-1], ...]
reversed_target_words = [
    variant
    for word in target_words
    for variant in (word, word[::-1])
]

print(reversed_target_words)
# print(len(reversed_target_words))


['abstraction', 'noitcartsba', 'actually', 'yllautca', 'add', 'dda', 'address', 'sserdda', 'answer', 'rewsna', 'argument', 'tnemugra', 'arguments', 'stnemugra', 'back', 'kcab', 'call', 'llac', 'car', 'rac', 'case', 'esac', 'cdr', 'rdc', 'computer', 'retupmoc', 'course', 'esruoc', 'dictionary', 'yranoitcid', 'different', 'tnereffid', 'evaluator', 'rotaulave', 'function', 'noitcnuf', 'general', 'lareneg', 'got', 'tog', 'idea', 'aedi', 'kind', 'dnik', 'lambda', 'adbmal', 'machine', 'enihcam', 'mean', 'naem', 'object', 'tcejbo', 'operator', 'rotarepo', 'order', 'redro', 'pair', 'riap', 'part', 'trap', 'particular', 'ralucitrap', 'pattern', 'nrettap', 'place', 'ecalp', 'problem', 'melborp', 'process', 'ssecorp', 'product', 'tcudorp', 'program', 'margorp', 'reason', 'nosaer', 'register', 'retsiger', 'result', 'tluser', 'set', 'tes', 'simple', 'elpmis', 'structure', 'erutcurts', 'system', 'metsys', 'they', 'yeht', 'together', 'rehtegot', 'using', 'gnisu', 'variable', 'elbairav', 'why', 'yhw',

In [15]:
# build randomly reversed corpusB: every occurrence of a target word is
# replaced by its reversed spelling with probability ~0.5
REVERSAL_SEED = 0  # arbitrary fixed seed so the reversed corpus is reproducible
reversal_rng = random.Random(REVERSAL_SEED)
target_word_set = set(target_words)  # O(1) membership test per token

reversed_corpus = []
for doc in corpus:
    tokens = doc.split(" ")
    # the random draw only happens for target tokens (short-circuit `and`)
    new_doc = " ".join(
        token[::-1]
        if token in target_word_set and reversal_rng.random() > 0.5
        else token
        for token in tokens
    )
    reversed_corpus.append(new_doc)

# print(len(reversed_corpus))



In [16]:
# Cluster the word / reversed-word list several times and report in how many
# clusters a word ends up together with its own reversal.
num_clusters = 50  # number of k-means clusters; also the accuracy denominator
num_runs = 5       # number of independent clustering runs to average over

overall_accuracy = 0
for run in range(num_runs):
    result = cluster_target_words(reversed_corpus, reversed_target_words, num_clusters)
    # reversed_target_words stores each word at an even index and its reversal
    # at the following odd index, so a pair is "correct" when both labels agree.
    # A cluster is counted once even if it contains several correct pairs.
    clusters_with_pair = {
        word_label
        for word_label, reversed_label in zip(result[0::2], result[1::2])
        if word_label == reversed_label
    }
    cnt_correct = len(clusters_with_pair)

    print("Correct Pairs: " + str(cnt_correct))
    print("Accuracy: " + str(cnt_correct / num_clusters))

    overall_accuracy += cnt_correct / num_clusters

print("Overall Accuracy: " + str(overall_accuracy / num_runs))

Correct Pairs: 7
Accuracy: 0.14
Correct Pairs: 6
Accuracy: 0.12
Correct Pairs: 6
Accuracy: 0.12
Correct Pairs: 6
Accuracy: 0.12
Correct Pairs: 5
Accuracy: 0.1
Overall Accuracy: 0.12


In [17]:
# build corpus
def build_corpus(context_size, corpus, target_words):
    """Extract a context window around every target-word occurrence.

    Each document is split on single spaces. For every token that is one of
    ``target_words`` the window consisting of up to ``context_size`` tokens on
    each side plus the token itself is joined back with spaces and collected.

    Args:
        context_size: number of tokens to keep on each side of the target.
        corpus: iterable of document strings.
        target_words: iterable of words that trigger a context window.

    Returns:
        List of context strings, one per target-word occurrence, in document
        order.
    """
    target_set = set(target_words)  # O(1) membership test per token
    new_corpus = []
    # A single pass over the documents: looping over target_words as well would
    # emit every context len(target_words) times.
    for doc in corpus:
        words_array = doc.split(" ")
        for position, word in enumerate(words_array):
            if word not in target_set:
                continue
            slice_start = max(position - context_size, 0)
            # +1 so the right side gets context_size tokens too; slicing past
            # the end of the list is safe in Python
            slice_end = position + context_size + 1
            new_corpus.append(" ".join(words_array[slice_start:slice_end]))

    return new_corpus
    

In [None]:
# Same evaluation as for the full documents, but each "document" is now a
# context window around one target-word occurrence.
context_size = 5   # tokens kept on each side of a target word
num_clusters = 50  # number of k-means clusters; also the accuracy denominator
num_runs = 5       # number of independent clustering runs to average over

corpusB_win5 = build_corpus(context_size, reversed_corpus, reversed_target_words)
overall_accuracy = 0
for run in range(num_runs):
    result = cluster_target_words(corpusB_win5, reversed_target_words, num_clusters)
    # reversed_target_words stores each word at an even index and its reversal
    # at the following odd index, so a pair is "correct" when both labels agree.
    # A cluster is counted once even if it contains several correct pairs.
    clusters_with_pair = {
        word_label
        for word_label, reversed_label in zip(result[0::2], result[1::2])
        if word_label == reversed_label
    }
    cnt_correct = len(clusters_with_pair)

    print("Correct Pairs: " + str(cnt_correct))
    print("Accuracy: " + str(cnt_correct / num_clusters))

    overall_accuracy += cnt_correct / num_clusters

print("Overall Accuracy: " + str(overall_accuracy / num_runs))