# Task 2: Text Clustering

In [1]:
# useful imports and definitions
import os
import numpy as np
from sklearn.cluster import KMeans
import random
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
import string
from copy import deepcopy
corpusB_path = "./corpusB/"
corpusC_path = "./corpusC/"

## Define functions that can be used later

In [2]:
# read target words list
target_words = []
with open("./target-words.txt", "r") as f:
    words = f.readlines()

for target_word in words:
    target_words.append(target_word.strip("\n"))

print(target_words)

# build reversed target words
reversed_target_words = []
for target_word in target_words:
    reversed_target_words.append(target_word)
    reversed_target_words.append(target_word[::-1])

print(reversed_target_words)

['abstraction', 'actually', 'add', 'address', 'answer', 'argument', 'arguments', 'back', 'call', 'car', 'case', 'cdr', 'computer', 'course', 'dictionary', 'different', 'evaluator', 'function', 'general', 'got', 'idea', 'kind', 'lambda', 'machine', 'mean', 'object', 'operator', 'order', 'pair', 'part', 'particular', 'pattern', 'place', 'problem', 'process', 'product', 'program', 'reason', 'register', 'result', 'set', 'simple', 'structure', 'system', 'they', 'together', 'using', 'variable', 'why', 'zero']
['abstraction', 'noitcartsba', 'actually', 'yllautca', 'add', 'dda', 'address', 'sserdda', 'answer', 'rewsna', 'argument', 'tnemugra', 'arguments', 'stnemugra', 'back', 'kcab', 'call', 'llac', 'car', 'rac', 'case', 'esac', 'cdr', 'rdc', 'computer', 'retupmoc', 'course', 'esruoc', 'dictionary', 'yranoitcid', 'different', 'tnereffid', 'evaluator', 'rotaulave', 'function', 'noitcnuf', 'general', 'lareneg', 'got', 'tog', 'idea', 'aedi', 'kind', 'dnik', 'lambda', 'adbmal', 'machine', 'enihca

In [3]:
# read corpus words
def load_corpus(path, stem=False):
    stemmer = PorterStemmer()
    corpus = []
    files = os.listdir(path)
    for file in files:
        with open(path + file, "r") as f:
            # read whole file in a string
            content = f.read()
            # remove symbols and numbers
            content = content.replace("\n", " ")
            content = content.lower()
            content = content.translate(str.maketrans("", "", string.punctuation))
            for i in range(10):
                content = content.replace(str(i), "")
            content = content.split(" ")
            # remove stop words
            stop_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
            filtered_content = [w for w in content if not w in stop_words]

            # choose if stemming is needed
            if stem:
                stemmed_content = []
                for plural in filtered_content:
                    if plural not in target_words:
                        stemmed_content.append(stemmer.stem(plural))
                    else:
                        stemmed_content.append(plural)
                filtered_content = stemmed_content 
            corpus.append(filtered_content)

    return corpus


# get the vocabulary words of a corpus
def get_vocab_words(corpus):
    vocab_words = []
    for tokens in corpus:
        vocab_words.extend(tokens)
    vocab_words = list(set(vocab_words))

    return vocab_words

In [4]:
# build a target words vocabulary words matrix of a corpus
def build_target_vocab_matrix(corpus, target_words, vocab_words):
    vocab_dict = {}
    index_list = []
    for i in range(len(vocab_words)):
        index_list.append(i)
    vocab_dict = dict(zip(vocab_words, index_list))

    target_vocab_matrix = []
    for target_word in target_words:
        matrix_row = []
        for vocab_word in vocab_words:
            matrix_row.append(0)
        target_vocab_matrix.append(matrix_row)

    for i in range(len(target_words)):
        for context_tokens in corpus:
            num_oc = context_tokens.count(target_words[i])
            if num_oc > 0:
                for word in context_tokens:
                    index = vocab_dict[word]
                    if word != target_words[i]:
                        target_vocab_matrix[i][index] += num_oc
                    else:
                        target_vocab_matrix[i][index] += 1
    
    return np.array(target_vocab_matrix)

In [5]:
# apply kmeans cluster and return the label of each target word
def cluster_words(n_clusters, target_vocab_matrix):
    kmeans = KMeans(n_clusters=n_clusters).fit(target_vocab_matrix)

    return kmeans.labels_


In [6]:
# for each target word, randomly sample half of its occurances and reverse them
# return the reversed corpus
def build_randomly_reverse_corpus(corpus, target_words):
    reversed_corpus = deepcopy(corpus)
    for target_word in target_words:
        occur_list = []
        for k in range(len(reversed_corpus)):
            for i in range(len(reversed_corpus[k])):
                if reversed_corpus[k][i] == target_word:
                    occur_list.append([k, i])
        random_num = len(occur_list) // 2
        random_index_list = random.sample(occur_list, random_num)
        for index in random_index_list:
            reversed_corpus[index[0]][index[1]] = target_word[::-1]

    return reversed_corpus

In [7]:
# build corpus based on a smaller context window size
def build_corpus(context_size, corpus, target_words):
    new_corpus = []
    for words_array in corpus:
        for i in range(len(words_array)):
            if words_array[i] in target_words:
                if i-context_size >= 0:
                    slice_start = i-context_size
                else:
                    slice_start = 0
                if i+context_size+1 <= len(words_array):
                    slice_end = i+context_size+1
                else:
                    slice_end = len(words_array)
                context = words_array[slice_start:slice_end]
                new_corpus.append(context)
    return new_corpus

## (a) Calculate Target Words Vocabulary Words Matrix

In [8]:
# print matrix for corpusB as an example
corpusB = load_corpus(corpusB_path)
vocab_words = get_vocab_words(corpusB)
target_vocab_matrix = build_target_vocab_matrix(corpusB, target_words, vocab_words)
print("matrix shape: " + str(target_vocab_matrix.shape))
print(target_vocab_matrix)
result = cluster_words(50, target_vocab_matrix)
print(result)

matrix shape: (50, 5711)
[[23281    48     7 ...     0     0     0]
 [ 1320    22     2 ...     0     0     0]
 [21048    27     0 ...     0     0     0]
 ...
 [45364   120     5 ...     0     0     0]
 [    0     0     0 ...     0     0     0]
 [ 9386     7     2 ...     0     0     0]]
[ 4  1 34 23 44 21  0 45 25 35 39 14 33 47 48 29 10 22 26 49 18 37 17  9
 46 19 31  3 32 15 41  8 40 36  7 38 27 43  2 28 11 16 20  5  6 42 12 13
 24 30]


## (b) Apply Cluster on CorpusB with whole document
This could be the standard for (c) analysis to compare with

In [9]:
overall_accuracy = 0
corpusB = load_corpus(corpusB_path)
for k in range(5):
    reversed_corpusB = build_randomly_reverse_corpus(corpusB, target_words)
    vocab_words = get_vocab_words(reversed_corpusB)
    target_vocab_matrix = build_target_vocab_matrix(reversed_corpusB, reversed_target_words, vocab_words)

    result = cluster_words(50, target_vocab_matrix)
    print(result)

    cnt_correct = 0
    for i in range(50):
        index_list = list(np.where(result==i))[0]
        index_list.sort()
        for j in range(len(index_list)-1):
            if index_list[j] + 1 == index_list[j+1] and index_list[j] % 2 == 0:
                cnt_correct += 1
                break    

    print("Correct Pairs: " + str(cnt_correct))
    print("Accuracy: " + str(cnt_correct / 50))

    overall_accuracy += cnt_correct / 50

print("Overall Accuracy: " + str(overall_accuracy / 5))

[24 24  4 18 38 38 40 40  4 35 22 44 47 48 18 18 31 31 33 11 49 49 33 11
 26 26 18 18 18 18 41 41 30 12 21 21  2  2 18 18 35 35  4  4 15 15 17  3
 18 18 36 46  1  1  8 19 42 42 43 43 35 35 25  7  4  4 13 13  6 45 29 29
 16 39 18 18 23  9 28 28 27  0 34 34 37 37  5 32 13 13 35 35 14 14 20 10
 18 18 29 29]
Correct Pairs: 22
Accuracy: 0.44
[ 8  8 16 16 30 30 44 44  2  2 10 10 28 40 16 16 29 29 24 24  7  7  0 24
 39 49 35 35 35 35 25 25 22 12 18 18 17 37 35 35 47 47  2  2 27 27 23 19
 35 35 36 41 14 14 45  1  0  0 31 31 47 47 21  3  2  2 46 46 42  9 32 32
 43 28 16 35 20  5 15 15 11 26 17 17 34 34 48 13  7  7 47 47  4 38 33  6
 35 35 32 32]
Correct Pairs: 22
Accuracy: 0.44
[20 35  6  6 35 37 17 17 22 22 39 49 23 23  6  6  0  0 15 15 38 38 15 15
 44 46 30 30 30 30 32 32 29  9 28 36 25 25 30 30 22 22 22 22 48  0 26  4
 30 30 12 27 45 45  1 19  5  5 43 43 22 22 31  7 22  6  2  2 33 13 34 34
 47 14 30  6 24 10 27 27 18  8 40 40 42 42 21  3 38 38 22 22 11 11 41 16
 30 30 34 34]
Correct Pairs: 1

## (c) Change type of features to stemmed vocabulary words
Using stemmed corpusB to do clustering, it seems that there are not much difference compared with (b)

In [10]:
overall_accuracy = 0
corpusB = load_corpus(corpusB_path, True)
for k in range(5):
    reversed_corpusB = build_randomly_reverse_corpus(corpusB, target_words)
    vocab_words = get_vocab_words(reversed_corpusB)
    target_vocab_matrix = build_target_vocab_matrix(reversed_corpusB, reversed_target_words, vocab_words)

    result = cluster_words(50, target_vocab_matrix)
    
    cnt_correct = 0
    for i in range(50):
        index_list = list(np.where(result==i))[0]
        index_list.sort()
        for j in range(len(index_list)-1):
            if index_list[j] + 1 == index_list[j+1] and index_list[j] % 2 == 0:
                cnt_correct += 1
                break    

    print("Correct Pairs: " + str(cnt_correct))
    print("Accuracy: " + str(cnt_correct / 50))

    overall_accuracy += cnt_correct / 50

print("Overall Accuracy: " + str(overall_accuracy / 5))

Correct Pairs: 21
Accuracy: 0.42
Correct Pairs: 22
Accuracy: 0.44
Correct Pairs: 22
Accuracy: 0.44
Correct Pairs: 21
Accuracy: 0.42
Correct Pairs: 18
Accuracy: 0.36
Overall Accuracy: 0.41600000000000004


## (c) Change size of context to different window size contexts
The accuracy increases when the size of context increases, the maximum accuracy is around 72%

In [11]:
for size in range(5, 41, 5):
    # change context size
    corpusB = load_corpus(corpusB_path)
    corpusB = build_corpus(size, corpusB, target_words)
    overall_accuracy = 0
    for k in range(5):
        reversed_corpusB = build_randomly_reverse_corpus(corpusB, target_words)
        vocab_words = get_vocab_words(reversed_corpusB)
        target_vocab_matrix = build_target_vocab_matrix(reversed_corpusB, reversed_target_words, vocab_words)

        result = cluster_words(50, target_vocab_matrix)
        
        cnt_correct = 0
        for i in range(50):
            index_list = list(np.where(result==i))[0]
            index_list.sort()
            for j in range(len(index_list)-1):
                if index_list[j] + 1 == index_list[j+1] and index_list[j] % 2 == 0:
                    cnt_correct += 1
                    break    

        overall_accuracy += cnt_correct / 50

    print("Context size " + str(size) + " Overall Accuracy: " + str(overall_accuracy / 5))

Context size 5 Overall Accuracy: 0.10800000000000001
Context size 10 Overall Accuracy: 0.308
Context size 15 Overall Accuracy: 0.404
Context size 20 Overall Accuracy: 0.512
Context size 25 Overall Accuracy: 0.624
Context size 30 Overall Accuracy: 0.6799999999999999
Context size 35 Overall Accuracy: 0.712
Context size 40 Overall Accuracy: 0.728


## (c) training data on the quality of generated clusters
The quality of corpusC seems better than corpusB

### train cluster on copusC on whole documents

In [12]:
overall_accuracy = 0
corpusC = load_corpus(corpusC_path)
for k in range(5):
    reversed_corpusC = build_randomly_reverse_corpus(corpusC, target_words)
    vocab_words = get_vocab_words(reversed_corpusC)
    target_vocab_matrix = build_target_vocab_matrix(reversed_corpusC, reversed_target_words, vocab_words)

    result = cluster_words(50, target_vocab_matrix)
    
    cnt_correct = 0
    for i in range(50):
        index_list = list(np.where(result==i))[0]
        index_list.sort()
        for j in range(len(index_list)-1):
            if index_list[j] + 1 == index_list[j+1] and index_list[j] % 2 == 0:
                cnt_correct += 1
                break    

    print("Correct Pairs: " + str(cnt_correct))
    print("Accuracy: " + str(cnt_correct / 50))

    overall_accuracy += cnt_correct / 50

print("Overall Accuracy: " + str(overall_accuracy / 5))



Correct Pairs: 43
Accuracy: 0.86
Correct Pairs: 37
Accuracy: 0.74
Correct Pairs: 40
Accuracy: 0.8
Correct Pairs: 38
Accuracy: 0.76
Correct Pairs: 35
Accuracy: 0.7
Overall Accuracy: 0.772


### train on corpusC and using smaller context windows

In [13]:
for size in range(5, 41, 5):
    overall_accuracy = 0
    corpusC = load_corpus(corpusC_path)
    corpusC = build_corpus(size, corpusC, target_words)
    for k in range(5):
        reversed_corpusC = build_randomly_reverse_corpus(corpusC, target_words)
        vocab_words = get_vocab_words(reversed_corpusC)
        target_vocab_matrix = build_target_vocab_matrix(reversed_corpusC, reversed_target_words, vocab_words)

        result = cluster_words(50, target_vocab_matrix)
        
        cnt_correct = 0
        for i in range(50):
            index_list = list(np.where(result==i))[0]
            index_list.sort()
            for j in range(len(index_list)-1):
                if index_list[j] + 1 == index_list[j+1] and index_list[j] % 2 == 0:
                    cnt_correct += 1
                    break    

        overall_accuracy += cnt_correct / 50

    print("Context size " + str(size) + " Overall Accuracy: " + str(overall_accuracy / 5))

Context size 5 Overall Accuracy: 0.20800000000000002
Context size 10 Overall Accuracy: 0.808
Context size 15 Overall Accuracy: 0.9279999999999999
Context size 20 Overall Accuracy: 0.944
Context size 25 Overall Accuracy: 0.96
Context size 30 Overall Accuracy: 0.96
Context size 35 Overall Accuracy: 0.96
Context size 40 Overall Accuracy: 0.96


### train cluster on combination of corpusB and corpusC

In [14]:
corpusC = load_corpus(corpusC_path)
corpusBC = load_corpus(corpusB_path)
corpusBC.extend(corpusC)

overall_accuracy = 0
for k in range(5):
    reversed_corpusBC = build_randomly_reverse_corpus(corpusBC, target_words)
    vocab_words = get_vocab_words(reversed_corpusBC)
    target_vocab_matrix = build_target_vocab_matrix(reversed_corpusBC, reversed_target_words, vocab_words)
    
    result = cluster_words(50, target_vocab_matrix)
    
    cnt_correct = 0
    for i in range(50):
        index_list = list(np.where(result==i))[0]
        index_list.sort()
        for j in range(len(index_list)-1):
            if index_list[j] + 1 == index_list[j+1] and index_list[j] % 2 == 0:
                cnt_correct += 1
                break    

    print("Correct Pairs: " + str(cnt_correct))
    print("Accuracy: " + str(cnt_correct / 50))

    overall_accuracy += cnt_correct / 50

print("Overall Accuracy: " + str(overall_accuracy / 5))

Correct Pairs: 33
Accuracy: 0.66
Correct Pairs: 35
Accuracy: 0.7
Correct Pairs: 35
Accuracy: 0.7
Correct Pairs: 36
Accuracy: 0.72
Correct Pairs: 35
Accuracy: 0.7
Overall Accuracy: 0.696


### train cluster on combination of corpusB and corpusC using smaller context windows

In [15]:
for size in range(5, 41, 5):
    corpusB = load_corpus(corpusB_path)
    corpusBC = load_corpus(corpusC_path)
    corpusBC.extend(corpusB)
    corpusBC = build_corpus(size, corpusBC, target_words)

    overall_accuracy = 0
    for k in range(5):
        reversed_corpusBC = build_randomly_reverse_corpus(corpusBC, target_words)
        vocab_words = get_vocab_words(reversed_corpusBC)
        target_vocab_matrix = build_target_vocab_matrix(reversed_corpusBC, reversed_target_words, vocab_words)
        
        result = cluster_words(50, target_vocab_matrix)
        
        cnt_correct = 0
        for i in range(50):
            index_list = list(np.where(result==i))[0]
            index_list.sort()
            for j in range(len(index_list)-1):
                if index_list[j] + 1 == index_list[j+1] and index_list[j] % 2 == 0:
                    cnt_correct += 1
                    break    

        overall_accuracy += cnt_correct / 50

    print("Context size " + str(size) + " Overall Accuracy: " + str(overall_accuracy / 5))

Context size 5 Overall Accuracy: 0.156
Context size 10 Overall Accuracy: 0.5
Context size 15 Overall Accuracy: 0.664
Context size 20 Overall Accuracy: 0.784
Context size 25 Overall Accuracy: 0.8880000000000001
Context size 30 Overall Accuracy: 0.9200000000000002
Context size 35 Overall Accuracy: 0.8960000000000001
Context size 40 Overall Accuracy: 0.9200000000000002
