# Task 2 Generate Sparse Representations 

## Import Library

In [1]:
import nltk
import numpy as np
from os import listdir
from collections import OrderedDict

## Vocabulary list and segment vectors

### First we read the stop words into a set

In [2]:
# Load the stop-word list: one entry per line of the file, with only the
# line terminator removed (any other whitespace on the line is preserved).
with open('./stopwords_en.txt','r') as f:
    stopwords = {line.rstrip('\n') for line in f}

### Then we can generate the boundary vectors and vocabulary list by searching through all transcripts

#### Initialize tokenizer

We initialize a regex tokenizer object from the nltk package

In [3]:
# Token pattern: a run of word characters, optionally followed by one
# hyphen- or apostrophe-joined run (e.g. "well-known", "don't").
# Written as a raw string so "\w" reaches the regex engine verbatim instead
# of being treated as an (invalid) string escape sequence.
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+(?:[-']\w+)?")

#### Generate segment vector and vocabulary list

In [4]:
# For every transcript in ./txt_files/ (processed in sorted filename order):
#   * write one line "<name>:<flags>" to topic_segs.txt, where <flags> holds a
#     comma-separated '0'/'1' entry per text line (a line starting with a
#     space); '1' marks the text line immediately preceding a boundary line
#     made of ten asterisks;
#   * count, in `vocabs`, how many transcripts each non-stop-word token
#     appears in (each transcript contributes at most 1 per word).
vocabs = {}
with open('./topic_segs.txt','w') as seg:
    for file in sorted(listdir('./txt_files/')):
        word_set = set()  # distinct non-stop-word tokens seen in this transcript
        vec = []          # boundary flags, kept as strings so they can be joined directly
        with open('./txt_files/'+file,'r') as f:
            for line in f:
                if line.startswith(' '):
                    words = tokenizer.tokenize(line.lower())
                    word_set.update(word for word in words if word not in stopwords)
                    vec.append('0')
                elif line.rstrip('\n') == '**********':
                    # Topical boundary: flag the most recent text line. Skip the
                    # marker if no text line has been seen yet, since there is
                    # nothing to flag in that case.
                    if vec:
                        vec[-1] = '1'
        for word in word_set:
            vocabs[word] = vocabs.get(word, 0) + 1
        # file[:-4] drops the last four characters of the filename
        # (assumes a four-character extension such as ".txt" — TODO confirm).
        seg.write(file[:-4] + ':' + ','.join(vec) + '\n')
        


Then we can write the vocabularies into the txt file in the required format:

In [5]:
# Select the words whose count in `vocabs` is at most 132, sort them, and
# number them from 0 in sorted order.
# NOTE(review): 132 is a hard-coded cutoff — presumably derived from the
# corpus size; confirm where it comes from.
kept_words = [term for term, count in vocabs.items() if count <= 132]
final_vocabs = enumerate(sorted(kept_words))
with open('./vocab.txt','w') as out:
    # One "word:index" entry per line.
    for position, term in final_vocabs:
        out.write(term + ':' + str(position) + '\n')

## Sparse representation

### Now that we have the word list with indices, we can generate the sparse representation of each transcript

First, we need to load the vocabulary list into memory:

In [6]:
# Map each vocabulary word to its index, read back from the "word:index"
# lines of vocab.txt. Indices are kept as strings, exactly as stored.
vocab_index = {}
with open('./vocab.txt','r') as f:
    for line in f:
        # Strip only the line terminator: blindly dropping the last character
        # would truncate the index if the final line has no trailing newline.
        word, sep, index = line.rstrip('\n').partition(':')
        if not sep:
            continue  # blank or malformed line: no "word:index" pair to record
        vocab_index[word] = index

Then we can generate the sparse files:

In [7]:
# Write one sparse file per transcript into ./sparse_files/ under the same
# filename. Each text line (a line starting with a space) that contains at
# least one vocabulary word becomes one output line of comma-separated
# "index:count" pairs, ordered by first occurrence of each word on that line.
# NOTE(review): text lines with no vocabulary words produce no output line, so
# an output file may have fewer lines than the transcript has text lines —
# confirm this is intended.
for fname in sorted(listdir('./txt_files/')):
    with open('./sparse_files/' + fname, 'w') as out, open('./txt_files/' + fname, 'r') as src:
        for line in src:
            if not line.startswith(' '):
                continue
            counts = OrderedDict()
            for token in tokenizer.tokenize(line.lower()):
                if token not in vocab_index:
                    continue
                idx = vocab_index[token]
                counts[idx] = counts.get(idx, 0) + 1
            if not counts:
                continue
            pairs = [str(idx) + ':' + str(freq) for idx, freq in counts.items()]
            out.write(','.join(pairs) + '\n')