## ASG5
          

Cluster the business sections (/blue/acg7849/share/BS) using Doc2Vec (50 clusters) in two ways:

Using a counter as the ‘tag’ (as in 5.1.5)

Using a counter as the ‘tag’, and the industry code as an additional tag (yield TaggedDocument(words=file_tokens, tags=[i, SIC]) where SIC is a string holding the tag (for example ‘1740’)

Extract the 4-digit SIC industry code from the annual report header (STANDARD INDUSTRIAL CLASSIFICATION).

Required: Evaluate whether adding the industry code as an additional tag improves the clustering. Use the standard deviation of profitability as a way to evaluate this. (Firms that are more similar, should have similar performance. Therefore, a better clustering would result in lower standard deviations for each cluster, relative to a worse clustering).

Do this for the filings for the year 2019 only. Calculate the standard deviation of performance for each cluster (use the year of CONFORMED END OF PERIOD, which are the first 4 digits of ‘date’ in summary.text).

For 50 clusters that means you will have 2 standard deviations for each cluster (one for each approach, with the extra SIC tag vs not adding the extra SIC tag). Use a t-test to test for a difference between the two sets of 50 standard deviations.

In [1]:
# all imports
import os as os
import pandas as pd
import glob
import csv
from pathlib import Path
import html, re
from w3lib.html import replace_entities
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from scipy.cluster import  hierarchy
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import gensim
stopWords = set(stopwords.words('english') )
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk import FreqDist
from nltk.collocations import *



# alternative way for files - piazza

In [3]:
with open('/blue/acg7849/share/BS/summary.txt') as f:
    files = [{k: v for k, v in row.items()}
        for row in csv.DictReader(f, skipinitialspace=True, delimiter="|")]

# files with length > 1000
files = [f for f in files if int(f["length"])> 1000 and (f["date"][0:4]) == '2019']
#print (files[0:10]) # a list of dict
len(files) # =14359 = 4604
#type(files) # list

# files with year 2019
#files = [f for f in files if (f["date"])== "20191231"]  # only date = 1231

4604

#### important variable: files - all files we needed

In [4]:
# generator function that returns one file at the time (just a string) - csv
# note that fit_transform expects one string for each file, so do not tokenize it
# this would be different for doc2vec, which expects a taggeddocument element

def readBSGen():
    for f in files[0:80]: # first 500 
        with open ( '/blue/acg7849/share/BS/item1/{}'.format(f['filename']) , encoding='utf-8') as b:
            BS = b.read()
        yield BS


In [5]:
# set up vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
# read documents using generator
tfidf = vectorizer.fit_transform( readBSGen(  ) )                # used readBSGen function and files w/yield
# dense
tfidf = tfidf.todense()                                   # this returns a matrix

In [6]:
print (tfidf)

[[0.         0.00466174 0.00678589 ... 0.         0.         0.        ]
 [0.0050993  0.02528593 0.         ... 0.         0.         0.        ]
 [0.         0.02800195 0.         ... 0.         0.         0.        ]
 ...
 [0.         0.08022802 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.00563861 0.00978607 0.         ... 0.         0.         0.        ]]


In [7]:
import numpy as np
#c = np.savetxt('geekfile1.txt', tfidf, delimiter =', ') #1500 files = 3.4G large

# codes from 515

In [6]:
# add some punctuation to string.punctuation
punc = string.punctuation + '“”'

# documents get tagged by an index (number), while filenames have different numbers
# keep track of this
fileIdToIndex = {} # given a fileId -> tag
indexToFileId=[] # given a tag -> fileId

class MyFiles(object):
    def __init__(self, dirname, tokens_only = False):
        self.dirname = dirname
        self.tokens_only = tokens_only
 
    def __iter__(self):
        for i, fname in enumerate(os.listdir(self.dirname)[0:200]):
        #for fname in os.listdir(self.dirname):
        # enumerate = return a list of tuples, iterate from start to end, [0:200] must be there
        # os.listdir = return index of a directory, input = directory address
        # this part enumerates the first 200 units in the index under dirname
        
            with open( os.path.join(self.dirname, fname), encoding='utf-8') as f:
                content = f.read()
            # filter
                #content = [f for f in content if int(f["length"])> 1000 and (f["date"][0:4]) == '2019']
            
            # grab id from filename
            myCounter = int (  re.findall(r'(\d*)\.txt', fname)[0] )
            # update 
            fileIdToIndex [ myCounter] = i
            indexToFileId.append( myCounter)
            #print('fname', fname, 'tag', myCounter)
            file_tokens = [x for x in word_tokenize(content) if x.isalpha() and x.lower() not in stopWords and x not in string.punctuation]
            
            if self.tokens_only == True:
                yield file_tokens
            else:
                yield TaggedDocument(words=file_tokens, tags=[i] )                    

In [7]:
# Hipergator
ffiles = MyFiles(r'/blue/acg7849/share/BS/item1/') # a memory-friendly iterator
# dirname = '/blue/acg7849/share/BS/item1/'?

In [8]:
print(ffiles)

<__main__.MyFiles object at 0x2af50fc50790>


In [9]:
# create a model, build vocabulary
model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=10)
model.build_vocab(ffiles)
# train it
model.train(ffiles, total_examples=model.corpus_count, epochs=model.epochs)

In [10]:
# Hipergator
def tokenizeFile(file_id):
    with open( r'/blue/acg7849/share/BS/item1/'+str(file_id)+'.txt', encoding='utf-8') as f:
            content = f.read()
    return ([x for x in word_tokenize(content) if x.isalpha() and x.lower() not in stopWords and x not in string.punctuation] )

In [34]:
t = tokenizeFile(267762)
inferred_vector = model.infer_vector( t )
# dv is short for docvecs
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
#sims = model.dv.most_similar([inferred_vector], topn=4)
sims

[(0, 0.9205102324485779),
 (36, 0.6702579855918884),
 (165, 0.6661437749862671),
 (170, 0.6392346024513245),
 (19, 0.6283066272735596),
 (174, 0.6180595755577087),
 (147, 0.6105557084083557),
 (2, 0.5762907862663269),
 (194, 0.5749393105506897),
 (131, 0.567370593547821),
 (196, 0.5535136461257935),
 (195, 0.5491869449615479),
 (141, 0.534377932548523),
 (83, 0.5232897400856018),
 (76, 0.5225411653518677),
 (122, 0.49498140811920166),
 (23, 0.4948655664920807),
 (50, 0.46032992005348206),
 (178, 0.4539501965045929),
 (54, 0.44672587513923645),
 (197, 0.4432719647884369),
 (69, 0.4347832500934601),
 (55, 0.4179377555847168),
 (192, 0.4177309274673462),
 (123, 0.41336366534233093),
 (49, 0.3947412967681885),
 (129, 0.3885799050331116),
 (119, 0.3856053352355957),
 (15, 0.37767598032951355),
 (11, 0.37748435139656067),
 (91, 0.3748438358306885),
 (96, 0.3704301416873932),
 (45, 0.3697308599948883),
 (56, 0.36824342608451843),
 (71, 0.3641180396080017),
 (185, 0.3579240143299103),
 (162, 0

In [27]:
# letter with filename 1.txt is the first letter, so tag is 0
similar_doc = model.docvecs.most_similar(0)
similar_doc

  similar_doc = model.docvecs.most_similar(0)


[(36, 0.8087785243988037),
 (165, 0.7963026762008667),
 (19, 0.7845235466957092),
 (170, 0.7723568677902222),
 (147, 0.7609975337982178),
 (174, 0.7606906890869141),
 (2, 0.7382687330245972),
 (196, 0.7235947251319885),
 (131, 0.7055116295814514),
 (194, 0.6995262503623962)]

In [28]:
print('number of documents', model.corpus_count)
print('model.docvecs', len(model.dv))

number of documents 200
model.docvecs 200


In [29]:
# Hipergator
# reread the files, and get the vector for each file
# feed vector into k-means algorithm to make clusters
wordLists = MyFiles(r'/blue/acg7849/share/BS/item1/', tokens_only = True) # a memory-friendly iterator
vectors = [ model.infer_vector( w ) for w in wordLists]
len(vectors)
vectors[0]

array([-2.27697149e-01, -5.57108355e+00, -8.76148939e-01, -3.13555098e+00,
        5.74351406e+00, -3.09132266e+00,  3.92001420e-01, -4.20823765e+00,
       -1.64134932e+00, -1.01604569e+00,  1.00201261e+00,  6.66401958e+00,
       -1.60218263e+00, -1.07662237e+00, -4.37635899e+00,  1.36437035e+00,
        3.61313486e+00, -3.64824319e+00,  2.21086666e-01, -4.05213308e+00,
       -3.29910254e+00,  2.12190766e-03,  4.83307242e-01,  9.45488870e-01,
        4.69715929e+00,  1.47742617e+00,  4.44642973e+00,  7.16185663e-03,
       -1.90026569e+00, -3.62248921e+00,  2.56235147e+00, -6.02982640e-01,
       -1.46367574e+00,  6.50530338e-01, -3.38752127e+00,  1.73800576e+00,
        1.15068877e+00, -7.62334883e-01, -1.39826822e+00,  2.98995185e+00,
       -2.31975842e+00,  1.67648923e+00, -8.86507690e-01, -1.84130120e+00,
       -1.80053151e+00,  8.79312381e-02, -1.10911393e+00, -1.52270377e+00,
        2.78991723e+00, -2.91873789e+00,  2.14391065e+00, -1.94369113e+00,
       -4.65059996e+00,  

In [30]:
from nltk.cluster import KMeansClusterer
num_clusters = 10
kclusterer = KMeansClusterer(num_clusters, distance=nltk.cluster.util.cosine_distance, repeats=25)
assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)

In [31]:
import collections
print(collections.Counter(assigned_clusters))

Counter({8: 55, 7: 29, 5: 23, 6: 20, 2: 18, 4: 17, 3: 11, 1: 11, 0: 9, 9: 7})


# asdsada

In [8]:
# clustering
threshold = 0.5
# Z is the cosine distance matrix
%time Z = hierarchy.linkage(tfidf,"average", metric="cosine")                              #%time=  time used for this step
%time len(Z)                                                                                # match B

CPU times: user 39.6 ms, sys: 29 µs, total: 39.6 ms
Wall time: 39.8 ms
CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.25 µs


79

In [9]:
# C are the clusters assigned
%time C = hierarchy.fcluster(Z, threshold, criterion="distance")
print(C)
len(C) #                                                                                 # match C
#max(C)

CPU times: user 329 µs, sys: 0 ns, total: 329 µs
Wall time: 333 µs
[35  3 30 44  5 56 58 52 32 19 22 49 42 71 65 58 53 62 63 66 17  8 10 60
 59 46 45 47 33 43 38 34 24 61 40  1 31 39 54  6 14 50 62 20 63  2 11 37
 15 55 41 23 12 57 69 36 62 21 64 19 51 70 10  9 27 16 13 26 18 48  7 67
 68 62 28 62  4 25 29 19]


80

In [10]:
#keep track of fileIds for letters in different clusters
ids = [ [] for i in range(100) ]
for i, clust in enumerate(C):
    # look up the filename for i (which is the tag), then add it to the list for that cluster
    ids[ clust - 1 ].append( indexToFileId[ i ]  )               #  list index out of range

# first two clusters
ids[0:2]

NameError: name 'indexToFileId' is not defined

In [None]:
for i, clust in enumerate(ids):
    content = ""
    for file in clust:
        with open( str(file) + ".txt", encoding='utf-8') as f:# No such file or directory: '266806.txt'
            content += f.read()
    file_tokens = [x.lower() for x in word_tokenize(content) if x.isalpha() and x.lower() not in stopWords and x not in string.punctuation]
    # now we can use nltk functions on the text
    fdist = FreqDist(file_tokens)
    print ('cluster', i+ 1, 'most common words in this letter', fdist.most_common(5) )