In [None]:
import os
import csv
import subprocess
import re
import random
import numpy as np

# Switch the working directory so that the data files opened by relative path in
# read_in_shakespeare() can be found.
# NOTE(review): this absolute path is machine-specific; adjust it before running elsewhere.
os.chdir("/Users/yezheng/Dropbox/Onedrive/CIS530/HW3/lyz")


def read_in_shakespeare():
  '''Loads the Shakespeare corpus plus the vocabulary and play-name lists.

  All three files are opened relative to the current working directory.

  Every row of the semicolon-delimited will_play_text.csv yields one tuple:
    tuple[0]: The name of the play (column 1 of the row).
    tuple[1]: The line of text (column 5 of the row) as a list of tokens.
      Characters other than ASCII letters, digits and whitespace are replaced
      by spaces, the text is split on whitespace and every token is lower-cased.

  Returns:
    tuples: A list of tuples in the above format, in file order.
    document_names: The stripped lines of play_names.txt.
    vocab: The stripped lines of vocab.txt.
  '''
  non_token_chars = re.compile(r'[^a-zA-Z0-9\s]')

  tuples = []
  with open('will_play_text.csv') as corpus_file:
    for fields in csv.reader(corpus_file, delimiter=';'):
      play_name, raw_line = fields[1], fields[5]
      cleaned_line = non_token_chars.sub(' ', raw_line)
      tuples.append((play_name, [piece.lower() for piece in cleaned_line.split()]))

  with open('vocab.txt') as vocab_file:
    vocab = list(map(str.strip, vocab_file))

  with open('play_names.txt') as names_file:
    document_names = list(map(str.strip, names_file))

  return tuples, document_names, vocab

def get_row_vector(matrix, row_id):
  '''Returns row `row_id` of `matrix`, spanning every column.'''
  every_column = slice(None)
  return matrix[(row_id, every_column)]

def get_column_vector(matrix, col_id):
  '''Returns column `col_id` of `matrix`, spanning every row.'''
  every_row = slice(None)
  return matrix[(every_row, col_id)]

#line_tuples, document_names, vocab=read_in_shakespeare()

def create_term_document_matrix(line_tuples, document_names, vocab):
  '''Returns a numpy array containing the term document matrix for the input lines.

  Inputs:
    line_tuples: A list of tuples, containing the name of the document and
    a tokenized line from that document.
    document_names: A list of the document names
    vocab: A list of the tokens in the vocabulary

  Let m = len(vocab) and n = len(document_names).

  Returns:
    td_matrix: A mxn integer numpy array with one row per vocabulary token and
        one column per document. A_ij contains the frequency with which
        vocab[i] occurs in document_names[j]. If a token or document name is
        listed more than once, its counts go to the last occurrence.

  Raises:
    KeyError: if a line refers to a document or token that is not listed in
        document_names / vocab.
  '''
  vocab_to_id = {token: row for row, token in enumerate(vocab)}
  docname_to_id = {name: col for col, name in enumerate(document_names)}

  # Size the matrix from the input lists (not the de-duplicated dicts) so every
  # id is a valid index, and allocate it directly: np.zeros always yields a 2-D
  # integer array, even when the vocabulary or the document list is empty.
  td_matrix = np.zeros((len(vocab), len(document_names)), dtype=int)

  for doc, tokens in line_tuples:
    col = docname_to_id[doc]
    for word in tokens:
      td_matrix[vocab_to_id[word], col] += 1
  return td_matrix

def create_term_context_matrix(line_tuples, vocab, context_window_size=1):
  '''Returns a numpy array containing the term context matrix for the input lines.

  Inputs:
    line_tuples: A list of tuples, containing the name of the document and
    a tokenized line from that document.
    vocab: A list of the tokens in the vocabulary
    context_window_size: How many positions to the left and to the right of a
    token count as its context.

  Let n = len(vocab).

  Returns:
    tc_matrix: A nxn integer numpy array where A_ij contains the frequency with
        which vocab[j] was found within context_window_size to the left or
        right of vocab[i] in any line in the tuples. Windows never cross line
        boundaries.

  Raises:
    KeyError: if a line contains a token that is not listed in vocab.
  '''
  vocab_to_id = {token: index for index, token in enumerate(vocab)}
  size = len(vocab)

  # Allocate the dense matrix directly instead of going through nested Python
  # lists; this also keeps the result 2-D and integer-typed for an empty vocab.
  tc_matrix = np.zeros((size, size), dtype=int)

  for _, tokens in line_tuples:
    # Translate the line to ids once rather than on every (target, context) pair.
    token_ids = [vocab_to_id[token] for token in tokens]
    line_length = len(token_ids)
    for position, target_id in enumerate(token_ids):
      window_start = max(position - context_window_size, 0)
      window_stop = min(position + context_window_size + 1, line_length)
      for neighbor in range(window_start, window_stop):
        if neighbor != position:  # a token is not its own context
          tc_matrix[target_id, token_ids[neighbor]] += 1
  return tc_matrix
  

def create_PPMI_matrix(term_context_matrix):
  '''Given a term context matrix, output a PPMI matrix.

  See section 15.1 in the textbook.

  PPMI_ij = max(0, log2( c_ij * total / (row_sum_i * col_sum_j) )), where c_ij
  is the co-occurrence count, row_sum_i the total count of word i, col_sum_j
  the total count of context j and total the sum of all counts.

  Input:
    term_context_matrix: A numpy array of non-negative co-occurrence counts
        whose rows are words and whose columns are contexts (nxn for a
        term-context matrix). The input is not modified.

  Returns: A float numpy array of the same shape, where A_ij is the positive
     point-wise mutual information between the ith word and the jth context.
     Cells with no co-occurrence (including whole rows or columns of zeros)
     are 0, so the result never contains inf or NaN.
  '''
  counts = np.asarray(term_context_matrix, dtype=float)

  # Word marginals come from the rows and context marginals from the columns;
  # the two only coincide when the matrix is symmetric.
  row_totals = counts.sum(axis=1, keepdims=True)
  col_totals = counts.sum(axis=0, keepdims=True)
  total = counts.sum()

  # Work in log space with in-place updates to keep the number of dense
  # temporaries low. log2(0) produces -inf/NaN for empty cells; those cells are
  # overwritten below, so the warnings are silenced deliberately.
  with np.errstate(divide='ignore', invalid='ignore'):
    ppmi = np.log2(counts)
    ppmi -= np.log2(row_totals)
    ppmi -= np.log2(col_totals)
    ppmi += np.log2(total)

  # A pair that was never observed carries no positive association.
  ppmi[~(counts > 0)] = 0.0
  np.maximum(ppmi, 0.0, out=ppmi)
  return ppmi
  

def create_tf_idf_matrix(term_document_matrix):
  '''Given the term document matrix, output a tf-idf weighted version.

  See section 15.2.1 in the textbook.

  Each raw count is multiplied by idf_i = ln(N / df_i), where N is the number
  of documents (columns) and df_i is the number of documents in which word i
  occurs at least once.

  Input:
    term_document_matrix: Numpy array where each column represents a document
    and each row, the frequency of a word in that document.

  Returns:
    A float numpy array with the same dimension as term_document_matrix, where
    A_ij is the count of word i in document j weighted by the inverse document
    frequency of word i. Words that occur in no document get an all-zero row.
  '''
  tf = np.asarray(term_document_matrix, dtype=float)
  num_docs = tf.shape[1]
  df = (tf > 0).sum(axis=1)

  # Only take the logarithm where df > 0: a word that appears in no document
  # would otherwise give N/0 = inf and turn its row into NaN (0 * inf).
  idf = np.zeros(tf.shape[0])
  seen = df > 0
  idf[seen] = np.log(num_docs / df[seen])

  return tf * idf[:, np.newaxis]

def compute_cosine_similarity(vector1, vector2):
  '''Computes the cosine similarity of the two input vectors.

  Inputs:
    vector1: A length-n numpy array
    vector2: A length-n numpy array

  Returns:
    The inner product of the vectors divided by the product of their Euclidean
    norms, or 0 when either vector has zero norm.
  '''
  squared_norm1 = np.inner(vector1, vector1)
  squared_norm2 = np.inner(vector2, vector2)

  # The cosine is undefined for a zero vector; report no similarity instead.
  if squared_norm1 == 0 or squared_norm2 == 0:
    return 0

  norm_product = np.sqrt(squared_norm1) * np.sqrt(squared_norm2)
  return 1.0 * np.inner(vector1, vector2) / norm_product

def compute_jaccard_similarity(vector1, vector2):
  '''Computes the (weighted) Jaccard similarity of the two input vectors.

  The similarity is sum_i min(v1_i, v2_i) / sum_i max(v1_i, v2_i). It is meant
  for vectors with non-negative entries.

  Inputs:
    vector1: A numpy array with n entries
    vector2: A numpy array with n entries

  Returns:
    A scalar similarity value; 0 when the denominator is zero.
  '''
  # np.sum reduces in native code and always yields a scalar, whereas the
  # builtin sum iterates element by element in Python.
  denominator = np.sum(np.maximum(vector1, vector2))
  if denominator == 0:
    return 0
  return 1.0 * np.sum(np.minimum(vector1, vector2)) / denominator

def compute_dice_similarity(vector1, vector2):
  '''Computes the (weighted) Dice similarity of the two input vectors.

  The similarity is 2 * sum_i min(v1_i, v2_i) / sum_i (v1_i + v2_i). It is
  meant for vectors with non-negative entries.

  Inputs:
    vector1: A numpy array with n entries
    vector2: A numpy array with n entries

  Returns:
    A scalar similarity value; 0 when the denominator is zero.
  '''
  # np.sum reduces in native code and always yields a scalar, whereas the
  # builtin sum iterates element by element in Python.
  denominator = np.sum(np.add(vector1, vector2)) / 2.0
  if denominator == 0:
    return 0
  return 1.0 * np.sum(np.minimum(vector1, vector2)) / denominator

def rank_plays(target_play_index, term_document_matrix, similarity_fn):
  ''' Orders every play by how similar it is to the target play.

  Inputs:
    target_play_index: The integer index (column) of the play that all plays
      are compared against.
    term_document_matrix: The term-document matrix as a mxn numpy array, one
      column per play.
    similarity_fn: Function used to compare the column vectors of two plays,
      e.g. compute_dice_similarity, compute_jaccard_similarity, or
      compute_cosine_similarity.

  Returns:
    A length-n list of integer play indices, ordered by decreasing similarity
    to the play indexed by target_play_index. Plays with equal similarity keep
    their ascending index order.
  '''
  play_count = term_document_matrix.shape[1]
  scores = [
      similarity_fn(term_document_matrix[:, target_play_index], term_document_matrix[:, play])
      for play in range(play_count)
  ]
  return sorted(range(play_count), key=scores.__getitem__, reverse=True)

def rank_words(target_word_index, matrix, similarity_fn):
  ''' Ranks the similarity of all of the words to the target word.

  Inputs:
    target_word_index: The row index of the word we want to compare all others
      against.
    matrix: Numpy matrix where the ith row represents a vector embedding of the
      ith word. It does not have to be square.
    similarity_fn: Function that should be used to compare vectors for two word
      embeddings. Either compute_dice_similarity, compute_jaccard_similarity, or
      compute_cosine_similarity.

  Returns:
    A list with one integer word index per row of matrix, ordered by decreasing
    similarity to the word indexed by target_word_index. Words with equal
    similarity keep their ascending index order.
  '''
  # Words live in the rows, so iterate over shape[0]; using the column count
  # would truncate the ranking or index out of range for non-square matrices.
  word_count = matrix.shape[0]
  similarities = [
      similarity_fn(matrix[target_word_index, :], matrix[other_word, :])
      for other_word in range(word_count)
  ]
  return sorted(range(word_count), key=similarities.__getitem__, reverse=True)

In [1]:
if __name__ == '__main__':
  # Demo driver: build every matrix, printing the cumulative time since T0 and
  # (in parentheses) the time of the stage that just finished, then print the
  # ten nearest plays / words under each similarity function.
  import time

  tuples, document_names, vocab = read_in_shakespeare()

  print('Computing term document matrix...')
  T0 = time.time()
  td_matrix = create_term_document_matrix(tuples, document_names, vocab)
  T1 = time.time()
  print("elapsed time", T1 - T0, "(", T1 - T0, ")")

  print('Computing tf-idf matrix...')
  tf_idf_matrix = create_tf_idf_matrix(td_matrix)
  T2 = time.time()
  print("elapsed time", T2 - T0, "(", T2 - T1, ")")

  print('Computing term context matrix...')
  tc_matrix = create_term_context_matrix(tuples, vocab, context_window_size=2)
  T3 = time.time()
  print("elapsed time", T3 - T0, "(", T3 - T2, ")")
  # Lift every count to at least 1e-6 before the PPMI step (keeps logarithms of
  # empty cells finite). Original author's timing note: 54s/40s.
  tc_matrix = np.maximum(tc_matrix, 1e-6)

  print('Computing PPMI matrix...')
  PPMI_matrix = create_PPMI_matrix(tc_matrix)
  T4 = time.time()
  print("elapsed time", T4 - T0, "(", T4 - T3, ")")

  # Nearest plays to one randomly chosen play, under each similarity function.
  random_idx = random.randint(0, len(document_names) - 1)
  similarity_fns = [compute_cosine_similarity, compute_jaccard_similarity, compute_dice_similarity]
  for sim_fn in similarity_fns:
    print('\nThe 10 most similar plays to "%s" using %s are:' % (document_names[random_idx], sim_fn.__qualname__))
    ranks = rank_plays(random_idx, td_matrix, sim_fn)
    for idx in range(10):
      print('%d: %s' % (idx + 1, document_names[ranks[idx]]))

  # Nearest words to 'juliet', first on the raw term-context counts and then on
  # the PPMI-weighted matrix; each pass ends with its own timing line.
  word = 'juliet'
  vocab_to_index = {token: position for position, token in enumerate(vocab)}
  word_rankings = (
      ('\nThe 10 most similar words to "%s" using %s on term-context frequency matrix are:', tc_matrix),
      ('\nThe 10 most similar words to "%s" using %s on PPMI matrix are:', PPMI_matrix),
  )
  stage_start = T4
  for header, embedding_matrix in word_rankings:
    for sim_fn in similarity_fns:
      print(header % (word, sim_fn.__qualname__))
      ranks = rank_words(vocab_to_index[word], embedding_matrix, sim_fn)
      for idx in range(10):
        print('%d: %s' % (idx + 1, vocab[ranks[idx]]))
    stage_end = time.time()
    print("elapsed time", stage_end - T0, "(", stage_end - stage_start, ")")
    stage_start = stage_end

'''
1: juliet
2: pined
3: waken
4: capulet
5: tybalt
6: muffled
7: fares
8: provost
9: wills
10: county
1: juliet
2: tybalt
3: capulet
4: silvia
5: lucio
6: nurse
7: romeo
8: montague
9: leonato
10: provost
1: juliet
2: tybalt
3: capulet
4: silvia
5: lucio
6: nurse
7: romeo
8: montague
9: leonato
10: provost
'''

Computing term document matrix...
elapsed time 0.681891679763794 ( 0.681891679763794 )
Computing tf-idf matrix...
elapsed time 0.7039790153503418 ( 0.02208733558654785 )
Computing term context matrix...
elapsed time 71.05132269859314 ( 70.3473436832428 )
Computing PPMI matrix...
elapsed time 197.34517192840576 ( 126.29384922981262 )


NameError: name 'elapsed_time' is not defined