## Homework 1 - Finding similar items

#### Shingling / Shingles

A class Shingling that constructs k–shingles of a given length k (e.g., 10) from a given document, computes a hash value for each unique shingle and represents the document in the form of an ordered set of its hashed k-shingles.

In [135]:
class Shingling:

  def constructShingles(self, s, k=5):
    """
    Split a string into k equal sizes 
    """
    return set([s[i:(i+k)].lower() for i in range(0, len(s)-k+1)])

  def constructHashedShingles(self, text, k=5, hashBoundary=((2**32)-1)):
    
    shingles = self.constructShingles(text, k)
    hashed_shingles = [hash(shingle) % hashBoundary for shingle in shingles]
    return set(hashed_shingles)
  

if __name__ == "__main__":
  text1 = "editorial"
  text2 = "factorial"
  
  shing = Shingling()

  shingles = shing.constructShingles(text1, 5)
  print('editorial:', shingles)

  shingles1 = shing.constructHashedShingles(text1, 5)
  print('Hashed "editorial":', shingles1)

  shingles2 = shing.constructHashedShingles(text2, 5)
  print('Hashed "factorial":', shingles2)


editorial: {'itori', 'orial', 'edito', 'ditor', 'toria'}
Hashed "editorial": {3283703438, 133522000, 2128747985, 2344419152, 1122035573}
Hashed "factorial": {751242816, 3390532172, 3283703438, 2344419152, 879469552}


#### Compare Sets

A class CompareSets computes the Jaccard similarity of two sets of integers – two sets of hashed shingles.

In [137]:
import numpy as np

class CompareSets:

  def similarity(self, A, B):
    """
    Jaccardi similarity between two sets of hashed shingles.
    """
    return len(A & B) / len(A | B)

  def distance(self, A, B):
      """
      Jaccardi distance.
      """
      return 1-self.similarity(A, B)


if __name__ == "__main__":
  
  compSet = CompareSets()
  sim = compSet.similarity(shingles1, shingles2)

  print('Jaccardi Similarity:', sim)

Jaccardi Similarity: 0.25


#### Min Hashing

A class MinHashing that builds a minHash signature (in the form of a vector or a set) of a given length n from a given set of integers (a set of hashed shingles).

In [139]:
import numpy as np

class MinHashing:

  def constructCharMat(self, setList):
    union = set().union(*setList)
    return np.array([[int(e in s) for s in setList] for e in union])

  def createSigMatPer(self, setList, K=100):
    charMat = self.constructCharMat(setList)
    sigMat = []
    
    for i in range(K):
      perCharMat = np.take(charMat, np.random.RandomState(seed=i).permutation(charMat.shape[0]),axis=0)
      sigMatRow = [np.where(col == 1)[0][0] for col in perCharMat.T]
      sigMat.append(sigMatRow)

    return np.array(sigMat)

  def createSigMatHash(self, setList, K=100):
    maxID = (2**31)-1
    c = 4294967311 # Next prime > maxID

    a = np.random.RandomState(seed=1).randint(low=0, high=maxID, size=K, dtype=np.int64)
    b = np.random.RandomState(seed=2).randint(low=0, high=maxID, size=K, dtype=np.int64)

    sigMat = np.array([[np.min([(a[i] * e + b[i]) % c for e in s]) for s in setList] for i in range(K)])

    return sigMat

  def calcSigMatSim(self, vector1, vector2):
    return (np.sum(vector1 == vector2)) / len(vector1)
    

if __name__ == "__main__":
  
  print('Union:', set().union(*[shingles1, shingles2]))
  print('Shingles 1:', shingles1)
  print('Shingles 2:', shingles2)

  minHash = MinHashing()

  charMat = minHash.constructCharMat([shingles1, shingles2])
  print(charMat)
  
  import time

  # ----- Creating Signature Matrix with Permutations -----
  start = time.time()
  
  sigMatPer = minHash.createSigMatPer([shingles1, shingles2], 50000)
  print('SigMatSim Permutation:', sigMatPer)
  
  end = time.time()
  print('Time elapsed Permutation:', end - start)

  # ----- Creating Signature Matrix with Hashing -----

  start = time.time()
  
  sigMatHash = minHash.createSigMatHash([shingles1, shingles2], 50000)
  print('SigMatSim Hash:', sigMatHash)
  
  end = time.time()
  print('Time elapsed Hash:', end - start)



Union: {751242816, 3390532172, 3283703438, 133522000, 2128747985, 2344419152, 879469552, 1122035573}
Shingles 1: {3283703438, 133522000, 2128747985, 2344419152, 1122035573}
Shingles 2: {751242816, 3390532172, 3283703438, 2344419152, 879469552}
[[0 1]
 [0 1]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [0 1]
 [1 0]]
SigMatSim Permutation: [[1 0]
 [0 1]
 [0 1]
 ...
 [0 3]
 [1 0]
 [1 0]]
Time elapsed Permutation: 9.018193244934082
SigMatSim Hash: [[ 898991277 1265927949]
 [1166200107  550004930]
 [  25604440  920699938]
 ...
 [ 792924501 2207452454]
 [1000043151 1270047230]
 [  35499106   35499106]]
Time elapsed Hash: 1.0669276714324951


#### Compare Signatures

A class CompareSignatures estimates the similarity of two integer vectors – minhash signatures – as a fraction of components in which they agree.

In [141]:
import numpy as np

class CompareSignatures:

  def calcSigMatSim(self, vector1, vector2):
    return (np.sum(vector1 == vector2)) / len(vector1)
    

if __name__ == "__main__":
  
  compSig = CompareSignatures()
  
  print('Signature Matrix Simimilarity Permutation:', compSig.calcSigMatSim(sigMatPer[:,0], sigMatPer[:,1]))
  print('Signature Matrix Simimilarity Hash:', compSig.calcSigMatSim(sigMatHash[:,0], sigMatHash[:,1]))



Signature Matrix Simimilarity Permutation: 0.24568
Signature Matrix Simimilarity Hash: 0.25122


#### Locality-Sensitive Hashing

A class LSH that implements the LSH technique: given a collection of minhash signatures (integer vectors) and a similarity threshold t, the LSH class (using banding and hashing) finds candidate pairs of signatures agreeing on at least a fraction t of their components.

In [275]:
import numpy as np
import math

class LSH:

  def estimateBandR(self, sigMat, simThresh=0.8):
    # rangeMax = int(len(sigMat)/math.log(len(sigMat)))
    rangeMax = len(sigMat)
    brSolutions = [[b,r] for b in range(rangeMax) for r in range(rangeMax) if (b * r)==len(sigMat)]

    similarities = [(1/ab[0])**(1/ab[1]) for ab in brSolutions]

    closestFit = min(similarities, key=lambda x:abs(x-simThresh))
    index = similarities.index(closestFit)
    return brSolutions[index][0], brSolutions[index][1]

  def findCandidatePairs(self, sigMat, simThresh=0.80, bands=None, k=10**6):

    if bands is None:
      b, r = self.estimateBandR(sigMat, simThresh)

    assert len(sigMat) % b == 0

    candidate_pairs = []

    # print(sigMat)

    # print(sigMat[0:20,0])

    for i in range(sigMat.shape[1]):
      for j in range(sigMat.shape[1]):
        if i == j:
          continue
        
        for n in range(b):
          hashBand1 = hash(frozenset(sigMat[n*r:(n+1)*r,i])) % k
          hashBand2 = hash(frozenset(sigMat[n*r:(n+1)*r,j])) % k

          if hashBand1 == hashBand2:
            # print((i,j))
            if (i,j) not in candidate_pairs and (j,i) not in candidate_pairs:
              candidate_pairs.append((i,j))
              # print((i,j))
            break


    print(sigMat.shape)
    
    print(b, r)
    print((1/b)**(1/r))
    return candidate_pairs

    

if __name__ == "__main__":
  
  lsh = LSH()
  lsh.findCandidatePairs(sigMatHash, 0.10)



(400, 4)
200 2
0.07071067811865475


#### Testing

In [202]:
textMinHashGuide = 'In this post, Im providing a brief tutorial, along with some example Python code, for applying the MinHash algorithm to compare a large number of documents to one another efficiently. I first learned about this topic through Stanfords Mining of Massive Datasets (“MMDS”) course available for free on Coursera here (Update 11/5/19 - Its no longer on Coursera, but still freely available here). Whats especially great about that course is that the authors also provide their textbook online for free! You can find the textbook here, with a separate PDF file for each chapter. Chapter 3 covers the MinHash algorithm, and Id refer you to that text as a more complete discussion of the topic. On to the tutorial! There is an interesting computing problem that arises in a number of contexts called “set similarity”. Lets say you and I are both subscribers to Netflix, and weve each watched roughly 100 movies on Netflix. The list of movies Ive seen is a set, and the list of movies youve seen is another set. To measure the similarity between these two sets, you can use the Jaccard Similarity, which is given by the intersection of the sets divided by their union. That is, count the number of movies weve both seen, and divide that by the total number of unique movies that weve both collectively seen. If weve each watched exactly 100 movies, and 50 of those were seen by both of us, then the intersection is 50 and the union is 150, so our Jaccard Similarity is 1/3. What seems to be the more common application of “set similarity” is the comparison of documents. One way to represent a document would be to parse it for all of its words, and represent the document as the set of all unique words it contains. In practice, youd hash the words to integer IDs, and then maintain the set of IDs present in the document. By representing the documents as sets of words, you could then use the Jaccard Similarity as a measure of how much overlap there is between two documents. Its important to note that were not actually extracting any semantic meaning of the documents here, were simply looking at whether they contain the same words. This technique of comparing documents probably wont work as well, for example, for comparing documents that cover similar concepts but are otherwise completely unique. Instead, the applications of this technique are found where theres some expectation that the documents will specifically contain a lot of the same words. One example is aggregating news articles. When the Associated Press releases an article about a particular event, many news agencies will take the AP article, perhaps modify it some, and publish it on their website. A news aggregator needs to recognize that a group of articles are really all based on the same AP article about one particular story. Comparing the web pages using this “similar sets” approach is one way to accomplish this. Another example is detecting plagiarism. The dataset used in my example code is a large collection of articles, some of which are plagiarisms of each other (where theyve been just slightly modified). You might say that these are all applications of “near-duplicate” detection. A small detail here is that it is more common to parse the document by taking, for example, each possible string of three consecutive words from the document (e.g., “A small detail”, “small detail here”, “detail here is”, etc.) and hashing these strings to integers. This retains a little more of the document structure than just hashing the individual words. This technique of hashing substrings is referred to as “shingling”, and each unique string is called a “shingle”. Another shingling technique thats described in the Mining of Massive Datasets textbook is k-shingles, where you take each possible sequence of k characters. Im not clear on the motivation of this approach—it may have to do with the fact that it always produces strings of length k, whereas the three-word approach produces variable length strings. In the example code, Im using three-word shingles, and it works well. Problem scale So far, this all sounds pretty straight forward and manageable. Where it gets interesting is when you look at the compute requirements for doing this for a relatively large number of documents. Let’s say you have a large collection of documents, and you want to find all of the pairs of documents that are near-duplicates of each other. You’d do this by calculating the Jaccard similarity between each pair of documents, and then selecting those with a similarity above some threshold. To compare each document to every other document requires a lot of comparisons! It’s not quite N-squared comparisons, since that would include doing a redundant comparison of ‘a’ to ‘b’ and ‘b’ to ‘a’, as well as comparing every document to itself. The number of comparisons required is given by the following formula, which is pronounced “N-choose-2” N-Choose-2 Eq As noted in the equation, a good approximation is N^2 / 2 (This is approximation is equivalent to comparing each document pair only once, but also needlessly comparing each document to itself). Lets say we have a collection of 1 million documents, and that on average, a PC can calculate the Jaccard similarity between two sets in 1ms per pair. First, let’s calculate the rough number of comparisons required: 1M doc comparisons Next, the amount of time required: 1M doc comparisons time 16 years of compute time! Good luck with that. You’d need 1,000 servers just to get the compute time down to a week. But there’s a better way… MinHash Signatures The MinHash algorithm will provide us with a fast approximation to the Jaccard Similarity between two sets. For each set in our data, we are going to calculate a MinHash signature. The MinHash signatures will all have a fixed length, independent of the size of the set. And the signatures will be relatively short—in the example code, they are only 10 components long. To approximate the Jaccard Similarity between two sets, we will take their MinHash signatures, and simply count the number of components which are equal. If you divide this count by the signature length, you have a pretty good approximation to the Jaccard Similarity between those two sets. We can compare two MinHash signatures in this way much quicker than we can calculate the intersection and union between two large sets. This is partly because the MinHash signatures tend to be much shorter than the number of shingles in the documents, and partly because the comparison operation is simpler. In the example code, we have a collection of 10,000 articles which contain, on average, 250 shingles each. Computing the Jaccard similarities directly for all pairs takes 20 minutes on my PC, while generating and comparing the MinHash signatures takes only about 2 minutes and 45 seconds. MinHash Algorithm The MinHash algorithm is actually pretty easy to describe if you start with the implementation rather than the intuitive explanation. The key ingredient to the algorithm is that we have a hash function which takes a 32-bit integer and maps it to a different integer, with no collisions. Put another way, if you took the numbers 0 – (2^32 – 1) and applied this hash function to all of them, you’d get back a list of the same numbers in random order. To demystify it a bit, here is the definition of the hash function, which takes an input integer ‘x’: Random Hash Eq The coefficients a and b are randomly chosen integers less than the maximum value of x. c is a prime number slightly bigger than the maximum value of x. For different choices of a and b, this hash function will produce a different random mapping of the values. So we have the ability to “generate” as many of these random hash functions as we want by just picking different values of a and b. So here’s how you compute the MinHash signature for a given document. Generate, say, 10 random hash functions. Take the first hash function, and apply it to all of the shingle values in a document. Find the minimum hash value produced (hey, “minimum hash”, that’s the name of the algorithm!) and use it as the first component of the MinHash signature. Now take the second hash function, and again find the minimum resulting hash value, and use this as the second component. And so on. So if we have 10 random hash functions, we’ll get a MinHash signature with 10 values. We’ll use the same 10 hash functions for every document in the dataset and generate their signatures as well. Then we can compare the documents by counting the number of signature components in which they match. That’s it!'
textMinHashWiki = 'Motivation The task of finding nearest neighbours is very common. You can think of applications like finding duplicate or similar documents, audio/video search. Although using brute force to check for all possible combinations will give you the exact nearest neighbour but it’s not scalable at all. Approximate algorithms to accomplish this task has been an area of active research. Although these algorithms don’t guarantee to give you the exact answer, more often than not they’ll be provide a good approximation. These algorithms are faster and scalable. Locality sensitive hashing (LSH) is one such algorithm. LSH has many applications, including: Near-duplicate detection: LSH is commonly used to deduplicate large quantities of documents, webpages, and other files. Genome-wide association study: Biologists often use LSH to identify similar gene expressions in genome databases. Large-scale image search: Google used LSH along with PageRank to build their image search technology VisualRank. Audio/video fingerprinting: In multimedia technologies, LSH is widely used as a fingerprinting technique A/V data. In this blog, we’ll try to understand the workings of this algorithm. General Idea LSH refers to a family of functions (known as LSH families) to hash data points into buckets so that data points near each other are located in the same buckets with high probability, while data points far from each other are likely to be in different buckets. This makes it easier to identify observations with various degrees of similarity. Finding similar documents Let’s try to understand how we can leverage LSH in solving an actual problem. The problem that we’re trying to solve: Goal: You have been given a large collections of documents. You want to find “near duplicate” pairs. In the context of this problem, we can break down the LSH algorithm into 3 broad steps: Shingling Min hashing Locality-sensitive hashing Don’t read much into the figure for now. It’s just to give you the idea of the process flow. We’ll discuss each step in detail. Shingling In this step, we convert each document into a set of characters of length k (also known as k-shingles or k-grams). The key idea is to represent each document in our collection as a set of k-shingles. For ex: One of your document (D): “Nadal”. Now if we’re interested in 2-shingles, then our set: {Na, ad, da, al}. Similarly set of 3-shingles: {Nad, ada, dal}. Similar documents are more likely to share more shingles Reordering paragraphs in a document of changing words doesn’t have much affect on shingles k value of 8–10 is generally used in practice. A small value will result in many shingles which are present in most of the documents (bad for differentiating documents) Jaccard Index We’ve a representation of each document in the form of shingles. Now, we need a metric to measure similarity between documents. Jaccard Index is a good choice for this. Jaccard Index between document A & B can be defined as: It’s also known as intersection over union (IOU). Suppose A: “Nadal” and B: “Nadia”, then 2-shingles representation will be: A: {Na, ad, da, al} and B: {Na, ad, di, ia}. Jaccard Index = 2/6 More number of common shingles will result in bigger Jaccard Index and hence more likely that the documents are similar. Let’s discuss 2 big issues that we need to tackle: Time complexity Now you may be thinking that we can stop here. But if you think about the scalability, doing just this won’t work. For a collection of n documents, you need to do n*(n-1)/2 comparison, basically O(n²). Imagine you have 1 million documents, then the number of comparison will be 5*10¹¹ (not scalable at all!). Space complexity The document matrix is a sparse matrix and storing it as it is will be a big memory overhead. One way to solve this is hashing. Hashing The idea of hashing is to convert each document to a small signature using a hashing function H. Suppose a document in our corpus is denoted by d. Then: H(d) is the signature and it’s small enough to fit in memory If similarity(d1,d2) is high then Probability(H(d1)==H(d2)) is high If similarity(d1,d2) is low then Probability(H(d1)==H(d2)) is low Choice of hashing function is tightly linked to the similarity metric we’re using. For Jaccard similarity the appropriate hashing function is min-hashing. Min hashing This is the critical and the most magical aspect of this algorithm so pay attention: Step 1: Random permutation (π) of row index of document shingle matrix. Step 2: Hash function is the index of the first (in the permuted order) row in which column C has value 1. Do this several time (use different permutations) to create signature of a column. Min-hash property The similarity of the signatures is the fraction of the min-hash functions (rows) in which they agree. So the similarity of signature for C1 and C3 is 2/3 as 1st and 3rd row are same. Expected similarity of two signatures is equal to the Jaccard similarity of the columns. The longer the signatures, the lower the error In the below example you can see this to some extent. There is difference as we have signatures of length 3 only. But if increase the length the 2 similarities will be closer. So using min-hashing we have solved the problem of space complexity by eliminating the sparseness and at the same time preserving the similarity. In actual implementation their is a trick to create permutations of indices which I’ll not cover but you can check this video around 15:52. Min-hash implementation Locality-sensitive hashing Goal: Find documents with Jaccard similarity of at least t The general idea of LSH is to find a algorithm such that if we input signatures of 2 documents, it tells us that those 2 documents form a candidate pair or not i.e. their similarity is greater than a threshold t. Remember that we are taking similarity of signatures as a proxy for Jaccard similarity between the original documents. Specifically for min-hash signature matrix: Hash columns of signature matrix M using several hash functions If 2 documents hash into same bucket for at least one of the hash function we can take the 2 documents as a candidate pair Now the question is how to create different hash functions. For this we do band partition.'
textUkraineArticle1 = 'US National Security Adviser Jake Sullivan has confirmed communication channels between Washington and Moscow remain open. It comes as the White House refuses to deny reports that Mr Sullivan has been leading talks with Russia to prevent a nuclear escalation in Ukraine. Speaking in New York, Mr Sullivan said it was "in the interests" of the US to maintain contact with the Kremlin. But he insisted officials were "clear-eyed about who we are dealing with". The Wall Street Journal reports that Mr Sullivan has held confidential discussions with his Russian counterpart, Security Council secretary Nikolai Patrushev, and senior Kremlin foreign policy aide Yuri Ushakov, over the past several months. Senior officials told the paper the men had discussed ways to guard against the risk of nuclear escalation in the war in Ukraine, but had not engaged in any negotiations around ways to end the conflict. Last month, Mr Sullivan said any use of nuclear weapons would have "catastrophic consequences for Russia". He told the US broadcaster NBC that senior officials had "spelled out" the scope of the potential US response in private discussions with Russian officials. US National Security Council spokeswoman Adrienne Watson refused to confirm the story, telling the paper that "people claim a lot of things", while Kremlin spokesperson Dmitry Peskov accused Western newspapers of "publishing numerous hoaxes". But White House press secretary Karin Jean-Pierre said on Monday that the United States reserved the right to hold talks with Russia. And Mr Sullivan - who is said to be one of the most senior advisers to US President Joe Biden still pushing for discussions with Russia - said maintaining contact with Moscow was in the "interests of every country who is affected by this conflict". Last week, the Washington Post reported that senior US officials were urging Kyiv to signal an openness to hold negotiations with Russia and drop their public refusal to discuss an end to the war while President Vladimir Putin remained in power. But Mr Sullivan told a public event in New York that the Biden administration had "an obligation to pursue accountability" and pledged to work with international partners to "hold the perpetrators of grave and grotesque war crimes in Ukraine responsible for what they have done". "I was just in Kyiv on Friday. and I had the opportunity to meet with President [Volodymyr] Zelensky and my counterpart Andriy Yermak, with the military leadership and also to get a briefing on just what level of death and devastation has been erupted by Putins war on that country," Mr Sullivan said. Concerns have been heightened in recent months that Russia could resort to using nuclear weapons in a desperate attempt to defend four regions of eastern and southern Ukraine that it illegally annexed. Meanwhile, Ukraine has invoked its war-time martial laws to take control of the assets of five strategically important companies. Some of the companies - which include two energy companies and firms that make engines, vehicles and transformers - are linked to oligarch Vyacheslav Bohuslayev, who was arrested on suspicion of collaborating with Russia. President Zelensky said the move would help Ukraines defence sector meet the needs of the military, which is currently engaged in counteroffensives in southern and eastern Ukraine.'
textUkraineArticle2 = 'US President Joe Biden has warned that the risk of a nuclear "Armageddon" is at its highest level for 60 years. Mr Biden said that Russian President Vladimir Putin was "not joking" when he warned that Moscow would use "all means we have" to defend Russian territory. Mr Putin has also said that the US created a "precedent" by using nuclear weapons in World War Two. But analysts suggest Mr Putins words should probably be interpreted as a warning to other countries not to escalate their involvement in Ukraine, rather than signalling any desire to use nuclear weapons. Nuclear weapons have existed for almost 80 years and many countries see them as a deterrent that continues to guarantee their national security. How many nuclear weapons does Russia have? All figures for nuclear weapons are estimates but, according to the Federation of American Scientists, Russia has 5,977 nuclear warheads - the devices that trigger a nuclear explosion - though this includes about 1,500 that are retired and set to to be dismantled. Of the remaining 4,500 or so, most are considered strategic nuclear weapons - ballistic missiles, or rockets, which can be targeted over long distances. These are the weapons usually associated with nuclear war. Graphic showing estimated number of Russias strategic nuclear warheads The rest are smaller, less destructive nuclear weapons for short-range use on battlefields or at sea. But this does not mean Russia has thousands of long-range nuclear weapons ready to go. Experts estimate around 1,500 Russian warheads are currently "deployed", meaning sited at missile and bomber bases or on submarines at sea. How does this compare with other countries? Nine countries have nuclear weapons: China, France, India, Israel, North Korea, Pakistan, Russia, the US and the UK. Comparison of the estimated number of warheads held each of the nine nuclear-armed countries. China, France, Russia, the US and the UK are also among 191 states signed up to the Treaty on the Non-Proliferation of Nuclear Weapons (NPT). Under the agreement, they have to reduce their stockpile of nuclear warheads and, in theory, are committed to their complete elimination. And it has reduced the number of warheads stored in those countries since the 1970 and 80s. India, Israel and Pakistan never joined the NPT - and North Korea left in 2003. Israel is the only country of the nine never to have formally acknowledged its nuclear programme - but it is widely accepted to have nuclear warheads. Ukraine has no nuclear weapons and, despite accusations by President Putin, there is no evidence it has attempted to acquire them. line War in Ukraine: More coverage IN-DEPTH: Biden says nuclear risk highest for 60 years ANALYSIS: Putins dream of victory slipping away ON THE GROUND: Ukraine war in maps READ MORE: Full coverage of the crisis line How destructive are nuclear weapons? Nuclear weapons are designed to cause maximum devastation. The extent of the destruction depends on a range of factors, including: the size of the warhead how high above the ground it detonates the local environment Graphic showing damage zones from nuclear blast But even the smallest warhead could cause huge loss of life and lasting consequences. The bomb that killed up to 146,000 people in Hiroshima, Japan, during World War Two, was 15 kilotons. And nuclear warheads today can be more than 1,000 kilotons. Little is expected to survive in the immediate impact zone of a nuclear explosion. After a blinding flash, there is a huge fireball and blast wave that can destroy buildings and structures for several kilometres. What does nuclear deterrent mean and has it worked? The argument for maintaining large numbers of nuclear weapons has been having the capacity to completely destroy your enemy would prevent them from attacking you. The most famous term for this became mutually assured destruction (Mad). Though there have been many nuclear tests and a constant increase in their technical complexity and destructive power, nuclear weapons have not been used in an armed confrontation since 1945. Russian policy also acknowledges nuclear weapons solely as a deterrent and lists four cases for their use: the launch of ballistic missiles attacking the territory of the Russian Federation or its allies the use of nuclear weapons or other types of weapons of mass destruction against the Russian Federation or its allies an attack on critical governmental or military sites of the Russian Federation that threatens its nuclear capability aggression against the Russian Federation with the use of conventional weapons when the very existence of the state is in jeopardy line The use of nuclear weapons is far from likely Analysis box by Gordon Corera, security correspondent The shadow of nuclear weapons has hung over this conflict from its earliest days - and that has been a deliberate choice on the part of Vladimir Putin. He has raised their use at moments when he has been on the back foot - for instance after the failure of his initial February plan to quickly overthrow the Ukrainian government and now again when a Ukrainian offensive has driven his forces back, His hope will be that a reminder of the devastating power of these weapons will intimidate and deter his opponents and force them to rethink how far they are willing to push. There is also a domestic motive - the Russian population will be worried by the partial mobilisation and Putins own claims that Nato is somehow threatening Russia itself. Talking about nuclear weapons is a way of reassuring domestic opinion that despite this dark turn, the country remains capable of defending itself. Russian military doctrine says nuclear weapons will only be used if the Russian state itself is threatened. It was notable that Putin framed their use in a defensive sense responding to what he claimed were Western nuclear threats. His reference to this not being a bluff referred to a situation when Russias territorial integrity was threatened. An important question is how far Russia sees its territory extending after the upcoming referenda in Ukrainian territory. All of this suggests that the use of nuclear weapons is far from imminent or even likely. While the possibility of their use can not be dismissed, especially if Putin feels the security of the state threatened, the response from the West for the moment will likely be to watch closely Russias actual behaviour rather than the rhetoric and to remain focused on their strategy.'


In [296]:
shing = Shingling()

shingles1 = shing.constructHashedShingles(textMinHashGuide, 3)
shingles2 = shing.constructHashedShingles(textMinHashWiki, 3)
shingles3 = shing.constructHashedShingles(textUkraineArticle1, 3)
shingles4 = shing.constructHashedShingles(textUkraineArticle2, 3)

vocabulary = shingles1.union(shingles2).union(shingles3).union(shingles4)

print(len(shingles1))
print(len(shingles2))
print(len(shingles3))
print(len(shingles4))

1723
1581
1246
1653


In [297]:
minHash = MinHashing()

import time

# ----- Creating Signature Matrix with Hashing -----

start = time.time()

sigMatHash = minHash.createSigMatHash([shingles1, shingles2, shingles3, shingles4], 200)
# print('SigMatSim Hash:', sigMatHash)

end = time.time()
print('Time elapsed Hash:', end - start)

Time elapsed Hash: 1.2057468891143799


In [298]:
compSet = CompareSets()
sim12 = compSet.similarity(shingles1, shingles2)
sim13 = compSet.similarity(shingles1, shingles3)
sim14 = compSet.similarity(shingles1, shingles4)
sim23 = compSet.similarity(shingles2, shingles3)
sim24 = compSet.similarity(shingles2, shingles4)
sim34 = compSet.similarity(shingles3, shingles4)

print('Jaccardi 0-1:', sim12)
print('Jaccardi 0-2:', sim13)
print('Jaccardi 0-3:', sim14)
print('Jaccardi 1-2:', sim23)
print('Jaccardi 1-3:', sim24)
print('Jaccardi 2-3:', sim34)

compSig = CompareSignatures()
  
print('SigMat Simimilarity 0-1:', compSig.calcSigMatSim(sigMatHash[:,0], sigMatHash[:,1]))
print('SigMat Simimilarity 0-2:', compSig.calcSigMatSim(sigMatHash[:,0], sigMatHash[:,2]))
print('SigMat Simimilarity 0-3:', compSig.calcSigMatSim(sigMatHash[:,0], sigMatHash[:,3]))
print('SigMat Simimilarity 1-2:', compSig.calcSigMatSim(sigMatHash[:,1], sigMatHash[:,2]))
print('SigMat Simimilarity 1-3:', compSig.calcSigMatSim(sigMatHash[:,1], sigMatHash[:,3]))
print('SigMat Simimilarity 2-3:', compSig.calcSigMatSim(sigMatHash[:,2], sigMatHash[:,3]))

Jaccardi 0-1: 0.41317365269461076
Jaccardi 0-2: 0.3224944320712695
Jaccardi 0-3: 0.38701725554642563
Jaccardi 1-2: 0.3003679852805888
Jaccardi 1-3: 0.3639814424293547
Jaccardi 2-3: 0.38113387327298714
SigMat Simimilarity 0-1: 0.425
SigMat Simimilarity 0-2: 0.305
SigMat Simimilarity 0-3: 0.42
SigMat Simimilarity 1-2: 0.29
SigMat Simimilarity 1-3: 0.35
SigMat Simimilarity 2-3: 0.36


In [309]:
lsh = LSH()
print(lsh.findCandidatePairs(sigMatHash, 0.60, bands=None, k=10**3))

(200, 4)
25 8
0.668740304976422
[]
