In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from scipy.spatial.distance import hamming, euclidean
from scipy.stats import pearsonr
import hashlib
import ctypes


In [4]:
#corpus: 15 tweets using Omicron as the query (Jan 32, 2022)
corpus = ["DrCChambers RT @CIHR_IRSC: In light of the disruptions caused by the Omicron variant of #Covid19, CIHR is extending the registration and application de…",
"sealsoftheend Japan's Kowa says Ivermectin showed 'antiviral effect' against Omicron https://t.co/mKKY24WeQV",
"SVictor70973566 RT @EricTopol: Why is Omicron so hyper-transmissible? It's not related to high viral load in the upper airway, as shown by 2 recent studies…",
"freethinkfacts RT @yaneerbaryam: Actual cases of reinfection by Omicron are so widespread they are manifest to anyone who is not closing their eyes: 10/…",
"lsoril RT @CIHR_IRSC: In light of the disruptions caused by the Omicron variant of #Covid19, CIHR is extending the registration and application de…",
"pompey1977 RT @AllisonPearson: How can people not get it? Omicron’s advantage over Delta is it evades the vaccine. Everyone is going to get Omicron…",
"freethinkfacts RT @yaneerbaryam: Taken together, our results suggest that Omicron-induced immunity may not be sufficient to prevent infection from anothe…",
"SteveBennett15 RT @EricTopol: Anyone who thinks that vaccines aren't working against Omicron might want to review the data https://t.co/9bHYdKxz8u https:/…",
"wasohope RT @ASTERHealthcare: Omicron covid-19 variant was reported from South Africa on November 2021. This variant has had many mutations that aff…",
"SVictor70973566 RT @MdFacep: @EricTopol @maxdkozlov Omicron's impact is in its ability to evade our immune system: Our 'older' vaccine produced NABS fail…",
"Wildantlers @melulater Of course it did, Omicron spreads far quicker. But as a % of people who die from omicron it is far milde… https://t.co/eDFuW5qjAH",
"peterandann RT @EnemyInAState: Omicron, London: Babies and toddlers continue to surge, and 1 in 9 admitted is a child. Over 668 babies and toddlers hav…",
"DrMroz RT @CIHR_IRSC: In light of the disruptions caused by the Omicron variant of #Covid19, CIHR is extending the registration and application de…",
"Deis85208721 CORRECTED-Japan's Kowa says ivermectin showed 'antiviral effect' against Omicron in research https://t.co/VEoQyz5x6F",
"freethinkfacts RT @yaneerbaryam: Thus, breakthrough infection from Omicron may enhance cross-protection against Delta, and vice-versa, [only] inasmuch as…"]


vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
words = vectorizer.get_feature_names()
print(f"#words {len(words)} {words}")

#words 170 ['10', '19', '2021', '668', '9bhydkxz8u', 'ability', 'actual', 'admitted', 'advantage', 'aff', 'africa', 'against', 'airway', 'allisonpearson', 'and', 'anothe', 'antiviral', 'anyone', 'application', 'are', 'aren', 'as', 'asterhealthcare', 'babies', 'be', 'breakthrough', 'but', 'by', 'can', 'cases', 'caused', 'child', 'cihr', 'cihr_irsc', 'closing', 'co', 'continue', 'corrected', 'course', 'covid', 'covid19', 'cross', 'data', 'de', 'deis85208721', 'delta', 'did', 'die', 'disruptions', 'drcchambers', 'drmroz', 'edfuw5qjah', 'effect', 'enemyinastate', 'enhance', 'erictopol', 'evade', 'evades', 'everyone', 'extending', 'eyes', 'fail', 'far', 'freethinkfacts', 'from', 'get', 'going', 'had', 'has', 'hav', 'high', 'how', 'https', 'hyper', 'immune', 'immunity', 'impact', 'in', 'inasmuch', 'induced', 'infection', 'is', 'it', 'its', 'ivermectin', 'japan', 'kowa', 'light', 'load', 'london', 'lsoril', 'manifest', 'many', 'maxdkozlov', 'may', 'mdfacep', 'melulater', 'might', 'milde', 'mk

In [5]:
print(X.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]]


In [6]:
print(f"vector for first tweet\n{X.toarray()[0]}")

vector for first tweet
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 0
 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [7]:
F_BITS = 64

def calculate_simhash(features_extracted, weights):
    arrayvector_v = np.zeros(F_BITS)
    
    for feature, weight in zip(features_extracted, weights):
        hash_of_feature = hash(feature)        
        hash_to_binary = bin(ctypes.c_size_t(hash_of_feature).value)[2:]
        hash_to_binary = hash_to_binary.zfill(F_BITS)
        
        for i, bit in enumerate(hash_to_binary):
            if bit == '1':
                arrayvector_v[i] += weight
            else:
                arrayvector_v[i] -= weight
    
    simhash_fingerprint = ''.join(['1' if v >= 0 else '0' for v in arrayvector_v])
    
    return simhash_fingerprint

In [8]:
corpus = ["DrCChambers RT @CIHR_IRSC: In light of the disruptions caused by the Omicron variant of #Covid19, CIHR is extending the registration and application de…",
"sealsoftheend Japan's Kowa says Ivermectin showed 'antiviral effect' against Omicron https://t.co/mKKY24WeQV",
"SVictor70973566 RT @EricTopol: Why is Omicron so hyper-transmissible? It's not related to high viral load in the upper airway, as shown by 2 recent studies…",
"freethinkfacts RT @yaneerbaryam: Actual cases of reinfection by Omicron are so widespread they are manifest to anyone who is not closing their eyes: 10/…",
"lsoril RT @CIHR_IRSC: In light of the disruptions caused by the Omicron variant of #Covid19, CIHR is extending the registration and application de…",
"pompey1977 RT @AllisonPearson: How can people not get it? Omicron’s advantage over Delta is it evades the vaccine. Everyone is going to get Omicron…",
"freethinkfacts RT @yaneerbaryam: Taken together, our results suggest that Omicron-induced immunity may not be sufficient to prevent infection from anothe…",
"SteveBennett15 RT @EricTopol: Anyone who thinks that vaccines aren't working against Omicron might want to review the data https://t.co/9bHYdKxz8u https:/…",
"wasohope RT @ASTERHealthcare: Omicron covid-19 variant was reported from South Africa on November 2021. This variant has had many mutations that aff…",
"SVictor70973566 RT @MdFacep: @EricTopol @maxdkozlov Omicron's impact is in its ability to evade our immune system: Our 'older' vaccine produced NABS fail…",
"Wildantlers @melulater Of course it did, Omicron spreads far quicker. But as a % of people who die from omicron it is far milde… https://t.co/eDFuW5qjAH",
"peterandann RT @EnemyInAState: Omicron, London: Babies and toddlers continue to surge, and 1 in 9 admitted is a child. Over 668 babies and toddlers hav…",
"DrMroz RT @CIHR_IRSC: In light of the disruptions caused by the Omicron variant of #Covid19, CIHR is extending the registration and application de…",
"Deis85208721 CORRECTED-Japan's Kowa says ivermectin showed 'antiviral effect' against Omicron in research https://t.co/VEoQyz5x6F",
"freethinkfacts RT @yaneerbaryam: Thus, breakthrough infection from Omicron may enhance cross-protection against Delta, and vice-versa, [only] inasmuch as…"]

vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)
words = vectorizer.get_feature_names()
print(f"#words {len(words)} {words}")

#words 127 ['10', '19', '2021', '668', '9bhydkxz8u', 'ability', 'actual', 'admitted', 'advantage', 'aff', 'africa', 'airway', 'allisonpearson', 'anothe', 'antiviral', 'application', 'aren', 'asterhealthcare', 'babies', 'breakthrough', 'cases', 'caused', 'child', 'cihr', 'cihr_irsc', 'closing', 'continue', 'corrected', 'course', 'covid', 'covid19', 'cross', 'data', 'deis85208721', 'delta', 'did', 'die', 'disruptions', 'drcchambers', 'drmroz', 'edfuw5qjah', 'effect', 'enemyinastate', 'enhance', 'erictopol', 'evade', 'evades', 'extending', 'eyes', 'fail', 'far', 'freethinkfacts', 'going', 'hav', 'high', 'https', 'hyper', 'immune', 'immunity', 'impact', 'inasmuch', 'induced', 'infection', 'ivermectin', 'japan', 'kowa', 'light', 'load', 'london', 'lsoril', 'manifest', 'maxdkozlov', 'mdfacep', 'melulater', 'milde', 'mkky24weqv', 'mutations', 'nabs', 'november', 'older', 'omicron', 'people', 'peterandann', 'pompey1977', 'prevent', 'produced', 'protection', 'quicker', 'recent', 'registration',

In [9]:
vectorizer = CountVectorizer(stop_words='english', min_df=2)
X = vectorizer.fit_transform(corpus)
words = vectorizer.get_feature_names()
print(f"#words {len(words)} {words}")

#words 28 ['antiviral', 'application', 'caused', 'cihr', 'cihr_irsc', 'covid19', 'delta', 'disruptions', 'effect', 'erictopol', 'extending', 'freethinkfacts', 'https', 'infection', 'ivermectin', 'japan', 'kowa', 'light', 'omicron', 'people', 'registration', 'rt', 'says', 'showed', 'svictor70973566', 'vaccine', 'variant', 'yaneerbaryam']


In [10]:
simhashes = []
for i in range(len(corpus)):
    features = vectorizer.get_feature_names()
    weights = X[i].toarray()[0]
    fingerprint = calculate_simhash(features, weights)
    simhashes.append(fingerprint)


In [11]:
calculate_hammingdistances = np.zeros((len(corpus), len(corpus)))

for i in range(len(corpus)):
    for j in range(i + 1, len(corpus)):
        calculate_hammingdistances[i, j] = hamming(simhashes[i], simhashes[j])
              
print("Hamming Distances:")
print(calculate_hammingdistances)

Hamming Distances:
[[0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1.]
 [0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1.]
 [0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1.]
 [0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [12]:
calculate_euclideandistances = np.zeros((len(corpus), len(corpus)))

for i in range(len(corpus)):
    for j in range(i + 1, len(corpus)):
        dist = euclidean(X[i].toarray()[0], X[j].toarray()[0])
        calculate_euclideandistances[i, j] = dist

print("\nEuclidean Distances:")
print(calculate_euclideandistances)


Euclidean Distances:
[[0.         4.35889894 3.46410162 3.46410162 0.         3.74165739
  3.60555128 3.87298335 3.16227766 3.60555128 3.74165739 3.16227766
  0.         4.35889894 3.74165739]
 [0.         0.         3.31662479 3.31662479 4.35889894 3.60555128
  3.46410162 3.16227766 3.60555128 3.46410162 3.         3.
  4.35889894 0.         3.60555128]
 [0.         0.         0.         2.         3.46410162 2.44948974
  2.23606798 2.23606798 2.44948974 1.         2.44948974 1.41421356
  3.46410162 3.31662479 2.44948974]
 [0.         0.         0.         0.         3.46410162 2.44948974
  1.         2.64575131 2.44948974 2.23606798 2.44948974 1.41421356
  3.46410162 3.31662479 1.41421356]
 [0.         0.         0.         0.         0.         3.74165739
  3.60555128 3.87298335 3.16227766 3.60555128 3.74165739 3.16227766
  0.         4.35889894 3.74165739]
 [0.         0.         0.         0.         0.         0.
  2.64575131 3.         2.82842712 2.23606798 2.         2.
  3.74

In [13]:
correlation, _ = pearsonr(calculate_hammingdistances.flatten(), calculate_euclideandistances.flatten())
print("\nPearson Correlation between Distances:", correlation)



Pearson Correlation between Distances: 0.9432267784025823


In [14]:
corpus = ["DrCChambers RT @CIHR_IRSC: In light of the disruptions caused by the Omicron variant of #Covid19, CIHR is extending the registration and application de…",
"sealsoftheend Japan's Kowa says Ivermectin showed 'antiviral effect' against Omicron https://t.co/mKKY24WeQV",
"SVictor70973566 RT @EricTopol: Why is Omicron so hyper-transmissible? It's not related to high viral load in the upper airway, as shown by 2 recent studies…",
"freethinkfacts RT @yaneerbaryam: Actual cases of reinfection by Omicron are so widespread they are manifest to anyone who is not closing their eyes: 10/…",
"lsoril RT @CIHR_IRSC: In light of the disruptions caused by the Omicron variant of #Covid19, CIHR is extending the registration and application de…",
"pompey1977 RT @AllisonPearson: How can people not get it? Omicron’s advantage over Delta is it evades the vaccine. Everyone is going to get Omicron…",
"freethinkfacts RT @yaneerbaryam: Taken together, our results suggest that Omicron-induced immunity may not be sufficient to prevent infection from anothe…",
"SteveBennett15 RT @EricTopol: Anyone who thinks that vaccines aren't working against Omicron might want to review the data https://t.co/9bHYdKxz8u https:/…",
"wasohope RT @ASTERHealthcare: Omicron covid-19 variant was reported from South Africa on November 2021. This variant has had many mutations that aff…",
"SVictor70973566 RT @MdFacep: @EricTopol @maxdkozlov Omicron's impact is in its ability to evade our immune system: Our 'older' vaccine produced NABS fail…",
"Wildantlers @melulater Of course it did, Omicron spreads far quicker. But as a % of people who die from omicron it is far milde… https://t.co/eDFuW5qjAH",
"peterandann RT @EnemyInAState: Omicron, London: Babies and toddlers continue to surge, and 1 in 9 admitted is a child. Over 668 babies and toddlers hav…",
"DrMroz RT @CIHR_IRSC: In light of the disruptions caused by the Omicron variant of #Covid19, CIHR is extending the registration and application de…",
"Deis85208721 CORRECTED-Japan's Kowa says ivermectin showed 'antiviral effect' against Omicron in research https://t.co/VEoQyz5x6F",
"freethinkfacts RT @yaneerbaryam: Thus, breakthrough infection from Omicron may enhance cross-protection against Delta, and vice-versa, [only] inasmuch as…"]

vectorizer = CountVectorizer(stop_words='english', ngram_range=(2, 2), min_df=2)
X = vectorizer.fit_transform(corpus)
words = vectorizer.get_feature_names()
print(f"#words {len(words)} {words}")
print(X.toarray())

#words 22 ['antiviral effect', 'caused omicron', 'cihr extending', 'cihr_irsc light', 'covid19 cihr', 'disruptions caused', 'effect omicron', 'extending registration', 'freethinkfacts rt', 'ivermectin showed', 'japan kowa', 'kowa says', 'light disruptions', 'omicron variant', 'registration application', 'rt cihr_irsc', 'rt erictopol', 'rt yaneerbaryam', 'says ivermectin', 'showed antiviral', 'svictor70973566 rt', 'variant covid19']
[[0 1 1 1 1 1 0 1 0 0 0 0 1 1 1 1 0 0 0 0 0 1]
 [1 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 1 1 1 1 1 0 1 0 0 0 0 1 1 1 1 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [15]:
simhashes = []
for i in range(len(corpus)):
    features = vectorizer.get_feature_names()
    weights = X[i].toarray()[0]
    fingerprint = calculate_simhash(features, weights)
    simhashes.append(fingerprint)


calculate_hammingdistances = np.zeros((len(corpus), len(corpus)))

for i in range(len(corpus)):
    for j in range(i + 1, len(corpus)):
        calculate_hammingdistances[i, j] = hamming(simhashes[i], simhashes[j])
              
print("Hamming Distances:")
print(calculate_hammingdistances)

Hamming Distances:
[[0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1.]
 [0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1.]
 [0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0.]
 [0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1.]
 [0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [16]:
calculate_euclideandistances = np.zeros((len(corpus), len(corpus)))

for i in range(len(corpus)):
    for j in range(i + 1, len(corpus)):
        dist = euclidean(X[i].toarray()[0], X[j].toarray()[0])
        calculate_euclideandistances[i, j] = dist

print("\nEuclidean Distances:")
print(calculate_euclideandistances)


Euclidean Distances:
[[0.         4.24264069 3.60555128 3.60555128 0.         3.31662479
  3.60555128 3.46410162 3.31662479 3.46410162 3.31662479 3.31662479
  0.         4.24264069 3.60555128]
 [0.         0.         3.         3.         4.24264069 2.64575131
  3.         2.82842712 2.64575131 2.82842712 2.64575131 2.64575131
  4.24264069 0.         3.        ]
 [0.         0.         0.         2.         3.60555128 1.41421356
  2.         1.         1.41421356 1.         1.41421356 1.41421356
  3.60555128 3.         2.        ]
 [0.         0.         0.         0.         3.60555128 1.41421356
  0.         1.73205081 1.41421356 1.73205081 1.41421356 1.41421356
  3.60555128 3.         0.        ]
 [0.         0.         0.         0.         0.         3.31662479
  3.60555128 3.46410162 3.31662479 3.46410162 3.31662479 3.31662479
  0.         4.24264069 3.60555128]
 [0.         0.         0.         0.         0.         0.
  1.41421356 1.         0.         1.         0.         0

In [17]:
correlation, _ = pearsonr(calculate_hammingdistances.flatten(), calculate_euclideandistances.flatten())
print("\nPearson Correlation between Distances:", correlation)


Pearson Correlation between Distances: 0.8886480198878762
