As discussed in the previous notebook, we now align the models to find semantic shifts.

The models are aligned pairwise, and a shift is detected by comparing the cosine similarity of a word's vectors in adjacent time periods.

In [1]:
import os
import sys
sys.path.append("..")

import numpy as np
from pprint import pprint
from gensim.models import Word2Vec, KeyedVectors

from src.seeds import Seeds
from src.gensim_word2vec_procrustes_align import smart_procrustes_align_gensim

In [2]:
def print_similar(word, models, n=5):
    """Print the `n` nearest neighbours of `word` for every model in `models`.

    Args:
        word: Token to look up.
        models: Mapping whose keys label the models (printed as-is) and whose
            values expose `.wv.most_similar(word, topn=...)`.
        n: Number of neighbours to list per model.

    Models that raise `KeyError` for `word` (i.e. the word is not in their
    vocabulary) are skipped silently; any other error is propagated.
    """
    print(word)
    for y, m in models.items():
        try:
            neighbours = m.wv.most_similar(word, topn=n)
        except KeyError:
            # Word absent from this model: skip it, but do not hide real failures.
            continue
        print(f"\t{y}: {[e[0] for e in neighbours]}")
        
def cos_sim(a, b):
    """Return the cosine similarity of `a` and `b`: (a . b) / (|a| * |b|)."""
    magnitude = np.linalg.norm(a) * np.linalg.norm(b)  # |a| * |b|
    return np.dot(a, b) / magnitude

def get_similarity(m1, m2, word):
    """Return the cosine similarity between `word`'s vectors in two models.

    Args:
        m1, m2: Models exposing a `.wv` object indexable by word.
        word: Token to compare.

    Returns:
        `cos_sim(m1.wv[word], m2.wv[word])`, or the sentinel -1 when either
        lookup raises `KeyError` (the word is missing from that model).
    """
    try:
        vector_1 = m1.wv[word]
        vector_2 = m2.wv[word]
    except KeyError:
        # Only a missing word maps to the sentinel; other errors must surface
        # instead of being silently reported as "word not found".
        return -1
    return cos_sim(vector_1, vector_2)
    
def get_similarity_sequence(models, couple_lists, word):
    """Return `get_similarity` for `word` across every key pair in `couple_lists`.

    Args:
        models: Mapping indexed by the keys found in `couple_lists`.
        couple_lists: Iterable of (key, key) pairs.
        word: Token to compare.

    Returns:
        One similarity per pair, in the order of `couple_lists`.
    """
    similarities = []
    for first_key, second_key in couple_lists:
        similarities.append(get_similarity(models[first_key], models[second_key], word))
    return similarities

def find_semantic_shift(models, couple_list, word, alpha=.9, threshold=.7):
    """Flag the model pairs between which `word` appears to change meaning.

    Every pair in `couple_list` is scored with `get_similarity_sequence`. Only
    pairs with a strictly positive score are kept, which discards the -1
    sentinel as well as any other non-positive similarity.

    Args:
        models: Mapping indexed by the keys found in `couple_list`.
        couple_list: Sequence of (key, key) pairs to compare.
        word: Token to examine.
        alpha: A pair is flagged when its score is below `alpha` times the
            mean of the kept scores.
        threshold: Absolute cut-off, applied only when exactly one pair was kept.

    Returns:
        A list of `((key, key), score)` tuples in `couple_list` order; empty
        when nothing is flagged.
    """
    scores = get_similarity_sequence(models, couple_list, word)
    scored_pairs = [(pair, score) for pair, score in zip(couple_list, scores) if score > 0]

    if len(scored_pairs) == 0:
        return []
    if len(scored_pairs) == 1 and scored_pairs[0][1] < threshold:
        # A lone score cannot fall below alpha times itself, so use the absolute cut-off.
        return scored_pairs

    mean_score = np.mean([score for _, score in scored_pairs])
    cutoff = alpha * mean_score
    return [entry for entry in scored_pairs if entry[1] < cutoff]


def print_shift(semantic_shifts):
    """Print the shifts produced by `find_semantic_shift`, one per line.

    Args:
        semantic_shifts: Sequence of `((start, end), similarity)` tuples; a
            falsy value prints the "nothing found" message instead.
    """
    if not semantic_shifts:
        print("No semantic shifts found.")
        return
    # Use a `.2f` format instead of round(): rounding a float32 and printing the
    # result can leak representation noise such as 0.6600000262260437.
    lines = [f"{e[0][0]} - {e[0][1]}: cos_sim {float(e[1]):.2f}" for e in semantic_shifts]
    print("Semantic shifts found:\n\t" + "\n\t".join(lines))

### Aligning the models

In [3]:
# Load one Word2Vec model per file in `directory`, keyed by the integer formed
# from the first four characters of the file name (e.g. "2010..." -> 2010).
# Requires `import os` in the imports cell.
directory = "../data/models/time_vectors"
models = {
    int(file_name[:4]): Word2Vec.load(os.path.join(directory, file_name))
    for file_name in sorted(os.listdir(directory))
}

In [4]:
# Pair every period with the one immediately before it, newest first.
years = sorted(models, reverse=True)
couple_list = list(zip(years[:-1], years[1:]))
print(couple_list[:3], "...")

[(2010, 2005), (2005, 2000), (2000, 1995)] ...


In [5]:
# Align the models pairwise, walking `couple_list` in order.
# The helper's return value is discarded, so this cell relies on it updating the
# models in place — NOTE(review): confirm in src/gensim_word2vec_procrustes_align,
# and which argument is the fixed reference (presumably `base`).
# Because of that in-place update, re-running this cell without reloading the
# models may not be idempotent — TODO confirm.
for base, other in couple_list:
    smart_procrustes_align_gensim(models[base], models[other])

### Detecting semantic shifts

The function detected two semantic shifts: the first one is plausible, while the second might require some tuning.

In [16]:
# Inspect a single word: the raw similarities for every pair in `couple_list`,
# the pairs flagged as shifts, and its 10 nearest neighbours in each model.
word = "gun"
print(get_similarity_sequence(models, couple_list, word))
print_shift(find_semantic_shift(models, couple_list, word))
print_similar(word, models, 10)

[0.8691654, 0.90565354, 0.89996094, 0.8732419, 0.8876986, 0.8930776, 0.8834063, 0.8867987, 0.85011154, 0.85648865, 0.92155033, 0.9331214, 0.9254542, 0.8913033, 0.9035417, 0.88802475, 0.89950114, 0.9326969, 0.8689469, 0.6593746, 0.68954474, 0.769387, 0.5876891, 0.76707613, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
Semantic shifts found:
	1915 - 1910: cos_sim 0.6600000262260437
	1910 - 1905: cos_sim 0.6899999976158142
	1900 - 1875: cos_sim 0.5899999737739563
gun
	1870: ['fracture', 'pair', 'clean', 'fight', 'left', 'bone', 'game', 'packing', 'hood', 'play']
	1875: ['appliance', 'scheme', 'tea', 'pistol', 'combination', 'fist', 'heart', 'mechanical', 'switching', 'shoulder']
	1900: ['pistol', 'knife', 'hook', 'accidentally', 'revolver', 'neck', 'punch', 'shoot', 'pin', 'shot']
	1905: ['shoot', 'shake', 'dress', 'spark', 'firing', 'lunch', 'glove', 'scream', 'ignite', 'cry']
	1910: ['pistol', 'cartridge', 'firing', 'knife', 'revolver', 'flame', 'powder', 'toy', 'ignite', 'explode']
	

Another example

In [17]:
# Same inspection as for the previous word: raw similarities, flagged shifts,
# and the 10 nearest neighbours in each model.
word = "homosexual"
print(get_similarity_sequence(models, couple_list, word))
print_shift(find_semantic_shift(models, couple_list, word))
print_similar(word, models, 10)

[0.33504492, 0.47475046, 0.7654252, 0.80400616, 0.80024385, 0.7773294, 0.81307054, 0.68021977, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
Semantic shifts found:
	2010 - 2005: cos_sim 0.3400000035762787
	2005 - 2000: cos_sim 0.4699999988079071
homosexual
	1970: ['unprovoked', 'paramour', 'prostitute', 'violent', 'delusion', 'panic', 'fellatio', 'stabbing', 'fondle', 'emotionally']
	1975: ['immoral', 'sexual', 'unprovoked', 'threat', 'insult', 'aggressive', 'intimidate', 'revenge', 'lewd', 'woman']
	1980: ['prostitute', 'sexual', 'fondle', 'anal', 'belligerent', 'hysterical', 'sex', 'lewd', 'lure', 'threatening']
	1985: ['prostitute', 'fondle', 'aggressive', 'threatening', 'acquaintance', 'echols', 'killer', 'aggression', 'victim', 'abusive']
	1990: ['anger', 'hate', 'girl', 'tommy', 'aggressive', 'boyfriend', 'jealous', 'abusive', 'friendly', 'boy']
	1995: ['abusive', 'jealous', 'teenage', 'prostitute', 'molest', 'fema

In [18]:
# Same inspection for a third word: raw similarities, flagged shifts,
# and the 10 nearest neighbours in each model.
word = "assault"
print(get_similarity_sequence(models, couple_list, word))
print_shift(find_semantic_shift(models, couple_list, word))
print_similar(word, models, 10)

[0.7422146, 0.7420629, 0.74819547, 0.81739616, 0.7563726, 0.716122, 0.83076614, 0.84968543, 0.8456069, 0.87560034, 0.9014175, 0.9030931, 0.9061008, 0.92596436, 0.9177247, 0.938816, 0.9139613, 0.9191841, 0.88082516, 0.8682343, 0.89535046, 0.89584684, 0.78498065, 0.89838916, 0.89973617, 0.65147275, 0.9195534, 0.90651715, 0.9667838, 0.83998644, -1, -1, -1, -1, -1, -1]
Semantic shifts found:
	2010 - 2005: cos_sim 0.7400000095367432
	2005 - 2000: cos_sim 0.7400000095367432
	2000 - 1995: cos_sim 0.75
	1990 - 1985: cos_sim 0.7599999904632568
	1985 - 1980: cos_sim 0.7200000286102295
	1865 - 1860: cos_sim 0.6499999761581421
assault
	1840: ['accuse', 'conviction', 'evidently', 'altogether', 'nevertheless', 'sufficiently', 'precisely', 'side', 'country', 'uncertainty']
	1845: ['detain', 'absent', 'formally', 'regularly', 'home', 'imprisonment', 'following', 'mayor', 'affiant', 'finally']
	1850: ['indict', 'imprisonment', 'threaten', 'battery', 'abundantly', 'corpus', 'ad', 'murder', 'blow', 'conv

### Shifts of the words of interest
For each word of interest, find its semantic shifts.

In [19]:
# Unpack the three seed collections returned by Seeds().get_starting_seeds();
# each one is iterated word by word in the cells below.
narcotics, weapons, investigation = Seeds().get_starting_seeds()

In [20]:
# Report the detected shifts (default alpha / threshold) for every narcotics seed,
# with a blank line between words.
for word in narcotics:
    print(word)
    print_shift(find_semantic_shift(models, couple_list, word))
    print()

blunt
Semantic shifts found:
	1975 - 1970: cos_sim 0.6700000166893005
	1970 - 1965: cos_sim 0.699999988079071

methamphetamine
Semantic shifts found:
	2000 - 1995: cos_sim 0.44999998807907104

overdose
Semantic shifts found:
	2010 - 2005: cos_sim 0.44999998807907104

ketamine
No semantic shifts found.

marijuana
Semantic shifts found:
	2010 - 2005: cos_sim 0.7599999904632568

fentanyl
No semantic shifts found.

drugs
No semantic shifts found.

ecstasy
No semantic shifts found.

cannabis
Semantic shifts found:
	1970 - 1965: cos_sim 0.5099999904632568
	1965 - 1960: cos_sim 0.699999988079071

lsd
No semantic shifts found.

heroin
No semantic shifts found.

cocaine
Semantic shifts found:
	2010 - 2005: cos_sim 0.7599999904632568
	1975 - 1970: cos_sim 0.7300000190734863

drug
Semantic shifts found:
	1950 - 1945: cos_sim 0.5400000214576721
	1875 - 1870: cos_sim 0.6600000262260437



In [21]:
# Report the detected shifts (default alpha / threshold) for every weapons seed,
# with a blank line between words.
for word in weapons:
    print(word)
    print_shift(find_semantic_shift(models, couple_list, word))
    print()

gun
Semantic shifts found:
	1915 - 1910: cos_sim 0.6600000262260437
	1910 - 1905: cos_sim 0.6899999976158142
	1900 - 1875: cos_sim 0.5899999737739563

firearm
Semantic shifts found:
	1965 - 1960: cos_sim 0.49000000953674316
	1945 - 1940: cos_sim 0.47999998927116394
	1940 - 1935: cos_sim 0.3799999952316284
	1930 - 1925: cos_sim 0.6000000238418579
	1920 - 1915: cos_sim 0.5899999737739563

shotgun
Semantic shifts found:
	1945 - 1940: cos_sim 0.7300000190734863
	1930 - 1925: cos_sim 0.6800000071525574

handgun
No semantic shifts found.

revolver
Semantic shifts found:
	1900 - 1875: cos_sim 0.6200000047683716

assault
Semantic shifts found:
	2010 - 2005: cos_sim 0.7400000095367432
	2005 - 2000: cos_sim 0.7400000095367432
	2000 - 1995: cos_sim 0.75
	1990 - 1985: cos_sim 0.7599999904632568
	1985 - 1980: cos_sim 0.7200000286102295
	1865 - 1860: cos_sim 0.6499999761581421

carbine
No semantic shifts found.

rifle
Semantic shifts found:
	1950 - 1945: cos_sim 0.6800000071525574
	1940 - 1935: cos_

In [22]:
# Report the detected shifts (default alpha / threshold) for every investigation seed,
# with a blank line between words.
for word in investigation:
    print(word)
    print_shift(find_semantic_shift(models, couple_list, word))
    print()

cybercrime
No semantic shifts found.

mafia
No semantic shifts found.

robbery
Semantic shifts found:
	1985 - 1980: cos_sim 0.7400000095367432
	1980 - 1975: cos_sim 0.7200000286102295
	1920 - 1915: cos_sim 0.7400000095367432
	1900 - 1875: cos_sim 0.699999988079071

gang
Semantic shifts found:
	1965 - 1960: cos_sim 0.6499999761581421
	1960 - 1955: cos_sim 0.6700000166893005
	1920 - 1915: cos_sim 0.6100000143051147
	1900 - 1875: cos_sim 0.6100000143051147

theft
Semantic shifts found:
	1915 - 1910: cos_sim 0.6600000262260437
	1910 - 1905: cos_sim 0.49000000953674316
	1900 - 1875: cos_sim 0.47999998927116394
	1875 - 1870: cos_sim 0.6100000143051147

cyber
No semantic shifts found.

killer
No semantic shifts found.

crime
Semantic shifts found:
	1900 - 1875: cos_sim 0.7699999809265137

recidivism
Semantic shifts found:
	2000 - 1995: cos_sim 0.4399999976158142
	1995 - 1990: cos_sim 0.5
	1990 - 1985: cos_sim 0.4399999976158142

serial
Semantic shifts found:
	1945 - 1940: cos_sim 0.6600000262