In [11]:
import json
import scipy
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel

In [21]:
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
from sklearn.metrics.cluster import contingency_matrix 
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

In [2]:
def get_embedding(texts, model, tokenizer, prep, mask=False):
    
    embs = []
    
    for idx, text in enumerate(texts):
               
        marked_text = "[CLS] " + text + " [SEP]"
        tokenized_text = tokenizer.tokenize(marked_text)

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)[:512]
        
        attention_ids = [1] * len(indexed_tokens)
        segments_ids = [0] * len(indexed_tokens)
        
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        attention_tensors = torch.tensor([attention_ids])
        
        with torch.no_grad():
#             outputs = model(input_ids=tokens_tensor, attention_mask=attention_tensors, token_type_ids=segments_tensors, return_dict=True)
            outputs = model(input_ids=tokens_tensor, attention_mask=attention_tensors, return_dict=True, output_hidden_states=True)
            hidden_states = outputs.last_hidden_state      

            if prep in tokenized_text:
                token_idx = tokenized_text.index(prep)
                sum_vec = hidden_states[0][token_idx]
                embs.append(sum_vec.numpy().tolist())
            else:
                print(prep, 'not found in', marked_text, text)
                embs.append(float("nan"))
                continue
            
    return embs

In [3]:
PREPOSITION = 'в силу' 
ENCODER = 'DeepPavlov/rubert-base-cased'
PREPOSITION_CONTENT = PREPOSITION.split()[1] #силу
# ENCODER = 'bert-base-multilingual-uncased'

In [4]:
import pygsheets
c = pygsheets.authorize(service_file='client_secret.json')
# ***

In [None]:
tokenizer = BertTokenizer.from_pretrained(ENCODER)
encoder = BertModel.from_pretrained(ENCODER, output_hidden_states = True)

# make sure that the prep is tokenized properly (no subtokens)
assert len(tokenizer(PREPOSITION_CONTENT).input_ids) == 3

In [8]:
# sh.worksheets() #list all preps in a file
wk = sh.worksheet_by_title(PREPOSITION)
df = wk.get_as_df()

In [9]:
# text preprocessing if needed

# df.dropna(subset=['year'],inplace=True)
# df['year'] = df['year'].str[:4]
# df['year_split'] = df['year'].astype(str).str.split(pat="-").str[0]
# df['text_cleaned'] = df['text'].str.replace(r"\[.*\]","")
# df['text_cleaned'] = df['text_cleaned'].astype(str).str.lower()
# df['year'] = df['year'].astype('int32')
# df = df[df['year'].str.isdigit()]

df['text'] = df['text'].astype(str).str.lower()
df['year'] = df['year'].astype('int32')

In [None]:
df['embs'] = get_embedding(df["text"], encoder, tokenizer, PREPOSITION_CONTENT, False)

In [13]:
df.dropna(subset=['year', 'text', 'embs'], inplace=True)
df.to_pickle("./pkl/"+PREPOSITION+".pkl")

In [None]:
# https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
stats = {}

for prep in [PREPOSITION]:
    
    stats[prep] = {}
    print("prep", prep)
    
    try:
        df = pd.read_pickle("pkl/"+prep+".pkl")
        print(df.shape)
    except:
        continue

    new_df = pd.DataFrame()
    
    for year in [(1800,1850), (1850, 1900), (1900, 1950), (1950, 2000)]:
        print("period", year)
    
        kmeans = {}
        silhouette_list = []
        
        small_df = df[df['year'].between(year[0], year[1])]
        print("Shape of the defined period:", small_df.shape)
        
        X = small_df['embs'].tolist()

        for n_clusters in range(2, 7):
            try:
                clusterer = KMeans(n_clusters=n_clusters, random_state=10).fit(X)
            except:
                continue  
                
            cluster_labels = clusterer.predict(X)
            kmeans[n_clusters] = clusterer

            silhouette = silhouette_score(X, cluster_labels)
            silhouette_list.append((n_clusters, silhouette))

        silhouette_list_sorted = sorted(silhouette_list, key=lambda x: x[1], reverse=True)
        stats[prep][year] = silhouette_list_sorted

        
        small_df.assign(cluster=lambda x: kmeans[silhouette_list_sorted[0][0]].predict(x.embs.tolist())[0])
#         small_df['cluster'] = small_df['embs'].apply(lambda x: kmeans[silhouette_list_sorted[0][0]].predict([x])[0])
#         small_df['cluster_distance'] = small_df['embs'].apply(lambda x: [scipy.spatial.distance.euclidean(cl, x) for cl in kmeans[silhouette_list_sorted[0][0]].cluster_centers_])
    
        new_df = pd.concat([new_df, small_df], ignore_index=True)

In [24]:
stats

{'в силу': {(1800, 1850): [(2, 0.16734242536917696),
   (4, 0.1076494598462531),
   (6, 0.08846447846254077),
   (5, 0.0709829133248713),
   (3, 0.06871632535788721)],
  (1850, 1900): [(2, 0.07675602386730403),
   (3, 0.06588214788759043),
   (4, 0.06142597883234597),
   (6, 0.04178164581758671),
   (5, 0.039427527694123145)],
  (1900, 1950): [(2, 0.09444822894245805),
   (3, 0.06976714339675091),
   (4, 0.055987144184087505),
   (6, 0.052077270356110386),
   (5, 0.047939452056141564)],
  (1950, 2000): [(2, 0.16142255796431365),
   (3, 0.14147452958160067),
   (4, 0.088486379816195),
   (5, 0.07130788867854179),
   (6, 0.06728665077453826)]}}

In [123]:
new_df.sort_values(by=['year'], inplace=True)
new_df.drop('embs', axis='columns', inplace=True)

wks = sh.worksheet_by_title("="+PREPOSITION)
wks.set_dataframe(new_df,(1,1), fit=True)