In [148]:
import matplotlib.pyplot as plt
from matplotlib import offsetbox
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
import plotly.graph_objects as go
import plotly.express as px
from mpl_toolkits.mplot3d import Axes3D


import numpy as np
import sklearn
import pandas as pd
import scipy
from sklearn.datasets import load_digits, make_s_curve
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import Isomap, LocallyLinearEmbedding, MDS, TSNE
import seaborn as sns

from time import time
from itertools import zip_longest
from tqdm import tqdm
from collections import OrderedDict
from functools import partial
import warnings
warnings.filterwarnings('ignore')


random_state=56
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg')

In [149]:
poll = pd.read_csv('all-answers.csv')
cands = pd.read_csv('vectors.csv')
cands

Unnamed: 0,name,name_eng,curia,q1,q2,q3,q4,q5,q6,q7,...,q16,q17,q18,q19,q20,q21,q22,q23,q24,q25
0,Аверьянов Иван,Aver'janov Ivan,common,0,1,2,-1,-1,2,2,...,0,-1,0,-1,-1,-1,-1,-1,-1,2
1,Агишев Ирек,Agishev Irek,common,-1,1,2,2,-2,2,-1,...,2,-2,2,2,-1,2,2,2,1,2
2,Албуров Георгий,Alburov Georgij,common,1,-2,1,-1,-1,1,2,...,2,-2,1,-2,2,-2,-2,1,1,-1
3,Александрова-Зорина Елизавета,Aleksandrova-Zorina Elizaveta,common,2,2,2,2,-1,2,2,...,2,-2,2,2,-2,-2,2,-2,-2,-1
4,Аншаков Михаил,Anshakov Mihail,common,1,1,-1,-1,1,2,2,...,2,-1,2,-2,-1,-2,-2,1,-1,-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,Крылов Константин,Krylov Konstantin,naz,0,0,2,1,1,1,1,...,2,-2,1,0,0,-2,-1,2,-1,2
175,Поткин Белов,Potkin Belov,naz,-1,1,2,-1,-1,-1,1,...,2,-1,1,-2,2,1,2,2,1,1
176,Радченко Всеволод,Radchenko Vsevolod,naz,1,1,2,0,1,2,1,...,2,-2,1,1,1,-2,-1,1,1,2
177,Резчиков Абанин,Rezchikov Abanin,naz,1,0,2,0,0,0,1,...,2,-2,1,0,0,1,0,1,1,2


In [156]:
def get_pca_embeddings(poll : pd.DataFrame, cands : pd.DataFrame) -> tuple:
    # Embedd poll and cands 25 dimensional vectors via PCA upto two principal components
    # Return (poll_emb, cands_emb)
    scaler = StandardScaler().fit(poll)
    print(f'FEATURES\nMean: {scaler.mean_}\nVariance: {scaler.var_}')

    pca = PCA(n_components=2, svd_solver='full', random_state=random_state).fit(scaler.fit_transform(poll))
    poll_emb = pd.DataFrame(pca.transform(scaler.fit_transform(poll)), columns=['PC1', 'PC2'])

    cands_emb = pd.DataFrame(pca.transform(scaler.fit_transform(cands.iloc[:, 3:])), columns=['PC1', 'PC2'])
    cands_emb = pd.concat(objs=(cands.iloc[:, 0], cands_emb), axis=1)

    print(f'\nPCA\nExplained variance: {pca.explained_variance_ratio_}')
    return poll_emb, cands_emb

poll_emb, cands_emb = get_pca_embeddings(poll, cands)

FEATURES
Mean: [ 0.77337149 -0.59702916  0.80212592  0.491142    0.42586536  0.9285909
  0.19133279  1.29626601  1.38689016  0.17034614  1.02507495  0.08885255
 -0.08394658 -0.63368765  0.32461161  1.33592259 -0.8807577  -0.13396021
 -0.89465794  0.50218043 -1.59498501  0.25647315  0.58449169  0.53284274
  0.51144726]
Variance: [1.33798266 1.49596828 1.71691017 1.51239087 1.45954903 1.59833492
 1.76115682 0.73997243 1.0905722  1.80608712 1.35069313 2.32543856
 1.89619636 2.21304884 2.39939699 0.86221733 1.26474012 2.02743488
 1.43493739 1.20965728 0.78717572 1.53602038 1.30009746 1.6896409
 1.58892592]

PCA
Explained variance: [0.16177616 0.10788656]


In [157]:
def get_dataframe(poll_emb : pd.DataFrame, cands_emb : pd.DataFrame) -> pd.DataFrame:
    df = pd.concat(objs=(poll_emb, cands_emb), axis=0)
    df['type'] = (~df['name'].isna()).replace(to_replace = {True : 'candidate', False : 'elector'})
    df['name'] = df['name'].apply(lambda x: 'elector' if type(x)==float else x)
    return df

df = get_dataframe(poll_emb, cands_emb)
df

Unnamed: 0,PC1,PC2,name,type
0,0.865645,-0.903099,elector,elector
1,-0.118478,-2.374318,elector,elector
2,-0.598661,-0.818387,elector,elector
3,0.281441,0.061510,elector,elector
4,0.770516,1.255159,elector,elector
...,...,...,...,...
174,0.836419,-1.345265,Крылов Константин,candidate
175,0.193825,-1.435835,Поткин Белов,candidate
176,1.230580,-1.717299,Радченко Всеволод,candidate
177,0.621822,-1.130206,Резчиков Абанин,candidate


In [158]:
fig = px.scatter(data_frame=df, x='PC1', y='PC2', color='type', hover_data=['name'])
fig.update(layout_coloraxis_showscale=False)
fig.show()

In [None]:
# precomp = scipy.spatial.distance.pdist(poll, 'minkowski', p=1.)
# precomp = scipy.spatial.distance.squareform(precomp)
# precomp

In [None]:
# n_neighbors=20
# random_state=21


# mds = MDS(n_components=3, n_init=1, max_iter=100, dissimilarity='precomputed', random_state=random_state)
# isomap = Isomap(n_neighbors=n_neighbors, n_components=3, metric='precomputed')
# lle = LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=3, method='standard', random_state=random_state)
# tsne = TSNE(perplexity=n_neighbors, n_components=3, random_state=random_state, metric='precomputed')

# mds_emb = mds.fit_transform(precomp)
# print('done')
# isomap_emb = isomap.fit_transform(precomp)
# print('done')
# lle_emb = lle.fit_transform(poll)
# print('done')
# tsne_emb = tsne.fit_transform(precomp)
# # plt.scatter(mds.fit_transform(poll))

In [None]:
# df = pd.DataFrame.from_records(np.concatenate((tsne_emb, .01 * np.ones(7338).reshape(-1,1)), axis=1),
#                                columns=['x_emb', 'y_emb', 'z_emb', 'size'])
# fig = px.scatter_3d(df, x='x_emb', y='y_emb', z='z_emb', size='size', opacity=1.)
# fig.show()

In [None]:
# df = pd.DataFrame.from_records(np.concatenate((isomap_emb, np.ones(7338).reshape(-1,1)), axis=1), columns=['x_emb', 'y_emb', 'z_emb', 'size'])
# df

In [None]:
# df = pd.DataFrame.from_records(isomap_emb, columns=['x_emb', 'y_emb', 'z_emb'])
# sns.pairplot(df)
# plt.savefig('pairplot.png', dpi=300)