In [1]:
import pandas as pd
import json

import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the embeddings stored in the JSON file into memory
with open('distillBERT_embeddings_paragraph.json') as embeddings_file:
    all_embeddings = json.loads(embeddings_file.read())

In [3]:
# Report the shape of a single embedding, using the first entry as the sample
print('Shape of the embeddings: ', np.shape(all_embeddings[0]))

Shape of the embeddings:  (768,)


In [4]:
# PCA dimension reduction: fit a 10-component PCA on the embeddings.
# random_state is pinned so that a fresh "Restart & Run All" reproduces the
# same components: with the default svd_solver='auto', scikit-learn can select
# the randomized SVD solver for an input of this size, and that solver is
# stochastic unless a seed is given.
pca = PCA(n_components=10, random_state=0)
pca.fit(all_embeddings)

# show the explained variance ratio (per component, then the total retained)
print('Explained variance ratio: ', pca.explained_variance_ratio_)
print('Sum of explained variance ratio: ', sum(pca.explained_variance_ratio_))


Explained variance ratio:  [0.35031584 0.10254797 0.06922593 0.04339591 0.03726173 0.02704681
 0.02350657 0.01883989 0.01843502 0.01584852]
Sum of explained variance ratio:  0.7064241843832859


In [5]:
# Project every embedding onto the principal components fitted above
pcas = pca.transform(X=all_embeddings)

In [6]:
# (rows, components) of the projected embeddings
np.shape(pcas)

(3142, 10)

In [7]:
# Peek at the first five projected rows, all components
pcas[:5, :]

array([[ 1.64033351,  0.26066731, -0.1500692 ,  0.47575927,  0.13056072,
        -0.03519511, -0.33271385, -0.02165716, -0.23167121, -0.12744095],
       [ 1.60429446,  0.41202332, -0.16625691,  0.24071882, -0.20892941,
         0.0807956 ,  0.03813537, -0.15306791, -0.36109912,  0.00260587],
       [ 0.74239751,  0.11544861,  0.90743843,  0.31794922,  0.65205666,
        -0.40186452,  0.09654632,  0.46324191,  0.12036246, -0.33084398],
       [ 0.81202772,  0.04731575,  1.08560973,  0.29103246,  0.69460547,
        -0.33990799,  0.04546961,  0.66995762,  0.19961028, -0.64628818],
       [ 0.86345335,  0.10131379,  0.87677478,  0.45060398,  0.50857896,
        -0.47647844,  0.00602279,  0.56687678, -0.16581174, -0.20131903]])

In [8]:
# load the cfips data
dtypes = {'cfips': str}
cfips_df = pd.read_csv('cfips.csv', dtype=dtypes)
cfips_df.head()

Unnamed: 0,cfips
0,1001
1,1003
2,1005
3,1007
4,1009


In [9]:
# Column labels for the PCA components: 'PCA_dim_1' through 'PCA_dim_10'
dim_names = [f'PCA_dim_{component}' for component in range(1, 11)]
dim_names

['PCA_dim_1',
 'PCA_dim_2',
 'PCA_dim_3',
 'PCA_dim_4',
 'PCA_dim_5',
 'PCA_dim_6',
 'PCA_dim_7',
 'PCA_dim_8',
 'PCA_dim_9',
 'PCA_dim_10']

In [10]:
# Start from an empty frame: a 'cfips' column followed by one column per PCA dimension
PCA_df = pd.DataFrame(columns=['cfips', *dim_names])
PCA_df.head()

Unnamed: 0,cfips,PCA_dim_1,PCA_dim_2,PCA_dim_3,PCA_dim_4,PCA_dim_5,PCA_dim_6,PCA_dim_7,PCA_dim_8,PCA_dim_9,PCA_dim_10


In [11]:
# Build the output table in one vectorized step instead of enlarging the frame
# one row at a time with .loc, which is slow and leaves every column as dtype
# object rather than a numeric dtype.
cfips_codes = cfips_df['cfips'].to_numpy()

# Rows are paired purely by position (i-th cfips <-> i-th row of pcas), so the
# two inputs must have the same length; fail loudly instead of hitting an
# IndexError or silently dropping rows.
if len(cfips_codes) != len(pcas):
    raise ValueError(
        f'cfips_df has {len(cfips_codes)} rows but pcas has {len(pcas)} rows'
    )

PCA_df = pd.DataFrame(pcas, columns=dim_names)
# Put the identifier first so the column order stays ['cfips'] + dim_names
PCA_df.insert(0, 'cfips', cfips_codes)
PCA_df.head()

Unnamed: 0,cfips,PCA_dim_1,PCA_dim_2,PCA_dim_3,PCA_dim_4,PCA_dim_5,PCA_dim_6,PCA_dim_7,PCA_dim_8,PCA_dim_9,PCA_dim_10
0,1001,1.640334,0.260667,-0.150069,0.475759,0.130561,-0.035195,-0.332714,-0.021657,-0.231671,-0.127441
1,1003,1.604294,0.412023,-0.166257,0.240719,-0.208929,0.080796,0.038135,-0.153068,-0.361099,0.002606
2,1005,0.742398,0.115449,0.907438,0.317949,0.652057,-0.401865,0.096546,0.463242,0.120362,-0.330844
3,1007,0.812028,0.047316,1.08561,0.291032,0.694605,-0.339908,0.04547,0.669958,0.19961,-0.646288
4,1009,0.863453,0.101314,0.876775,0.450604,0.508579,-0.476478,0.006023,0.566877,-0.165812,-0.201319


In [12]:
# Write the table to PCA_df.csv; index=False keeps the row index out of the file
PCA_df.to_csv(path_or_buf='PCA_df.csv', index=False)