# PCA with Count Vectors of Names
How well can PCA reduce dimensionality in count vectors of names assuming english alphabet (len 26 vector)

In [1]:
# Load Test Data (names)
from src import load_file
from src import standardize_str_columns_in_dataframe

# Load PII from sample data
df = load_file(
    'data/sample.csv', 
    subset=['FIRST_NAME', 'LAST_NAME']
)

# Deduplicate data
df = df.drop_duplicates()

# Clean names (some randomly generated non-ascii stuff needs to be removed)
df = standardize_str_columns_in_dataframe(df[['FIRST_NAME', 'LAST_NAME']])

In [50]:
df.head(2)

Unnamed: 0,FIRST_NAME,LAST_NAME
0,saladi,batty
1,saylen,hilker


## Alphabet Vectorizer
Let's turn words into a vector of alphabet counts

In [5]:
import string
import numpy as np

# Define Vectorizer
class AlphabetVectorizer():
    def __init__(self, index):
        self.index = index
        
    
    def init_vector(self):
        return np.zeros(len(self.index))
    
    
    def transform(self, x: str) -> np.ndarray:
        
        def update_vector(val, vector):
            if val in self.index:
                vector[self.index[val]] += 1
        
        v = self.init_vector()
        for val in x:
            update_vector(val, v)
        
        return v
        
    
    def __call__(self, x: str):
        return self.transform(x)
    
    
# Create Index
alpha_index = {l: i for i, l in enumerate(string.ascii_lowercase)}

vectorizer = AlphabetVectorizer(index = alpha_index)

In [6]:
# Test vectorizer
test_name = 'vincent'
vectorizer(test_name)

array([0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 2., 0., 0., 0.,
       0., 0., 1., 0., 1., 0., 0., 0., 0.])

In [26]:
import pandas as pd

X = pd.DataFrame(
    df['FIRST_NAME'].apply(vectorizer).to_list(),
    columns=list(alpha_index.keys()))
X.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,q,r,s,t,u,v,w,x,y,z
0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## PCA - Manipulating Dimensionality
We can now apply our vectorizer across the dataset and get these nice fixed-length vectors.  From there, we can fit the PCA decomposition and explore how much information we lose as we drop down in dimensions

In [29]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Make an instance of the Model and Standardize the Count Data
pca = PCA(n_components=26)
X = StandardScaler().fit_transform(X)

pca = PCA(n_components = 26)
pca.fit(X)

PCA(n_components=26)

In [33]:
# Check for how much of the variance is maintained (should be all of it or 1 here)
sum(pca.explained_variance_ratio_)

1.0

In [44]:
output = []

for n in range(20, 27, 1):
    pca = PCA(n_components = n)
    pca.fit(X)
    
    run = {}
    run['num_components'] = n
    run['explained_variance'] = sum(pca.explained_variance_ratio_)
    output.append(run)
    
    print(len(pca.transform(X[0:1])[0]))

pd.DataFrame(output)

20
21
22
23
24
25
26


Unnamed: 0,num_components,explained_variance
0,20,0.855085
1,21,0.886514
2,22,0.916421
3,23,0.944671
4,24,0.969927
5,25,0.988068
6,26,1.0


## Check For Stratification - How Different Are Names At Each Reduction
As we reduce dimensionality, data necessarily gets a bit more squished.  We want to make sure that if we compare two names, their 'distance' stays somewhat stable.

In [80]:
from sklearn.metrics.pairwise import cosine_similarity
from numpy.random import randint
from numpy import std, mean

output = []
eval_pool = randint(0, len(X), 10)

for n in range(20, 27, 1):
    pca = PCA(n_components = n)
    pca.fit(X)
    
    similarity= cosine_similarity(pca.fit_transform(X)[eval_pool])[0]
    s_std = std(similarity)
    s_mean = mean(similarity)
    s_range = max(similarity) - min(similarity)
    
    run = {}
    run['num_components'] = n
    run['similarity_mean'] = s_mean
    run['similarity_std'] = s_std
    run['similarity_range'] = s_range
    output.append(run)

pd.DataFrame(output)

Unnamed: 0,num_components,similarity_mean,similarity_std,similarity_range
0,20,0.064254,0.362074,1.472638
1,21,0.06855,0.358582,1.459433
2,22,0.079829,0.326377,1.157697
3,23,0.081241,0.325346,1.148144
4,24,0.080391,0.326202,1.149218
5,25,0.075004,0.324014,1.146364
6,26,0.083606,0.319949,1.163778


## Scikit-Learn Vectorizers TF-IDF Vectorizer, Count Vectorizer
Fit and export vectors for each deduplicated name is possible if names are first split into n-grams (single letters won't work out of the box). Below is an example of splitting the word into n-grams and then running the vectorizer.  The output matrix is still quite large for this corpus at n=2, however, so a simpler vectorizer was used above.

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def split_to_ngrams(x: str, n: int = 2):
    s = x
    l = ''
    
    while len(s) >= n:
        l += s[0:n] + ' '
        s = s[n:]
        
    if len(s) > 0:
        l += s
        
    return l

            
# Modify the data such that the vectorizer pics up each letter as an entity(word)
df['X'] = df['FIRST_NAME'].apply(split_to_ngrams)

# vectorizer = TfidfVectorizer()
vectorizer = CountVectorizer(stop_words = None)
X = vectorizer.fit_transform(df.X)
print(X.shape)
print(vectorizer.transform(df.X[0:1]))

(99993, 551)
  (0, 76)	1
  (0, 243)	1
  (0, 389)	1
