# College Essay Prompt Clustering (with Spacy)

The goal of this project is to group together similar supplemental essay prompts from colleges in the US. Ultimately, these groups should contain prompts similar enough to be responded to with very similar essays. 

This version of the project uses a Natural Language Processing (NLP) package called spaCy. 

## Importing packages and modules


|Package name|Description|
|------------|-----------|
|numpy     |array-related actions, especially useful for 2D arrays       |
|matplotlib|plotting/graphing       |
|random    |set seed during KMeans (allows for reproducible results)  |
|sklearn   |data preprocessing and machine learning algorithms (aka scikit-learn)  |
|json      |raw prompt storage and retrieval  |
|spacy     |NLP tools, like transformers (sentence --> vector)  |

In [4]:
import numpy as np
import matplotlib.pyplot as plt

import random

import sklearn
from sklearn.cluster import KMeans

from sklearn.decomposition import PCA

import json

import spacy

# Load the spaCy pipeline named "en_core_web_trf"; it is used below to turn each prompt into a vector.
nlp = spacy.load("en_core_web_trf")

## Loading in prompts and storing them

**allprompts**: A list of lists that stores all the prompts. In the overall list, each element is a list that represents a school. Each school's list contains a unique number identifying that school and a list of tuples, each holding a prompt and its title. 

In [5]:
# Read the raw prompt data. The context manager closes the file handle as soon
# as parsing finishes (previously the handle was left open for the whole session).
with open("json_data.json", encoding="utf-8") as file:
    allprompts = json.load(file)

  
**numPrompts**: number of prompts from each school  
**prompt_corpus**: prompts (sentences), no longer grouped by school  

In [None]:
# numPrompts[k] is the number of prompt entries stored for school k
# (every entry is counted, whether or not it passes the filter below).
numPrompts = [len(school[1]) for school in allprompts]

# Flatten all schools into one list of prompts, keeping an entry only when its
# prompt field (position 1) is non-empty and its first element is non-empty.
# NOTE(review): entries rejected here shift every later prompt_corpus index,
# while indToDict counts all entries per school -- confirm the data never
# contains empty prompts, or the index -> prompt mapping will drift.
prompt_corpus = [
    entry[1]
    for school in allprompts
    for entry in school[1]
    if len(entry[1]) > 0 and len(entry[1][0]) > 0
]

## Transforming Prompts into Vectors

spaCy takes the `prompt_corpus` and runs its entire NLP pipeline on each prompt. `nlp.pipe()` yields one processed "doc" per prompt. However, I only need the vector form of each prompt, which can be accessed through `doc._.trf_data.tensors[-1]`. 

In [None]:
# transformed vectors extracted from nlp.pipe()
prompt_vectors = []

# the original index (position in prompt_corpus) of every prompt that was kept
prompt_indices = []

# enumerate() supplies the corpus position of each doc, replacing the
# hand-maintained counter; this relies on nlp.pipe yielding docs in input order.
for i, doc in enumerate(nlp.pipe(prompt_corpus)):
    transformed_prompt = doc._.trf_data.tensors[-1]

    # shape is checked since some prompts are too long and are then transformed
    # into multiple vectors, which I have chosen to omit
    if transformed_prompt.shape == (1, 768):
        prompt_vectors.append(transformed_prompt[0])
        prompt_indices.append(i)

# numpy 2D array where each row is a prompt's corresponding vector form
prompt_matrix = np.stack(prompt_vectors, axis=0)

In [None]:
# Show the matrix dimensions: (number of prompts, number of features).
print(prompt_matrix.shape)

# Put each row's prompt_corpus index in front of its vector, so column 0 is the
# index and the remaining columns are the features.
prompt_matrix_indexed = np.concatenate(
    (np.asarray(prompt_indices)[:, np.newaxis], prompt_matrix),
    axis=1,
)

In [None]:
# one extra column compared with prompt_matrix: the index column added in front
prompt_matrix_indexed.shape

In [None]:
# function to convert from the index in the prompt_corpus to the list indices in allprompts
# allows for conversion from vector back to original prompt only using index from first column

def indToDict(index):
    """Map a prompt_corpus index to its (school, prompt) position in allprompts.

    prompt_corpus is built by walking every school's prompt list in order and
    skipping entries whose prompt text is empty. This function repeats that
    walk with the same filter, so the returned position is correct even when
    some entries were skipped (counting every entry, as before, shifted the
    result after the first skipped prompt).

    Parameters:
        index: position of the prompt in prompt_corpus. Integer-valued floats
            (e.g. read from the first column of a float matrix) are accepted.

    Returns:
        (school_index, prompt_index) such that
        allprompts[school_index][1][prompt_index] is the original entry, or
        None when index does not correspond to any kept prompt.
    """
    kept_so_far = 0
    for school_index, school in enumerate(allprompts):
        for prompt_index, prompt in enumerate(school[1]):
            # same filter that decides membership in prompt_corpus
            if not (len(prompt[1]) > 0 and len(prompt[1][0]) > 0):
                continue

            if kept_so_far == index:
                return (school_index, prompt_index)

            kept_so_far += 1

    return None

In [None]:
# Example lookup: where in allprompts does index 943 point?
indToDict(943)

In [None]:
# PCA with 4 components

pca = PCA(n_components=4)

# column 0 of prompt_matrix_indexed is the index column, so only the feature
# columns are passed to PCA
pca_data = pca.fit_transform(prompt_matrix_indexed[:,1:])

# again, adds index column: the real prompt_corpus indices are carried over.
# A fresh 0..n-1 range is only correct when no prompt was omitted while
# building prompt_matrix; otherwise the rows would be labelled with the wrong index.
pca_data = np.hstack((prompt_matrix_indexed[:, :1], pca_data))

In [None]:
# Pairwise scatter grid of the principal components.
# pca_data carries an index column in front, hence the "- 1" here and the
# "+ 1" column offsets below.
num_comps = pca_data.shape[1] - 1

fig, axs = plt.subplots(num_comps, num_comps)
fig.set_size_inches(18.5, 10.5, forward=True)
fig.set_dpi(100)

for row in range(num_comps):
    for col in range(num_comps):
        panel = axs[row][col]
        panel.scatter(pca_data[:, row + 1] + 1, pca_data[:, col + 1] + 1)

        # component names go on the top row and the left-most column only
        if row == 0:
            panel.set_title("PC " + str(col + 1))
        if col == 0:
            panel.set_ylabel("PC " + str(row + 1), fontsize="large")

plt.show()

In [None]:
# PCA with 100 components, fitted on the feature columns only
pca = PCA(n_components=100)

pca_data = pca.fit_transform(prompt_matrix_indexed[:,1:])

# Prepend the real prompt_corpus indices (column 0 of prompt_matrix_indexed).
# Later cells pass this column to indToDict, so a plain 0..n-1 range would
# point at the wrong prompts whenever a prompt was omitted from prompt_matrix.
pca_data = np.hstack((prompt_matrix_indexed[:, :1], pca_data))

In [None]:
# x axis: component numbers 1..n_components_
PC_values = np.arange(pca.n_components_) + 1

# Running total of the explained-variance ratios. np.cumsum does this in one
# pass instead of re-summing a growing slice for every component.
cumulative_var = np.cumsum(pca.explained_variance_ratio_)

# Print the 0-based position of the first component at which the running total
# exceeds 0.85; nothing is printed when the threshold is never reached.
above_threshold = np.flatnonzero(cumulative_var > 0.85)
if above_threshold.size > 0:
    print(above_threshold[0])

plt.plot(PC_values, cumulative_var, 'o-', linewidth=2, color='blue')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained')
plt.show()

In [None]:
# Total share of variance explained by all fitted components together.
print(sum(pca.explained_variance_ratio_))

In [None]:
# k-means for any dim, points is list of vectors in n-dim

def select_init(points, k, labels):
    """Choose the starting centroids for k_means by distance-weighted sampling.

    The first centroid is drawn uniformly from points. Each further centroid
    is drawn with probability proportional to a point's distance from its
    nearest already-chosen centroid, so far-away points are favoured.

    Parameters:
        points: sequence of equal-length vectors.
        k: number of centroids wanted. At least one centroid is always
            returned, even when k is smaller than 1.
        labels: number of leading entries in each vector that are labels
            (e.g. an index column) and are ignored when measuring distance.

    Returns:
        list of numpy arrays, one per centroid (label entries included).
    """
    # A private generator seeded with 333 gives the same picks on every call.
    # The previous `random.seed = (333)` did not seed anything: it replaced the
    # random.seed function with the integer 333.
    rng = random.Random(333)

    centroids = [np.asarray(rng.choice(points))]

    while len(centroids) < k:
        dist_weights = find_distances(points, centroids, labels)
        centroids.append(np.asarray(rng.choices(points, weights=dist_weights)[0]))

    return centroids


def find_distances(points, centroids, labels = 0):
    """Return, for every point, the Euclidean distance to its nearest centroid.

    Parameters:
        points: sequence of equal-length numeric vectors.
        centroids: sequence of vectors of the same length as the points.
        labels: number of leading entries in each vector to ignore
            (label / index columns).

    Returns:
        list of floats in the same order as points. When centroids is empty
        every distance is float("inf").
    """
    # slice the label columns off once instead of once per comparison
    centroid_feats = [np.asarray(c, dtype=float)[labels:] for c in centroids]
    closest_dists = []

    for curr_point in points:
        feats = np.asarray(curr_point, dtype=float)[labels:]

        # min() with an infinite default replaces the fixed 100000000 sentinel,
        # which silently capped any larger distance
        closest_dists.append(
            min(
                (float(np.sqrt(np.sum((c - feats) ** 2))) for c in centroid_feats),
                default=float("inf"),
            )
        )

    return closest_dists


def find_clusters(points, centroids, labels = 0):
    """Group points by their nearest centroid (Euclidean distance).

    Parameters:
        points: sequence of equal-length numeric vectors.
        centroids: non-empty sequence of vectors of the same length.
        labels: number of leading entries in each vector to ignore when
            measuring distance (label / index columns).

    Returns:
        list with one list per centroid; each inner list holds the original
        point objects assigned to that centroid, in input order. A point that
        is equally close to several centroids goes to the first of them.

    Raises:
        ValueError: if centroids is empty while points is not.
    """
    centroid_feats = [np.asarray(c, dtype=float)[labels:] for c in centroids]
    clusters = [[] for _ in centroid_feats]

    for curr_point in points:
        feats = np.asarray(curr_point, dtype=float)[labels:]
        dists = [float(np.sqrt(np.sum((c - feats) ** 2))) for c in centroid_feats]

        # Taking the true minimum avoids the old fixed sentinel (100000000):
        # a point farther than that from every centroid used to be filed under
        # the out-of-range index len(centroids) and raise IndexError.
        nearest = min(range(len(dists)), key=dists.__getitem__)
        clusters[nearest].append(curr_point)

    return clusters


def k_means(points, k, labels = 0):
    """Cluster points into k groups with Lloyd-style k-means iterations.

    Starting centroids come from select_init. Each pass assigns every point to
    its nearest centroid (find_clusters) and moves each centroid to the mean of
    its members. Iteration stops after the first pass in which no centroid
    moved.

    Fixes relative to the earlier version:
      * the "unchanged centroids" tally is now evaluated per pass; it used to
        accumulate across passes and could end the loop before convergence;
      * a centroid that changed only in its last dimension is no longer
        counted as unchanged;
      * a centroid with no members keeps its position instead of causing a
        division by zero;
      * the leftover debug print of `labels` is gone.

    Parameters:
        points: sequence of equal-length numeric vectors.
        k: number of clusters.
        labels: number of leading entries in each vector that are labels
            (e.g. an index column) and are ignored for distance and
            convergence checks.

    Returns:
        list of k lists holding the original point objects of each cluster;
        an empty list when k is smaller than 1.
    """
    if k < 1:
        return []

    centroids = [np.asarray(c, dtype=float) for c in select_init(points, k, labels)]
    clusters = []

    converged = False
    while not converged:
        clusters = find_clusters(points, centroids, labels)
        converged = True

        for i, members in enumerate(clusters):
            if not members:
                # nothing to average; leave this centroid where it is
                continue

            mean = np.mean(np.asarray(members, dtype=float), axis=0)

            # only the feature entries decide whether the centroid moved
            if not np.array_equal(mean[labels:], centroids[i][labels:]):
                centroids[i] = mean
                converged = False

    return clusters

In [None]:
# Keep the index column plus the first two principal components of every row,
# as plain lists so that a cluster id can be appended to each one below.
pca_points = [list(row[0:3]) for row in pca_data]

num_clusters = 30

# labels = 1 makes k_means ignore the leading index column
pca_clusters = k_means(pca_points, num_clusters, labels = 1)

print(len(pca_clusters[0]))

# One row per point: [index, PC 1, PC 2, cluster id]
pca_clusters_graph = np.asarray(
    [
        point + [cluster_id]
        for cluster_id, members in enumerate(pca_clusters)
        for point in members
    ]
)

plt.scatter(
    pca_clusters_graph[:, 1],
    pca_clusters_graph[:, 2],
    c=pca_clusters_graph[:, 3],
)
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.title("PC 1 vs PC 2 with 2D KMeans clusters")

In [None]:
# Print the size of PCA cluster n, then the original entry of each member.
n = 2
print(len(pca_clusters[n]))
for index, *_ in pca_clusters[n]:
    # the first entry of every point is its index column value
    school, position = indToDict(index)

    print(allprompts[int(school)][1][int(position)])

In [None]:
# using sklearn KMeans function
# Fit one model for every cluster count from 1 through 49 and record its
# inertia, which is the data for the elbow plot.
elbow = []

for i in range(1, 50):
    kmeans = KMeans(
        n_clusters=i,
        init="k-means++",
        random_state=333,
    ).fit(prompt_matrix_indexed[:, 1:])
    elbow.append(kmeans.inertia_)

In [None]:
# elbow[0] is the inertia for 1 cluster, elbow[1] for 2 clusters, and so on.
# Plotting against 1..len(elbow) keeps the x axis equal to the cluster count;
# plt.plot(elbow) alone would start the axis at 0 and shift every value by one.
plt.plot(range(1, len(elbow) + 1), elbow)
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.title("KMeans Elbow Plot")

In [None]:
# scikit-learn KMeans with 50 clusters, k-means++ initialisation and a fixed random_state.
kmeans = KMeans(n_clusters = 50, init = "k-means++", random_state = 333)

# Fit on the feature columns only; column 0 of prompt_matrix_indexed is the index column.
kmeans.fit(prompt_matrix_indexed[:,1:])

In [None]:
# one cluster label per row of the fitted matrix
kmeans.labels_.shape

# First two principal components, coloured by the KMeans label of each prompt.
axes = plt.gca()
axes.scatter(pca_data[:, 1], pca_data[:, 2], c=kmeans.labels_)
axes.set_xlabel("PC 1")
axes.set_ylabel("PC 2")
axes.set_title("PC 1 vs PC 2 plot with multidimensional KMeans clusters")

In [None]:
# Print every prompt that scikit-learn KMeans placed in cluster n, then the count.
n = 1

count = 0
for p, label in enumerate(kmeans.labels_):
    if int(label) == n:
        # p is a row number of the fitted matrix. indToDict expects the
        # prompt_corpus index, which prompt_indices stores for each row; the
        # two differ as soon as any prompt was omitted from the matrix.
        r, c = indToDict(prompt_indices[p])

        print(allprompts[int(r)][1][int(c)])
        count += 1

print(count)



In [None]:
# Run the hand-written k_means on the full vectors; labels = 1 skips the
# leading index column. Then list how many prompts each cluster received.
all_clusters = k_means(prompt_matrix_indexed, 50, labels = 1)

for cluster_size in map(len, all_clusters):
    print(cluster_size)

In [None]:
# Print the size of cluster n from all_clusters, followed by the original
# entry of each of its members. The slice yields nothing if n is out of range.
n = 1
for members in all_clusters[n:n + 1]:
    print(len(members))

    for row in members:
        # the first entry of each row is its prompt_corpus index
        school, position = indToDict(row[0])

        print(allprompts[int(school)][1][int(position)])

In [None]:
import hdbscan

# HDBSCAN clusterer configured with min_cluster_size=5 and min_samples=1.
cluster_obj = hdbscan.HDBSCAN(min_cluster_size=5, min_samples = 1)

# Alternative kept for comparison: the library's default settings.
# cluster_obj = hdbscan.HDBSCAN()

# Fit on the prompt vectors without the index column.
cluster_obj.fit(prompt_matrix)

In [None]:
# Number of prompts carrying the label -1 (HDBSCAN's noise label).
# Summing the labels themselves, as before, printed this count negated.
print(sum(1 for x in cluster_obj.labels_ if x == -1))

# Largest cluster label among the first 10 prompts.
# NOTE(review): if the intent was the highest label overall, drop the [0:10] slice.
print(max(cluster_obj.labels_[0:10]))

In [None]:
# sklearn.manifold is not imported at the top of the notebook; import TSNE
# explicitly rather than relying on another import having loaded the submodule.
from sklearn.manifold import TSNE

# Kept so that NumPy's global generator is left in the same state as before.
np.random.seed(333)

# An explicit random_state makes the embedding reproducible by itself, even
# when cells are run out of order.
projection = TSNE(random_state=333).fit_transform(prompt_matrix)
plt.scatter(*projection.T, c=cluster_obj.labels_)

plt.xlabel("tSNE Dim 1")
plt.ylabel("tSNE Dim 2")
plt.title("tSNE plot colored by HDBSCAN clusters")

In [None]:
# Pairwise scatter grid of the first num_comps - 1 principal components.
# Prompts with a negative HDBSCAN label are drawn red, all others blue.
num_comps = 6

fig, axs = plt.subplots(num_comps - 1, num_comps - 1)
fig.set_size_inches(18.5, 10.5, forward=True)
fig.set_dpi(100)

# colours and sizes are the same for every panel, so build them once
point_colors = ["red" if label < 0 else "blue" for label in cluster_obj.labels_]
point_sizes = [1] * len(cluster_obj.labels_)

for row in range(num_comps - 1):
    for col in range(num_comps - 1):
        panel = axs[row][col]
        # "+ 1" on the column skips pca_data's leading index column
        panel.scatter(
            pca_data[:, row + 1] + 1,
            pca_data[:, col + 1] + 1,
            c=point_colors,
            s=point_sizes,
        )
        if row == 0:
            panel.set_title("PC " + str(col + 1))
        if col == 0:
            panel.set_ylabel("PC " + str(row + 1), fontsize="large")

In [None]:
%%html
<style>
/* Float tables to the left; presumably so the Markdown package table is left-aligned (confirm). */
table {float:left}
</style>