In [2]:
!pip install kneed

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kneed
  Downloading kneed-0.8.2-py3-none-any.whl (10 kB)
Installing collected packages: kneed
Successfully installed kneed-0.8.2


In [24]:
import pandas as pd
import numpy as np
import pandas as pd
from scipy import spatial
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# libraries for k-means clustering 
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# libraries for pca analysis 
import pickle 
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import plotly 
import plotly.graph_objs as go
from sklearn.decomposition import PCA

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#import word embedding 
!wget http://nlp.stanford.edu/data/glove.42B.300d.zip
!unzip glove.42B.300d.zip

--2023-02-25 21:37:23--  http://nlp.stanford.edu/data/glove.42B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.42B.300d.zip [following]
--2023-02-25 21:37:23--  https://nlp.stanford.edu/data/glove.42B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip [following]
--2023-02-25 21:37:23--  https://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1877800501 (1.7G) [application/zip]


In [5]:
#import csv and dropping rows without mission statements
url = 'https://raw.githubusercontent.com/p-ai-org/p-colleges/main/Brian/Mission%20Statement/Data_2-14-2023.csv'
df1 = pd.read_csv(url)
df1['Mission statement (IC2020mission)'].replace('', np.nan, inplace=True)
df1.dropna(subset=['Mission statement (IC2020mission)'], inplace=True)
len(df1)

427

In [10]:
# sample mission statement text imported from csv file
mission_statement = 'Boston College was founded in 1863 by the Society of Jesus (the Jesuits) to educate Boston’s predominantly Irish, Catholic immigrant community. It opened its doors on September 5, 1864, in a building on Harrison Avenue in Boston’s South End, a small streetcar college for commuting students. When it outgrew the limitations of the space, then-president Rev. Thomas I. Gasson, S.J., bought 31 acres of the former Lawrence Farm in Chestnut Hill, Massachusetts, and broke ground in 1909 on a new campus, today fondly known as the Heights. BC began as an undergraduate liberal arts college, but as its aspirations grew, it added graduate programs and professional schools fulfilling its charter as a university.'

In [6]:
# naming word embedding data 
!head -n 1000 glove.42B.300d.txt > top_1000.txt

In [7]:
# creating a dictionary for the words and embeddings
embeddings = {}
with open('top_1000.txt', 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings[word] = vector
        
words_with_embeddings = set([w for w in embeddings])

In [8]:
# function 1: changing text to a list of words, removing punctuation and stop words

def clean(text):
    # clean description
    text = text.translate(str.maketrans('', '', string.punctuation))
    description = text.lower().strip()
    words = description.split(" ")
    stops = set(stopwords.words('english'))
    
    # filter out stop words
    words = [w for w in words if not w in stops]

    return words

In [12]:
"""
takes the list of words, finds the word embeddings for each word, and 
finds the average word embedding for the list of words. 
"""

def calculate_description_embedding(words):
    words = [w for w in words if (w in words_with_embeddings)]
    
    if len(words) == 0:
        return None
    
    # calculate embedding and return
    return sum([embeddings[w] for w in words])/len(words)

In [13]:
# making a dataframe with embeddings 
df1["description embeddings"] = [calculate_description_embedding(desc) for desc in df1["Mission statement (IC2020mission)"]]
description_embeddings = df1[["Institution Name","description embeddings"]].set_index("Institution Name").dropna().to_dict()['description embeddings']

In [15]:
def recommend(institution):
    '''Finds 10 closest college for a given college by taking the cosine similarity of their description embeddings.'''
    
    def distance(institution, reference):
        return spatial.distance.cosine(description_embeddings[institution], description_embeddings[reference])

    def closest_courses(reference):
        return sorted(description_embeddings.keys(), key=lambda w: distance(w, reference))
    
    return closest_courses(institution)[:10]

In [16]:
# Trying out the recommender
print(recommend('Scripps College'))

['Scripps College',
 'Dickinson State University',
 'Baptist Bible College',
 'World Mission University',
 'Compass College of Film and Media',
 'Randall University',
 'Olivet College',
 "The King's College",
 'Howard University',
 'Clark Atlanta University']

In [None]:
# using k-means clustering on the dataset

def KMeans_Cluster(dataset):
  # this part of the function groups the data
  vectorsList = [i for i in dataset.values()]
  # chose k = 10 because it seems fitting for the colleges, alternatively, we can use elbow or silhouette method to find k, but that approach is usually for unsupervised data (can't see output and tweak yourself). 
  clustering = KMeans(n_clusters=10)
  clustering.fit(vectorsList)

  collegeGroups = clustering.labels_
  
  # this part of the of the function pairs the raw data back to their colleges
  colleges = [i for i in dataset.keys()]
  pair = zip(colleges, collegeGroups)
  result = (list(pair))
  result.sort(key=lambda x: float(x[1]))
  return result

In [18]:
# visualizing with PCA

glove_file = "glove.42B.300d.txt"
glove2word2vec(glove_file, "glove.42B.300d.word2vec.txt") 

model = KeyedVectors.load_word2vec_format("glove.42B.300d.word2vec.txt")


In [19]:
filename = 'glove2word2vec_model.sav'
pickle.dump(model, open(filename, 'wb')) #applying the model and storing the data

In [22]:
# helper function: returns a list of tuples in the form (sim_words[i], word) 

def append_list(sim_words, words):
    
    list_of_words = []
    
    for i in range(len(sim_words)):
        
        sim_words_list = list(sim_words[i])
        sim_words_list.append(words)
        sim_words_tuple = tuple(sim_words_list)
        list_of_words.append(sim_words_tuple)
        
    return list_of_words

In [23]:
# defining the input as our mission statement test
input_words = clean(mission_statement) 
result_word = [] 

# taking out all the words that are not in the vocabulary
for word in input_words:
  try:
    sim_words = model.most_similar(word, topn = 5)
  except:
    input_words.remove(word)

# generating the top 5 similar words for each word in the mission statement 
for word in input_words: 
  sim_words = model.most_similar(word, topn = 5)
  sim_words = append_list(sim_words, word)           
  result_word.extend(sim_words)

In [28]:
similar_word = [word[0] for word in result_word]
similarity = [word[1] for word in result_word] 
similar_word.extend(input_words)
labels = [word[2] for word in result_word]
label_dict = dict([(y,x+1) for x,y in enumerate(set(labels))])
color_map = [label_dict[x] for x in labels]

In [25]:
# plotting function
def display_pca_scatterplot_3D(model, user_input=None, words=None, label=None, color_map=None, topn=5, sample=10):

    if words == None:
        if sample > 0:
            words = np.random.choice(list(model.vocab.keys()), sample)
        else:
            words = [ word for word in model.vocab ]
    
    word_vectors = np.array([model[w] for w in words])
    
    three_dim = PCA(random_state=0).fit_transform(word_vectors)[:,:3]

    data = []
    count = 0
    
    for i in range (len(user_input)):

                trace = go.Scatter3d(
                    x = three_dim[count:count+topn,0], 
                    y = three_dim[count:count+topn,1],  
                    z = three_dim[count:count+topn,2],
                    text = words[count:count+topn],
                    name = user_input[i],
                    textposition = "top center",
                    textfont_size = 20,
                    mode = 'markers+text',
                    marker = {
                        'size': 10,
                        'opacity': 0.8,
                        'color': 2
                    }
       
                )
                            
                data.append(trace)
                count = count+topn

    trace_input = go.Scatter3d(
                    x = three_dim[count:,0], 
                    y = three_dim[count:,1],  
                    z = three_dim[count:,2],
                    text = words[count:],
                    name = 'input words',
                    textposition = "top center",
                    textfont_size = 20,
                    mode = 'markers+text',
                    marker = {
                        'size': 10,
                        'opacity': 1,
                        'color': 'black'
                    }
                    )
            
    data.append(trace_input)
    
# Configure the layout

    layout = go.Layout(
        margin = {'l': 0, 'r': 0, 'b': 0, 't': 0},
        showlegend=True,
        legend=dict(
        x=1,
        y=0.5,
        font=dict(
            family="Courier New",
            size=25,
            color="black"
        )),
        font = dict(
            family = " Courier New ",
            size = 15),
        autosize = False,
        width = 1000,
        height = 1000
        )


    plot_figure = go.Figure(data = data, layout = layout)
    plot_figure.show()

In [29]:
display_pca_scatterplot_3D(model, input_words, similar_word, labels, color_map)