In [105]:
# Import libraries
import sys
import os
import collections
from collections import defaultdict
import numpy as np
import pandas as pd
from scipy import stats
import re

In [106]:
# Install required packages.
import os
import torch
# Expose the installed torch version as the TORCH environment variable so the
# pip commands below can reference it in the wheel-index URL.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)
# torch-scatter / torch-sparse: the -f page is selected by the torch version.
# NOTE(review): ${TORCH} relies on environment-variable expansion by the shell
# that runs the command — confirm it expands on the platform in use.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# PyTorch Geometric is installed from the repository head (no version pin).
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

1.13.0+cpu



[notice] A new release of pip available: 22.3.1 -> 23.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.3.1 -> 23.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.3.1 -> 23.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [107]:
# Load the pre-processed applicant records.
student_file = "./data/Processed_data.csv"
student_data = pd.read_csv(student_file)
# Drop unnecessary columns: any column whose name contains "unnamed"
# (case-insensitive), i.e. index columns left over from an earlier CSV export.
is_unnamed = student_data.columns.str.contains('unnamed', case=False)
student_data = student_data.drop(columns=student_data.columns[is_unnamed])
student_data.head()


Unnamed: 0,studentId,greV,greQ,greA,cgpa,univName,major,program,decision
0,7,157.0,147.0,4.0,3.59,New York University (NYU) - Steinhardt,Communication Sciences And Disorders,MS,Accepted
1,17,157.0,151.0,5.5,3.57,Texas A&M University,International Affairs,MS,Accepted
2,46,155.0,167.0,4.0,3.66,"University Of California, Irvine",Biotechnology Management,MS,Accepted
3,64,161.0,157.0,4.0,3.1,Boston University,Psychology,MS,Accepted
4,70,149.0,157.0,3.0,3.68,New York University (NYU) Steinhardt,Speech Language Pathology (Online),MS,Accepted


In [108]:
# Total number of student records.
n_students = len(student_data)
# Distinct university names and how many of them there are.
uni_names = student_data['univName'].unique()
n_uni = len(uni_names)
# Lookup table of universities: a 1-based id next to each distinct name.
uni_data = pd.DataFrame({
    'uniId': range(1, n_uni + 1),
    'univName': uni_names,
})
uni_data.head()


Unnamed: 0,uniId,univName
0,1,New York University (NYU) - Steinhardt
1,2,Texas A&M University
2,3,"University Of California, Irvine"
3,4,Boston University
4,5,New York University (NYU) Steinhardt


In [110]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

# Two-way mapping between university names and their 1-based ids.
uni_dict = dict(zip(range(1, n_uni + 1), uni_names))  # id -> name
uni_temp = dict(zip(uni_names, range(1, n_uni + 1)))  # name -> id
# University id for every student row, in row order. Iterating over the column
# values (instead of looking up label i for i in range(n_students)) does not
# depend on the frame carrying a default 0..n-1 index.
arr = [uni_temp.get(name) for name in student_data.univName]
# Mapping of student id -> GPA.
student_dict = dict(zip(student_data.studentId, student_data.cgpa))
# One row per record: student id, GPA, university id and university name.
overall_data = student_data[['studentId']].copy()
overall_data['GPA'] = student_data.cgpa
overall_data['uniId'] = arr
overall_data['uniName'] = student_data.univName
# Write the table to a CSV file. The DataFrame index is written as well, as the
# first (unnamed) column of the file.
overall_data.to_csv('./data/overall_data.csv')


In [111]:
#Node embedding
class MatrixFactorization(torch.nn.Module):
    """Dot-product matrix factorization over student and university embeddings.

    Every student and every university owns an ``n_factors``-dimensional
    embedding vector; the score of a (student, university) pair is the dot
    product of the two vectors.
    """

    def __init__(self, n_students, n_uni, n_factors = 20):
        """Create both embedding tables.

        Args:
            n_students: number of rows in the student embedding table.
            n_uni: number of rows in the university embedding table.
            n_factors: length of each embedding vector.
        """
        super().__init__()
        # create student embeddings
        self.student_factors = torch.nn.Embedding(n_students, n_factors)
        # create university embeddings
        self.uni_factors = torch.nn.Embedding(n_uni, n_factors)
        # Start from small positive weights drawn uniformly from [0, 0.05).
        torch.nn.init.uniform_(self.student_factors.weight, 0, 0.05)
        torch.nn.init.uniform_(self.uni_factors.weight, 0, 0.05)

    def forward(self, data):
        """Score each row of ``data``.

        Column 2 of ``data`` is used as the student index and column 3 as the
        university index.
        NOTE(review): these column positions are hard-coded; confirm they match
        the layout of the tensors actually fed to the model.

        Returns:
            One score per row: the element-wise product of the two embeddings
            summed over dimension 1.
        """
        students, universities = data[:, 2], data[:, 3]
        return (self.student_factors(students) * self.uni_factors(universities)).sum(1)

    def predict(self, student, uni):
        """Score explicit (student, university) index pairs.

        The original implementation forwarded both arguments to ``forward``,
        which only accepts a single ``data`` tensor, so every call raised
        ``TypeError``. The score is now computed directly from the indices.

        Args:
            student: integer tensor of student indices (scalar or 1-D).
            uni: integer tensor of university indices, same shape as ``student``.

        Returns:
            Tensor of dot-product scores with the same shape as the inputs.
        """
        # Sum over the last (factor) dimension so scalar indices work too.
        return (self.student_factors(student) * self.uni_factors(uni)).sum(-1)

In [112]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

In [113]:
# Dataset wrapper that exposes the student/university table to a DataLoader
class Loader(Dataset):
    """Map-style dataset yielding ``(university index, GPA)`` pairs.

    Student and university ids are re-encoded as contiguous 0-based indices
    in order of first appearance; the lookup tables in both directions are
    kept on the instance (``stuId2idx``/``idx2stuId``, ``uniId2idx``/``idx2uniId``).
    """

    def __init__(self, data=None):
        """Build the index mappings and the sample tensors.

        Args:
            data: DataFrame with ``studentId``, ``uniId`` and ``GPA`` columns.
                When omitted, the notebook-level ``overall_data`` frame is
                used, which is what the original no-argument constructor did.
        """
        source = overall_data if data is None else data
        # Work on a copy so the source frame keeps its original ids.
        self.data = source.copy()

        students = source.studentId.unique()
        unis = source.uniId.unique()

        # Turns student/university ids into indices
        self.stuId2idx = {o: i for i, o in enumerate(students)}
        self.uniId2idx = {o: i for i, o in enumerate(unis)}

        # Turns indices back into student/university ids
        self.idx2stuId = {i: o for o, i in self.stuId2idx.items()}
        self.idx2uniId = {i: o for o, i in self.uniId2idx.items()}

        self.data['uniId'] = source['uniId'].apply(lambda x: self.uniId2idx[x])
        self.data['studentId'] = source['studentId'].apply(lambda x: self.stuId2idx[x])

        # Samples: university index as the input, GPA as the target.
        self.x = torch.tensor(self.data['uniId'].values)
        self.y = torch.tensor(self.data['GPA'].values)

    def __getitem__(self, index):
        """Return the ``(university index, GPA)`` pair at ``index``.

        ``Dataset`` subclasses must provide ``__getitem__``; without it,
        indexing the dataset (and therefore iterating a ``DataLoader`` built
        on it) fails.
        """
        return (self.x[index], self.y[index])

    # Original accessor name, kept so existing callers continue to work.
    __getuni__ = __getitem__

    def __len__(self):
        """Number of rows in the underlying table."""
        return len(self.data)

In [114]:
# Build the embedding model and the batched, shuffled data pipeline.
# (Only construction happens in this cell; no optimisation step is run here.)
model = MatrixFactorization(n_students=n_students, n_uni=n_uni, n_factors=8)
train_set = Loader()
train_loader = DataLoader(dataset=train_set, batch_size=128, shuffle=True)

In [115]:
# Show the current values of every trainable parameter of the model.
trainable = (
    (name, param)
    for name, param in model.named_parameters()
    if param.requires_grad
)
for name, param in trainable:
    print(name, param.data)

student_factors.weight tensor([[0.0117, 0.0122, 0.0080,  ..., 0.0367, 0.0130, 0.0111],
        [0.0239, 0.0117, 0.0187,  ..., 0.0473, 0.0062, 0.0253],
        [0.0429, 0.0254, 0.0071,  ..., 0.0270, 0.0067, 0.0137],
        ...,
        [0.0006, 0.0343, 0.0385,  ..., 0.0226, 0.0313, 0.0045],
        [0.0105, 0.0192, 0.0431,  ..., 0.0183, 0.0335, 0.0006],
        [0.0296, 0.0470, 0.0357,  ..., 0.0435, 0.0201, 0.0069]])
uni_factors.weight tensor([[0.0227, 0.0444, 0.0012,  ..., 0.0186, 0.0222, 0.0373],
        [0.0416, 0.0393, 0.0309,  ..., 0.0144, 0.0287, 0.0285],
        [0.0061, 0.0032, 0.0247,  ..., 0.0482, 0.0034, 0.0125],
        ...,
        [0.0141, 0.0124, 0.0460,  ..., 0.0423, 0.0006, 0.0330],
        [0.0316, 0.0374, 0.0259,  ..., 0.0425, 0.0300, 0.0175],
        [0.0324, 0.0192, 0.0277,  ..., 0.0159, 0.0051, 0.0280]])


In [116]:
# Recommend based on weights using k-means, where each cluster groups
# universities whose embedding vectors are similar.
from sklearn.cluster import KMeans
# Fit the clusters on the university embedding weights.
# NOTE(review): no optimisation step is visible between constructing `model`
# and this cell in the visible part of the notebook; if none runs, these are
# the initial random weights — confirm.
trained_uni_embeddings = model.uni_factors.weight.data.cpu().numpy()
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_uni_embeddings)
# Number of records per university id, computed once for all universities.
# It counts the non-null entries of the frame's first column per uniId, which
# is what the former per-university `.count()[0]` returned; that form relied
# on positional integer indexing of a label-indexed Series.
records_per_uni = overall_data.groupby('uniId')[overall_data.columns[0]].count()
# NOTE(review): only the first 5 of the 10 fitted clusters are printed —
# confirm this is intentional.
for cluster in range(5):
    print("Cluster #{}".format(cluster))
    unis = []
    for uniIdx in np.where(kmeans.labels_ == cluster)[0]:
        uniId = train_set.idx2uniId[uniIdx]
        gpa_count = records_per_uni.get(uniId, 0)
        unis.append((uni_dict[uniId], gpa_count))
    # Show at most the 10 universities with the most records in this cluster.
    for uni in sorted(unis, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", uni[0])

Cluster #0
	 University Of Florida
	 Brown University
	 Ohio State University
	 USC
	 ีUniversity Of Maryland - College Park (UMD)
	 University Of Texas At Austin (UT Austin)
	 Columbia  University (Fu Foundation)
	 University Of Washington, Seattle
	 ีUniversity Of Southern California
	 University Of Illinois
Cluster #1
	 Stanford University
	 Boston University
	 Cornell University
	 Washington University in St. Louis (WashU/WUSTL)
	 Rush University
	 (Arizona State University) ASU
	 ีืUniversity Of Central Florida
	 University Of South Carolina
	 Wayne State University
	 Carnagie Mellon University (CMU)
Cluster #2
	 University Of Washington
	 Arizona State University
	 New York University (NYU)
	 Tufts University
	 Vanderbilt University
	 Radford University
	 University Of Toronto
	 ีUniversity Of Southern California (USC) - Viterbi School Of Engineering
	 Georgia State University
	 University Of Georgia
Cluster #3
	 Duke University
	 Northeastern University
	 University Of Californi