In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy 
import warnings
from termcolor import colored

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial import distance; 
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries imported above, which can mask API changes - consider narrowing it.
warnings.filterwarnings("ignore")

In [20]:
def match(students, employers):
    """Run the matching pipeline for every employer and print its top candidates.

    Parameters
    ----------
    students : path or buffer accepted by ``pd.read_csv``
        Student survey export.
    employers : path or buffer accepted by ``pd.read_csv``
        Employer survey export.

    Returns
    -------
    None
        Output is produced by calling ``pretty_print`` once per employer.

    Notes
    -----
    Each stage is run twice: once with the cluster count chosen by the
    silhouette score (``s_*`` names) and once with the count chosen by the
    Davies-Bouldin score (``db_*`` names). Only the silhouette-based result is
    printed.
    """
    students = pd.read_csv(students)
    employers = pd.read_csv(employers)

    # Give both frames the same 'Name' / 'Majors/Minors' column names.
    employers = employers.rename(columns={'Company Name': 'Name', 'Majors and Minors (check all that apply)': 'Majors/Minors'})
    students = students.rename(columns={'Best email to reach you': 'Name', 'Select your major and minor (check all that apply)': 'Majors/Minors'})

    for i in range(len(employers.index)):

        # obtain i-th employer from dataframe (kept as a one-row dataframe)
        curr = employers.iloc[[i]]

        # perform filtering on all students based on criteria of i-th employer
        filtered = round1_filter(students, curr)

        # create dataframe with filtered students and i-th employer as the LAST row;
        # ignore_index avoids duplicate row labels between students and employer
        appended = pd.concat([filtered, curr], ignore_index=True)

        # find optimal number of clusters for appended dataframe
        s_score, db_score = optimize_skills(appended)
        s_clusters = find_num_clusters(plot_evaluation(s_score))
        # Davies-Bouldin is "lower is better" while plot_evaluation flags the
        # maximum, so negate the scores to select the lowest one.
        db_clusters = find_num_clusters(plot_evaluation([-score for score in db_score]))

        # perform clustering on appended using both of the optimized cluster scores
        # use appended dataframe because we need apply a bonus weight if student and employer's clusters match
        s_clustered = round2_cluster(appended, s_clusters)
        db_clustered = round2_cluster(appended, db_clusters)

        # get list of top 10-12 candidates as a list of tuples (x, y) where x is the candidate's email address and y is their similarity score
        s_optimal_matchings = match_skills(s_clustered)
        db_optimal_matchings = match_skills(db_clustered)

        # cleanup all dataframes and get new dataframe which includes candidate's email, similarity score, and social causes columns
        s_cleaned_up = cleanup(s_clustered, s_optimal_matchings)
        db_cleaned_up = cleanup(db_clustered, db_optimal_matchings)

        # find optimal number of clusters for social causes
        # NOTE(review): only s_cleaned_up is scored; db_cleaned_up is never used - confirm intent.
        s_social_score, db_social_score = optimize_social(s_cleaned_up)
        s_social_clusters = find_num_clusters(plot_evaluation(s_social_score))
        # presumably a Davies-Bouldin score as well (lower is better) - verify against optimize_social
        db_social_clusters = find_num_clusters(plot_evaluation([-score for score in db_social_score]))

        # find optimal clusterings for social causes
        # NOTE(review): this clusters `appended`, not the cleaned-up frames - confirm intent.
        s_social = round3_socialcluster(appended, s_social_clusters)
        db_social = round3_socialcluster(appended, db_social_clusters)

        # return list of top 3-5 candidates based on social clustering
        s_final = match_social(s_social)
        db_final = match_social(db_social)

        # pretty print top candidates for current employer
        pretty_print(s_final)
    return

In [None]:
def round1_filter(students, employer):
    """Filter the student dataframe down to candidates eligible for one employer.

    Parameters
    ----------
    students : pandas.DataFrame
        All student responses.
    employer : pandas.DataFrame
        One-row dataframe holding the employer whose criteria apply.

    Raises
    ------
    NotImplementedError
        Always; the filtering criteria have not been written yet.
    """
    # TODO: implement the round-1 criteria. Failing loudly here is deliberate:
    # an empty body does not parse, and silently returning None would only
    # surface later as a confusing error inside match().
    raise NotImplementedError("round1_filter has not been implemented yet")

In [None]:
def optimize_skills(appended):
    """Score candidate cluster counts for the skill-ranking answers.

    For every cluster count from 2 up to 11 (capped at ``n_rows - 1``), the
    min-max scaled skill ranks are clustered with agglomerative clustering and
    the clustering is evaluated with two metrics.

    Parameters
    ----------
    appended : pandas.DataFrame
        Must contain the five
        ``'Rank each skill on the list first to last. [<skill>]'`` columns.
        The frame is not modified.

    Returns
    -------
    (list of float, list of float)
        ``(silhouette_scores, davies_bouldin_scores)``; position ``k`` in each
        list belongs to ``k + 2`` clusters. Both lists are empty when there are
        fewer than three rows, because neither metric is defined then.

    Notes
    -----
    The scores come from ``AgglomerativeClustering`` only, the same algorithm
    ``round2_cluster`` uses, so results are deterministic.
    NOTE(review): ``round2_cluster`` also clusters on Critical Thinking, which
    is not part of the feature set here - confirm whether it should be.
    """
    skill_names = ['Problem Solving', 'Creativity', 'Research', 'Time Management', 'Communication']
    column_prefix = 'Rank each skill on the list first to last. '

    # Build the features in a new frame so the caller's dataframe gains no
    # columns. Only the first character of each answer is kept as the rank;
    # missing answers stringify to "nan", so a leading "n" is mapped to rank 0.
    ranks = pd.DataFrame(
        {
            skill: appended[column_prefix + '[' + skill + ']']
            .astype(str).str[0].replace('n', '0').astype(float).to_numpy()
            for skill in skill_names
        },
        index=appended.index,
    )

    # silhouette_score and davies_bouldin_score need 2 <= n_clusters <= n_rows - 1.
    max_clusters = min(11, len(ranks) - 1)
    if max_clusters < 2:
        return [], []

    scaled = pd.DataFrame(MinMaxScaler().fit_transform(ranks), columns=ranks.columns, index=ranks.index)

    s_scores = []
    db_scores = []
    for n_clusters in range(2, max_clusters + 1):
        cluster_assignments = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(scaled)
        s_scores.append(silhouette_score(scaled, cluster_assignments))
        db_scores.append(davies_bouldin_score(scaled, cluster_assignments))
    return s_scores, db_scores

In [None]:
def plot_evaluation(scores):
    """Flag which cluster count achieved the highest evaluation score.

    Despite the name, no figure is drawn; the function only builds the mask.

    Parameters
    ----------
    scores : sequence of float
        One score per cluster count, where position 0 corresponds to 2
        clusters, position 1 to 3 clusters, and so on.

    Returns
    -------
    pandas.Series of bool
        Named ``'Cluster Score'`` and indexed by cluster count
        (``2 .. len(scores) + 1``); ``True`` where the score equals the
        maximum. Empty when ``scores`` is empty.
    """
    scores = list(scores)
    cluster_scores = pd.Series(
        scores,
        index=range(2, len(scores) + 2),
        name='Cluster Score',
        dtype=float,
    )
    return cluster_scores == cluster_scores.max()

In [None]:
def find_num_clusters(scores):
    """Return the cluster count flagged as best in a boolean mask.

    Parameters
    ----------
    scores : pandas.Series of bool
        Mask indexed by cluster count, as returned by ``plot_evaluation``.

    Returns
    -------
    int or None
        The index label of the first ``True`` entry, or ``None`` when no entry
        is flagged (for example when the mask is empty).
    """
    # Walk the (label, flag) pairs so every cluster count is checked, including
    # the last ones, regardless of where the index starts.
    for num_clusters, is_best in scores.items():
        if is_best:
            return int(num_clusters)
    return None

In [None]:
def round2_cluster(appended, num_clusters):
    """Cluster the rows of ``appended`` on their skill rankings.

    Parameters
    ----------
    appended : pandas.DataFrame
        Must contain a ``'Name'`` column and the six
        ``'Rank each skill on the list first to last. [<skill>]'`` columns.
        The frame is not modified.
    num_clusters : int
        Number of clusters for ``AgglomerativeClustering``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Name'``, the six skill ranks as unscaled floats, and
        ``'Cluster #'`` holding each row's cluster label. Row order and index
        match ``appended``.
    """
    column_prefix = 'Rank each skill on the list first to last. '
    skill_columns = {
        'Problem Solving': column_prefix + '[Problem Solving]',
        'Creativity': column_prefix + '[Creativity]',
        'Research': column_prefix + '[Research]',
        'Time Management': column_prefix + '[Time Management]',
        'Communication': column_prefix + '[Communication]',
        # This header has a trailing space inside the brackets.
        'Critical Thinking': column_prefix + '[Critical Thinking ]',
    }

    # Only the first character of each answer is kept as the rank; missing
    # answers stringify to "nan", so a leading "n" is mapped to rank 0.
    ranks = pd.DataFrame(
        {
            skill: appended[column].astype(str).str[0].replace('n', '0').astype(float).to_numpy()
            for skill, column in skill_columns.items()
        },
        index=appended.index,
    )

    # Scale to [0, 1] for clustering only; the returned frame keeps raw ranks.
    scaled = MinMaxScaler().fit_transform(ranks)

    clustering = AgglomerativeClustering(n_clusters=num_clusters)
    cluster_assignments = clustering.fit_predict(scaled)

    # Assemble positionally (no index join) so duplicate row labels in
    # `appended` cannot multiply rows.
    result = ranks.copy()
    result.insert(0, 'Name', appended['Name'].to_numpy())
    result['Cluster #'] = cluster_assignments

    return result

In [None]:
def match_skills(clustered, top_n=10):
    """Rank students by cosine similarity of their skill ranks to the employer.

    Parameters
    ----------
    clustered : pandas.DataFrame
        Frame whose LAST row is the employer and whose other rows are
        students. Every column except ``'Name'`` and ``'Cluster #'`` is
        treated as a numeric skill feature.
    top_n : int, default 10
        Maximum number of candidates to return.

    Returns
    -------
    list of (name, similarity)
        Up to ``top_n`` tuples sorted by similarity, highest first. Empty when
        the frame holds no student rows.
    """
    # Need at least one student row in addition to the employer row.
    if len(clustered) < 2:
        return []

    employer = clustered.iloc[[-1]]
    students = clustered.iloc[:-1]

    # Select features by name: the cluster label is an arbitrary id, not a
    # magnitude, so it is kept out of the similarity vector.
    feature_columns = [column for column in clustered.columns if column not in ('Name', 'Cluster #')]
    employer_values = employer[feature_columns].to_numpy(dtype=float)
    student_values = students[feature_columns].to_numpy(dtype=float)

    # (n_students, 1) similarity matrix against the single employer row.
    scores = cosine_similarity(student_values, employer_values).ravel()

    ranked = sorted(
        zip(students['Name'].tolist(), scores.tolist()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:top_n]

In [None]:
def cleanup(clustered, matchings):
    # add column for scores