In [13]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy 
import warnings
import sys
!{sys.executable} -m pip install termcolor
from termcolor import colored

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial import distance; 
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

warnings.filterwarnings("ignore")


[33mYou are using pip version 9.0.3, however version 21.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [14]:
def optimize(students, employers):
    df = pd.read_csv(students)
    df['Problem Solving'] = df['Rank each skill on the list first to last. [Problem Solving]'].astype(str).str[0]
    df['Creativity'] = df['Rank each skill on the list first to last. [Creativity]'].astype(str).str[0]
    df['Research'] = df['Rank each skill on the list first to last. [Research]'].astype(str).str[0]
    df['Time Management'] = df['Rank each skill on the list first to last. [Time Management]'].astype(str).str[0]
    df['Communication'] = df['Rank each skill on the list first to last. [Communication]'].astype(str).str[0]
    # df['Critical Thinking'] = df[' [Critical Thinking]'].astype(str).str[0]

    newdf = df[['Problem Solving', 'Creativity', 'Research', 'Time Management', 'Communication']]
    newdf['Problem Solving'].replace("n", value="0", inplace=True)
    newdf['Creativity'].replace("n", value="0", inplace=True) 
    newdf['Research'].replace("n", value="0", inplace=True) 
    newdf['Time Management'].replace("n", value="0", inplace=True) 
    newdf['Communication'].replace("n", value="0", inplace=True) 
    #print(newdf)


    scaler = MinMaxScaler()
    new_df = pd.DataFrame(scaler.fit_transform(newdf), columns=newdf.columns[:], index=newdf.index)

    # print(new_df)
    # Setting the amount of clusters to test out
    cluster_cnt = [i for i in range(2, 12, 1)]

    # Establishing empty lists to store the scores for the evaluation metrics
    s_scores = []

    db_scores = []

    # Looping through different iterations for the number of clusters
    for i in cluster_cnt:
        
        # Hierarchical Agglomerative Clustering with different number of clusters
        hac = AgglomerativeClustering(n_clusters=i)
        
        hac.fit(new_df)
        
        cluster_assignments = hac.labels_
        
        ## KMeans Clustering with different number of clusters
        k_means = KMeans(n_clusters=i)
        
        k_means.fit(new_df)
        
        cluster_assignments = k_means.predict(new_df)
        
        # Appending the scores to the empty lists    
        s_scores.append(silhouette_score(new_df, cluster_assignments))
        
        db_scores.append(davies_bouldin_score(new_df, cluster_assignments))
    return [s_scores, db_scores]

In [15]:
def plot_evaluation(y, x=[i for i in range(2, 12, 1)]):
    """
    Plots the scores of a set evaluation metric. Prints out the max and min values of the evaluation scores.
    """
    
    # Creating a DataFrame for returning the max and min scores for each cluster
    df = pd.DataFrame(columns=['Cluster Score'], index=[i for i in range(2, len(y)+2)])
    df['Cluster Score'] = y
    
    # print('Max Value: Cluster #', df[df['Cluster Score']==df['Cluster Score'].max()])
    # print('\nMin Value: Cluster #', df[df['Cluster Score']==df['Cluster Score'].min()])
    # print('\n')
    
    # Plotting out the scores based on cluster count
    # plt.figure(figsize=(16,6))
    # plt.style.use('ggplot')
    # plt.plot(x,y)
    # plt.xlabel('# of Clusters')
    # plt.ylabel('Score')
    # plt.show()
    return df['Cluster Score']==df['Cluster Score'].max()

In [72]:
def our_cluster(csv, num_clusters, etc=None):
    df = pd.read_csv(csv)
    # print(df['Rank each skill on the list first to last. [Problem Solving]'])
    df['Problem Solving'] = df['Rank each skill on the list first to last. [Problem Solving]'].astype(str).str[0]
    df['Creativity'] = df['Rank each skill on the list first to last. [Creativity]'].astype(str).str[0]
    df['Research'] = df['Rank each skill on the list first to last. [Research]'].astype(str).str[0]
    df['Time Management'] = df['Rank each skill on the list first to last. [Time Management]'].astype(str).str[0]
    df['Communication'] = df['Rank each skill on the list first to last. [Communication]'].astype(str).str[0]
    df['Critical Thinking'] = df['Rank each skill on the list first to last. [Critical Thinking ]'].astype(str).str[0]

    newdf = df[['Problem Solving', 'Creativity', 'Research', 'Time Management', 'Communication', 'Critical Thinking']]
    newdf['Problem Solving'].replace("n", value="0", inplace=True)
    newdf['Creativity'].replace("n", value="0", inplace=True) 
    newdf['Research'].replace("n", value="0", inplace=True) 
    newdf['Time Management'].replace("n", value="0", inplace=True) 
    newdf['Communication'].replace("n", value="0", inplace=True) 
    newdf['Critical Thinking'].replace("n", value="0", inplace=True) 
    # print(newdf)


    scaler = MinMaxScaler()
    # print(df)
    new_df = pd.DataFrame(scaler.fit_transform(newdf), columns=newdf.columns[:], index=newdf.index)

    # print(new_df)

    clustering = AgglomerativeClustering(num_clusters)

    # Fitting
    clustering.fit(new_df)

    # Getting cluster assignments
    cluster_assignments = clustering.labels_

    # Unscaling the categories then replacing the scaled values
    if 'Best email to reach you' in df.columns:
        df = df[['Best email to reach you', 'Select your major and minor (check all that apply)']].join(pd.DataFrame(scaler.inverse_transform(newdf), columns=newdf.columns[:], index=newdf.index))
        #df = df[['Best email to reach you']].join(pd.DataFrame(scaler.inverse_transform(newdf), columns=newdf.columns[:], index=newdf.index))
    else:
        df = df[['Company Name', 'Majors and Minors (check all that apply)']].join(pd.DataFrame(scaler.inverse_transform(newdf), columns=newdf.columns[:], index=newdf.index))
        #df = df[['Company Name']].join(pd.DataFrame(scaler.inverse_transform(newdf), columns=newdf.columns[:], index=newdf.index))
    # Assigning the clusters to each profile
    df['Cluster #'] = cluster_assignments

    # Viewing the dating profiles with cluster assignments
    # print(df)
    return df

In [73]:
def match(students, employers, num_clusters):
    from clustering import cluster; import pandas as pd; import numpy as np; import scipy; from scipy.spatial import distance; from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

    clusteredEmployers = our_cluster(employers, num_clusters)
    clusteredStudents = our_cluster(students, num_clusters)
    
    #change majors to lists
    me = 'Majors and Minors (check all that apply)'
    ms = 'Select your major and minor (check all that apply)'
    clusteredEmployers[me] = clusteredEmployers[me].apply(lambda x: x.split(';'))
    clusteredStudents[ms] = clusteredStudents[ms].fillna('NA')
    clusteredStudents[ms] = clusteredStudents[ms].apply(lambda x: x.split(';'))
    
    for index, employer in clusteredEmployers.iterrows():
        #filter by cluster
        cluster = employer['Cluster #']
        filtered_students = clusteredStudents[clusteredStudents['Cluster #'] == cluster]
        
        #filter by major
        employer_majors = employer[me]
        filtered_students_m = pd.DataFrame()
        for s in employer_majors:
            rows = filtered_students[filtered_students[ms].apply(lambda x: s in x)]
            filtered_students_m = filtered_students_m.append(rows)
        filtered_students = filtered_students_m
    
        
        # top_students = []
        best_student = ""
        best_arr = []
        most_similar = -1
        scores = []
        names = []
        for index, student in filtered_students.iterrows():
            arr = employer.values.tolist()
            student_arr = student.values.tolist()
            employer_values = np.array(arr[2:])
            student_values = np.array(student_arr[2:])
            cosine = cosine_similarity(employer_values.reshape(1, -1), student_values.reshape(1, -1))[0][0]
            name = student_arr[0]
            # print(name)
            scores.append(cosine)
            names.append(name)
        # print(names)
        top_students = sorted(zip(scores, names), reverse=True)[:3]
        if top_students != []:
            print(("The best students for " + employer['Company Name'] + " are:"))
            for i in range(0, len(top_students)):
                print(colored((str(i+1)+". " + str(top_students[i][1]) + " with a " + str(round(top_students[i][0] * 100, 1)) + "% similarity."), "green")) 
            print()
        else:
            print(colored(("No optimal students found for " + employer['Company Name'] + " based on given preferences. \n"), "red"))
        # print("Student: ", best_arr)
        # print("Employer: ", employer_values)

In [77]:
# students = path + "Student_Registration.csv"
# employers = path + "CORRECT_Employer_Full_Registration.csv"
students = "Student_Registration.csv"
employers = "CORRECT_Employer_Full_Registration.csv"
s_scores = optimize(students, employers)[0]
db_scores = optimize(students, employers)[1]
s_score_num_clusters = plot_evaluation(s_scores)
for i in range(2, len(s_score_num_clusters)):
    if s_score_num_clusters[i]:
        s_score_num_clusters = i
        break
db_score_num_clusters = plot_evaluation(db_scores)
for i in range(2, len(db_score_num_clusters)):
    if db_score_num_clusters[i]:
        db_score_num_clusters = i
        break
# print("\n \n")

# print("Silhoutte Score Optimal Clusters: ", s_score_num_clusters, "\n")

print(colored(("Silhouette Score Optimization: \n"), "blue"))
match(students, employers, s_score_num_clusters)
print("\n \n")

# print("Davies-Bouldin Score Optimal Clusters: ", db_score_num_clusters, "\n")
print(colored(("Davies-Bouldin Score Optimization: \n"), "blue"))
match(students, employers, db_score_num_clusters)


[34mSilhouette Score Optimization: 
[0m
[31mNo optimal students found for Cat 5 studios based on given preferences. 
[0m
The best students for Orlando World Live are:
[32m1. william.b.zavala@gmail.com with a 84.7% similarity.[0m
[32m2. egorodetskaia@su.suffolk.edu with a 76.6% similarity.[0m
[32m3. joshferrari_98@hotmail.com with a 65.3% similarity.[0m

[31mNo optimal students found for Pivot Business Consulting based on given preferences. 
[0m
The best students for Fit Me In Now are:
[32m1. helenahe0202@outlook.com with a 94.2% similarity.[0m
[32m2. davidseyrich@outlook.com with a 87.8% similarity.[0m
[32m3. jscharbaaiemilien@mail.valenciacollege.edu with a 87.5% similarity.[0m

The best students for Landen Conner Photo are:
[32m1. achoudhury2017@gmail.com with a 90.3% similarity.[0m
[32m2. mariestephtoure@gmail.com with a 90.3% similarity.[0m
[32m3. jennm.graphicdesign@gmail.com with a 85.5% similarity.[0m

The best students for Fit Me In Now are:
[32m1. cmpe