In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy 
import warnings
from termcolor import colored

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial import distance; 
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

warnings.filterwarnings("ignore")

In [7]:
students = pd.read_csv('Student_Registration.csv')
print(students['What social causes matter to  you? Employers and students identify causes that matter to them.(Choose up to 3).  Check out our Get Involved page on Intern Pursuit for more information: https://www.internpursuit.tech/get-involved'], '\n')

0      Entrepreneurship or Social Entrepreneurship;Ch...
1                            Hunger;Homeless;Environment
2                        Human Rights;Hunger;Environment
3                             Human Rights;Art & Culture
4                            Human Rights;Animals;Health
                             ...                        
192                        Art & Culture;Children;Hunger
193                 Human Rights;Children;Mental Illness
194                  Human Rights;Art & Culture;Homeless
195    Human Rights;Entrepreneurship or Social Entrep...
196                  Children;Mental Illness;Environment
Name: What social causes matter to  you? Employers and students identify causes that matter to them.(Choose up to 3).  Check out our Get Involved page on Intern Pursuit for more information: https://www.internpursuit.tech/get-involved, Length: 197, dtype: object 



In [361]:
def match(students, employers):
    students = pd.read_csv(students)
    employers = pd.read_csv(employers)

    employers = employers.rename(columns={'Company Name': 'Name', 'Majors and Minors (check all that apply)':'Majors/Minors', 'Identify only 3':'Social Causes'})
    students = students.rename(columns={'Best email to reach you':'Name', 'Select your major and minor (check all that apply)':'Majors/Minors', 'What social causes matter to  you? Employers and students identify causes that matter to them.(Choose up to 3).  Check out our Get Involved page on Intern Pursuit for more information: https://www.internpursuit.tech/get-involved':'Social Causes'})
    for i in range(len(employers.index)):

        # obtain i-th employer from dataframe
        curr = employers.iloc[[i]]

        # perform filtering on all students based on criteria of i-th employer
        # filtered = round1_filter(students, curr)
        filtered = students
        # create dataframe with filtered students and i-th employer
        appended = filtered.append(curr)

        # find optimal number of clusters for appended dataframe
        s_score, db_score = optimize_skills(appended)
        s_clusters = find_num_clusters(plot_evaluation(s_score))
        db_clusters = find_num_clusters(plot_evaluation(db_score))

        # perform clustering on appended using both of the optimized cluster scores, use appended dataframe because we need apply a bonus weight if student and employer's clusters match
        s_clustered = round2_cluster(appended, s_clusters)
        db_clustered = round2_cluster(appended, db_clusters)

        # get list of top 10-12 candidates as a list of tuples (x, y) where x is the candidate's email address and y is their similarity score
        s_optimal_matchings = match_skills(s_clustered)
        db_optimal_matchings = match_skills(db_clustered)
        # print(s_optimal_matchings)
        # print(db_optimal_matchings)
        # cleanup all dataframes and get new dataframe which includes candidate's email, similarity score, and social causes columns
        s_cleaned_up = cleanup(filtered, s_clustered, s_optimal_matchings)
        db_cleaned_up = cleanup(filtered, db_clustered, db_optimal_matchings)

        # # return list of top 3-5 candidates based on social clustering
        s_final = match_socials(s_cleaned_up, curr)
        db_final = match_socials(db_cleaned_up, curr)

        # # pretty print top candidates for current employer
        pretty_print(s_final, curr)
        # pretty_print(db_final)
    return

In [362]:
def round1_filter(students, employer):
    print('hi')

In [363]:
def optimize_skills(appended):
    df = appended
    df['Problem Solving'] = df['Rank each skill on the list first to last. [Problem Solving]'].astype(str).str[0]
    df['Creativity'] = df['Rank each skill on the list first to last. [Creativity]'].astype(str).str[0]
    df['Research'] = df['Rank each skill on the list first to last. [Research]'].astype(str).str[0]
    df['Time Management'] = df['Rank each skill on the list first to last. [Time Management]'].astype(str).str[0]
    df['Communication'] = df['Rank each skill on the list first to last. [Communication]'].astype(str).str[0]
    # df['Critical Thinking'] = df[' [Critical Thinking]'].astype(str).str[0]

    newdf = df[['Problem Solving', 'Creativity', 'Research', 'Time Management', 'Communication']]
    newdf['Problem Solving'].replace("n", value="0", inplace=True)
    newdf['Creativity'].replace("n", value="0", inplace=True) 
    newdf['Research'].replace("n", value="0", inplace=True) 
    newdf['Time Management'].replace("n", value="0", inplace=True) 
    newdf['Communication'].replace("n", value="0", inplace=True) 
    #print(newdf)


    scaler = MinMaxScaler()
    new_df = pd.DataFrame(scaler.fit_transform(newdf), columns=newdf.columns[:], index=newdf.index)

    # print(new_df)
    # Setting the amount of clusters to test out
    cluster_cnt = [i for i in range(2, 12, 1)]

    # Establishing empty lists to store the scores for the evaluation metrics
    s_scores = []

    db_scores = []

    # Looping through different iterations for the number of clusters
    for i in cluster_cnt:
        
        # Hierarchical Agglomerative Clustering with different number of clusters
        hac = AgglomerativeClustering(n_clusters=i)
        
        hac.fit(new_df)
        
        cluster_assignments = hac.labels_
        
        ## KMeans Clustering with different number of clusters
        k_means = KMeans(n_clusters=i)
        
        k_means.fit(new_df)
        
        cluster_assignments = k_means.predict(new_df)
        
        # Appending the scores to the empty lists    
        s_scores.append(silhouette_score(new_df, cluster_assignments))
        
        db_scores.append(davies_bouldin_score(new_df, cluster_assignments))
    return s_scores, db_scores

In [364]:
def plot_evaluation(scores):
    df = pd.DataFrame(columns=['Cluster Score'], index=[i for i in range(2, len(scores)+2)])
    df['Cluster Score'] = scores
    
    # print('Max Value: Cluster #', df[df['Cluster Score']==df['Cluster Score'].max()])
    # print('\nMin Value: Cluster #', df[df['Cluster Score']==df['Cluster Score'].min()])
    # print('\n')
    
    # Plotting out the scores based on cluster count
    # plt.figure(figsize=(16,6))
    # plt.style.use('ggplot')
    # plt.plot(x,y)
    # plt.xlabel('# of Clusters')
    # plt.ylabel('Score')
    # plt.show()
    return df['Cluster Score']==df['Cluster Score'].max()

In [365]:
def find_num_clusters(scores):
    for i in range(2, len(scores)):
        if scores[i]:
            scores = i
            return i

In [366]:
def round2_cluster(appended, num_clusters):
    # print(num_clusters)
    df = appended
    df['Problem Solving'] = df['Rank each skill on the list first to last. [Problem Solving]'].astype(str).str[0]
    df['Creativity'] = df['Rank each skill on the list first to last. [Creativity]'].astype(str).str[0]
    df['Research'] = df['Rank each skill on the list first to last. [Research]'].astype(str).str[0]
    df['Time Management'] = df['Rank each skill on the list first to last. [Time Management]'].astype(str).str[0]
    df['Communication'] = df['Rank each skill on the list first to last. [Communication]'].astype(str).str[0]
    df['Critical Thinking'] = df['Rank each skill on the list first to last. [Critical Thinking ]'].astype(str).str[0]

    newdf = df[['Problem Solving', 'Creativity', 'Research', 'Time Management', 'Communication', 'Critical Thinking']]
    newdf['Problem Solving'].replace("n", value="0", inplace=True)
    newdf['Creativity'].replace("n", value="0", inplace=True) 
    newdf['Research'].replace("n", value="0", inplace=True) 
    newdf['Time Management'].replace("n", value="0", inplace=True) 
    newdf['Communication'].replace("n", value="0", inplace=True) 
    newdf['Critical Thinking'].replace("n", value="0", inplace=True) 
    # print(newdf)


    scaler = MinMaxScaler()
    # print(df)
    new_df = pd.DataFrame(scaler.fit_transform(newdf), columns=newdf.columns[:], index=newdf.index)

    # print(new_df)

    clustering = AgglomerativeClustering(num_clusters)

    # Fitting
    clustering.fit(new_df)

    # Getting cluster assignments
    cluster_assignments = clustering.labels_


    # print(len(cluster_assignments))
    # print(len(df.index))
    # Unscaling the categories then replacing the scaled values
    df = df[['Name']].join(pd.DataFrame(scaler.inverse_transform(newdf), columns=newdf.columns[:]))
    # df = df['Name']
    # Assigning the clusters to each profile
    df['Cluster #'] = cluster_assignments
    # print(df.keys())
    return df

In [367]:
def match_skills(clustered):
    employer = clustered.iloc[[-1]]
    filtered_students = clustered[0:-1]
    best_student = ""
    best_arr = []
    most_similar = -1
    scores = []
    names = []
    for index, student in filtered_students.iterrows():
        
        arr = employer.values.tolist()
        # print('employer: ', arr)
        student_arr = student.values.tolist()
        # print('student: ', student_arr)
        employer_values = np.array(arr[0][2:])
        # print('employer values: ', employer_values)
        student_values = np.array(student_arr[2:])
        # print('student_values: ', student_values)
        cosine = cosine_similarity(employer_values.reshape(1, -1), student_values.reshape(1, -1))[0][0]
        name = student_arr[0]
        # print(name)
        scores.append(cosine)
        names.append(name)
    for i in range(len(names)):
        if isinstance(names[i], float):
            names[i] = 'Error'
    # print(names)
    # print(scores)
    top_students = sorted(zip(scores, names), reverse=True)[:10]
    return top_students

In [368]:
def cleanup(filtered, clustered, matchings):
    # add column for scores
    scores = {}
    df = pd.DataFrame(columns=["Name", "Scores", "Social Causes"])
    for i in range(len(matchings)):
        person = matchings[i]
        score, name = person[0], person[1]
        social_causes = filtered[filtered['Name'] == name]['Social Causes']
        df.loc[len(df)] = [name, score, social_causes]
    
    return df

In [396]:
def match_socials(dataframe, curr):
    curr = curr[['Name', 'Social Causes']]
    employer_social = set(curr['Social Causes'].values[0].split(';'))
    df = pd.DataFrame(columns=["Name", "Final Score"])
    for index, row in dataframe.iterrows():
        social = row[2].values[0]
        causes = social.split(';')
        overlap = len(list(employer_social.intersection(causes)))/3
        updated_score = (row[1] * 0.75) + (overlap * 0.25)
        df.loc[len(df)] = [row[0], updated_score]
    # print(df)
    return df

In [401]:
def pretty_print(dataframe, curr):
    names = dataframe['Name'].values.tolist()
    scores = dataframe['Final Score'].values.tolist()

    top_students = sorted(zip(scores, names), reverse=True)[:3]
    print(colored(("The top 3 students for " + curr['Name'].values[0] + " after filtering, skills matching, and social cause matching are:"), "blue"))
    for i in range(3):
        print(colored((str(i+1)+". " + str(top_students[i][1]) + " with a " + str(round(top_students[i][0] * 100, 1)) + "% similarity."), "green")) 
    print()

In [402]:
match('Student_Registration.csv', 'CORRECT_Employer_Full_Registration.csv')

[34mThe top 3 students for Cat 5 studios after filtering, skills matching, and social cause matching are:[0m
[32m1. herbertelizabeth27@gmail.com with a 82.3% similarity.[0m
[32m2. bellindton.cayo@gmail.com with a 82.0% similarity.[0m
[32m3. ikeen20@gmail.com with a 81.9% similarity.[0m

[34mThe top 3 students for Orlando World Live after filtering, skills matching, and social cause matching are:[0m
[32m1. herbertelizabeth27@gmail.com with a 90.7% similarity.[0m
[32m2. vanessa.seymour@knights.ucf.edu with a 82.3% similarity.[0m
[32m3. HALEYBLATT@GMAIL.COM with a 82.0% similarity.[0m

[34mThe top 3 students for Pivot Business Consulting after filtering, skills matching, and social cause matching are:[0m
[32m1. sylwiag@live.com with a 83.3% similarity.[0m
[32m2. ikeen20@gmail.com with a 81.9% similarity.[0m
[32m3. aprylbrown17@gmail.com with a 81.4% similarity.[0m

[34mThe top 3 students for Fit Me In Now after filtering, skills matching, and social cause matching