In [1]:
#import required packages
import warnings
warnings.filterwarnings("ignore")
import pandas as pd

In [2]:
df = pd.read_csv("../data/external/potential-talents.csv").set_index('id')
df.head()

Unnamed: 0_level_0,job_title,location,connection,fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
4,People Development Coordinator at Ryan,"Denton, Texas",500+,
5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


###### Remove duplicates

In [3]:
df_no_duplicates = df.drop_duplicates()
print('shape of the dataframe without duplicates: ', df_no_duplicates.shape)

shape of the dataframe without duplicates:  (53, 4)


###### Remove the fit feature column as there is no data in it as of now

In [4]:
df_no_duplicates_fitFeature = df_no_duplicates.drop('fit', axis=1)
print('shape of the dataframe without duplicates and fit feature: ',df_no_duplicates_fitFeature.shape)
print('dataframe without duplicates and fit feature: ', df_no_duplicates_fitFeature)

shape of the dataframe without duplicates and fit feature:  (53, 3)
dataframe without duplicates and fit feature:                                               job_title  \
id                                                       
1    2019 C.T. Bauer College of Business Graduate (...   
2    Native English Teacher at EPIK (English Progra...   
3                Aspiring Human Resources Professional   
4               People Development Coordinator at Ryan   
5      Advisory Board Member at Celal Bayar University   
6                  Aspiring Human Resources Specialist   
7    Student at Humber College and Aspiring Human R...   
8                                 HR Senior Specialist   
10   Seeking Human Resources HRIS and Generalist Po...   
11                       Student at Chapman University   
12   SVP, CHRO, Marketing & Communications, CSR Off...   
13   Human Resources Coordinator at InterContinenta...   
27   Aspiring Human Resources Management student se...   
28             

###### Check to see if there are any null values for any of the features

In [5]:
df_no_duplicates_fitFeature.isnull().value_counts()

job_title  location  connection
False      False     False         53
dtype: int64

###### We will copy this dataframe to df_v1 and used that for some processing

In [6]:
df_v1 = df_no_duplicates_fitFeature.copy()
print("shape: ", df_v1.shape)
print("dataframe: ", df_v1)

shape:  (53, 3)
dataframe:                                               job_title  \
id                                                       
1    2019 C.T. Bauer College of Business Graduate (...   
2    Native English Teacher at EPIK (English Progra...   
3                Aspiring Human Resources Professional   
4               People Development Coordinator at Ryan   
5      Advisory Board Member at Celal Bayar University   
6                  Aspiring Human Resources Specialist   
7    Student at Humber College and Aspiring Human R...   
8                                 HR Senior Specialist   
10   Seeking Human Resources HRIS and Generalist Po...   
11                       Student at Chapman University   
12   SVP, CHRO, Marketing & Communications, CSR Off...   
13   Human Resources Coordinator at InterContinenta...   
27   Aspiring Human Resources Management student se...   
28               Seeking Human Resources Opportunities   
66   Experienced Retail Manager and aspiring

###### Preprocessing text so as to remove numbers, hard returns, special characters, extra spaces and stopwords and also lemmatize to get the verb forms of the words

In [7]:
import re
import nltk
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def get_preprocessed_text(text):
    pattern1 = r'[0-9]'
    pattern2 = r'[\n]'
    pattern3 = r'[^\s\w]'
    pattern4 = r'\s+'
    t1 = re.sub(pattern1, '', text)
    t2 = re.sub(pattern2, ' ', t1)
    t3 = re.sub(pattern3, '', t2)
    t4 = re.sub(pattern4, ' ', t3)
    t5 = t4.lower()
    # list of words/tokens by breaking the sentence into tokens
    tokenized_words =  word_tokenize(t5)
    lemmatizer = WordNetLemmatizer()
    # list of words whrein few words are converted into their verb forms
    lemmatized_words = [lemmatizer.lemmatize(tokens, pos='v') for tokens in tokenized_words] 
    # List of non-important words
    stop_words = stopwords.words('english')
    stripped_words = []
    for word in lemmatized_words:
        if word not in stop_words:
            stripped_words.append(word)
    processed_text = ' '.join(stripped_words)
    return processed_text

df_v1['job_title'] = df_v1.job_title.apply(get_preprocessed_text)

###### expanding the abbreviations in the text

In [8]:
df_v1.replace({'job_title' : { 'chro' : 'chief human resources officer', 'svp' : 'senior vice president'
        ,'gphr' : 'global professional in human resources','hris' : 'human resources information system'
        , 'csr' : 'corporate social responsibility', 'sphr' : 'strategic and policy-making certification'
        , 'hr' : 'human resources'}}, regex=True, inplace=True)
print('shape of the df_v1 dataframe: ', df_v1.shape)
print('df_v1 dataframe: ', df_v1)

shape of the df_v1 dataframe:  (53, 3)
df_v1 dataframe:                                               job_title  \
id                                                       
1    ct bauer college business graduate magna cum l...   
2    native english teacher epik english program korea   
3                  aspire human resources professional   
4                  people development coordinator ryan   
5         advisory board member celal bayar university   
6                    aspire human resources specialist   
7    student humber college aspire human resources ...   
8                    human resources senior specialist   
10   seek human resources human resources informati...   
11                          student chapman university   
12   senior vice president chief human resources of...   
13   human resources coordinator intercontinental b...   
27   aspire human resources management student seek...   
28                  seek human resources opportunities   
66   experience

###### Finding the most commmon words

In [9]:
from collections import Counter
words_counts = Counter()
for sentence in df_v1.job_title:
    for word in sentence.split(' '):
        words_counts[word] += 1
print(len(words_counts))
print(words_counts.most_common())

182
[('human', 39), ('resources', 39), ('aspire', 13), ('professional', 10), ('seek', 10), ('manager', 7), ('university', 6), ('student', 6), ('business', 5), ('generalist', 5), ('management', 5), ('specialist', 4), ('position', 4), ('senior', 3), ('information', 3), ('opportunities', 3), ('director', 3), ('major', 3), ('college', 2), ('graduate', 2), ('english', 2), ('coordinator', 2), ('officer', 2), ('internship', 2), ('retail', 2), ('staff', 2), ('north', 2), ('america', 2), ('systems', 2), ('leader', 2), ('ct', 1), ('bauer', 1), ('magna', 1), ('cum', 1), ('laude', 1), ('native', 1), ('teacher', 1), ('epik', 1), ('program', 1), ('korea', 1), ('people', 1), ('development', 1), ('ryan', 1), ('advisory', 1), ('board', 1), ('member', 1), ('celal', 1), ('bayar', 1), ('humber', 1), ('system', 1), ('chapman', 1), ('vice', 1), ('president', 1), ('chief', 1), ('market', 1), ('communications', 1), ('corporate', 1), ('social', 1), ('responsibility', 1), ('engie', 1), ('houston', 1), ('woodlan

#### Bag of words

###### Cosine similarity is the dot product of the array A and B divided by the product of their absolute values

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def rank_candidates_bag_of_words(keyword, df, feature_name):
    corpus = df[feature_name].tolist()
    keyword = get_preprocessed_text(keyword)
    corpus.append(keyword)
    # print(len(corpus))
    # print(corpus)
    vectorizer = CountVectorizer( binary = True)
    vectors = vectorizer.fit_transform(corpus)
    num_of_texts = vectors.toarray().shape[0]
    X, y = vectors.toarray()[: num_of_texts -1, :], vectors.toarray()[num_of_texts -1, :]
    # print(X.shape, (y.reshape(1, -1)).shape)
    cosym = cosine_similarity(X, y.reshape(1, -1))
    df['fit_bow'] = cosym
    df.sort_values('fit_bow', ascending=False, inplace=True)
    corpus.pop()
    return df    
    
resultant_df = rank_candidates_bag_of_words('specialist', df_v1, 'job_title')
resultant_df

Unnamed: 0_level_0,job_title,location,connection,fit_bow
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,aspire human resources specialist,Greater New York City Area,1,0.5
8,human resources senior specialist,San Francisco Bay Area,500+,0.5
68,human resources specialist luxottica,Greater New York City Area,500+,0.5
86,information systems specialist programmer love...,"Gaithersburg, Maryland",4,0.377964
1,ct bauer college business graduate magna cum l...,"Houston, Texas",85,0.0
92,seek employment opportunities within customer ...,"Torrance, California",64,0.0
82,aspire human resources professional energetic ...,"Austin, Texas Area",174,0.0
83,human resources manager endemol shine north am...,"Los Angeles, California",268,0.0
84,human resources professional world leader gi s...,"Highland, California",50,0.0
85,rrp brand portfolio executive jti japan tobacc...,Greater Philadelphia Area,500+,0.0


###### Test the ranking based on bag_of_words

In [11]:
keyword = 'aspire human resources'
bow_df = rank_candidates_bag_of_words(keyword, df_v1, 'job_title')
# number of candidates with similar experience (with cosine similarity greater than 0)
print(f'number of candidates with similar experience: { bow_df[bow_df["fit_bow"] > 0].shape[0] }')
print(f'top 10 candidates with similar experience: {bow_df.head(10)}')

number of candidates with similar experience: 35
top 10 candidates with similar experience:                                             job_title  \
id                                                      
6                   aspire human resources specialist   
3                 aspire human resources professional   
97                aspire human resources professional   
73  aspire human resources manager seek internship...   
74                       human resources professional   
72  business management major aspire human resourc...   
82  aspire human resources professional energetic ...   
7   student humber college aspire human resources ...   
27  aspire human resources management student seek...   
66  experience retail manager aspire human resourc...   

                               location connection   fit_bow  
id                                                            
6            Greater New York City Area          1  0.866025  
3   Raleigh-Durham, North Carolina

#### tf-idf

In [12]:
df_v2 = df_no_duplicates_fitFeature.copy()
df_v2['job_title'] = df_v2.job_title.apply(get_preprocessed_text)
df_v2.replace({'job_title' : { 'chro' : 'chief human resources officer', 'svp' : 'senior vice president'
        ,'gphr' : 'global professional in human resources','hris' : 'human resources information system'
        , 'csr' : 'corporate social responsibility', 'sphr' : 'strategic and policy-making certification'
        , 'hr' : 'human resources'}}, regex=True, inplace=True)
print('shape of the df_v2 dataframe: ', df_v2.shape)
print('df_v2 dataframe: ', df_v2)

shape of the df_v2 dataframe:  (53, 3)
df_v2 dataframe:                                               job_title  \
id                                                       
1    ct bauer college business graduate magna cum l...   
2    native english teacher epik english program korea   
3                  aspire human resources professional   
4                  people development coordinator ryan   
5         advisory board member celal bayar university   
6                    aspire human resources specialist   
7    student humber college aspire human resources ...   
8                    human resources senior specialist   
10   seek human resources human resources informati...   
11                          student chapman university   
12   senior vice president chief human resources of...   
13   human resources coordinator intercontinental b...   
27   aspire human resources management student seek...   
28                  seek human resources opportunities   
66   experience

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
def rank_candidates_tf_idf(keyword, df, feature_name):
    corpus=df[feature_name].tolist()
    keyword=get_preprocessed_text(keyword)
    corpus.append(keyword)
    tf_idf_vectorizer = TfidfVectorizer(lowercase = True, stop_words='english', max_df=100, min_df=2)
    vectors = tf_idf_vectorizer.fit_transform(corpus)
    shape = vectors.toarray().shape
    cosym = cosine_similarity(vectors.toarray()[:shape[0]-1,:], vectors.toarray()[shape[0]-1,:].reshape(1,-1))
    df['fit_tf_idf'] = cosym
    df.sort_values('fit_tf_idf', ascending=False, inplace=True)
    return df
df_tf_idf = rank_candidates_tf_idf('aspire human resources', df_v2, 'job_title')
df_tf_idf

Unnamed: 0_level_0,job_title,location,connection,fit_tf_idf
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,aspire human resources professional,"Raleigh-Durham, North Carolina Area",44,0.757836
97,aspire human resources professional,"Kokomo, Indiana Area",71,0.757836
76,aspire human resources professional passionate...,"New York, New York",212,0.757836
6,aspire human resources specialist,Greater New York City Area,1,0.665663
79,liberal arts major aspire human resources analyst,"Baton Rouge, Louisiana Area",7,0.64186
73,aspire human resources manager seek internship...,"Houston, Texas Area",7,0.601381
100,aspire human resources manager graduate may se...,"Cape Girardeau, Missouri",103,0.543577
82,aspire human resources professional energetic ...,"Austin, Texas Area",174,0.541985
66,experience retail manager aspire human resourc...,"Austin, Texas Area",57,0.480158
7,student humber college aspire human resources ...,Kanada,61,0.456017


In [14]:
keyword = 'specialist'
df_tf_idf = rank_candidates_tf_idf(keyword, df_v2, 'job_title')
# number of candidates with similar experience (with cosine similarity greater than 0)
print(f'number of candidates with similar experience: { df_tf_idf[df_tf_idf["fit_tf_idf"] > 0].shape[0] }')
print(f'top 5 candidates with similar experience: \n {df_tf_idf.head(5)}')

number of candidates with similar experience: 4
top 5 candidates with similar experience: 
                                             job_title  \
id                                                      
68               human resources specialist luxottica   
6                   aspire human resources specialist   
8                   human resources senior specialist   
86  information systems specialist programmer love...   
3                 aspire human resources professional   

                               location connection  fit_tf_idf  
id                                                              
68           Greater New York City Area      500+     0.847543  
6            Greater New York City Area          1    0.718968  
8                San Francisco Bay Area      500+     0.613115  
86               Gaithersburg, Maryland          4    0.516696  
3   Raleigh-Durham, North Carolina Area         44    0.000000  


#### Word2Vec

In [15]:
df_v3 = df_no_duplicates_fitFeature.copy()
df_v3['job_title'] = df_v3.job_title.apply(get_preprocessed_text)
df_v3.replace({'job_title' : { 'chro' : 'chief human resources officer', 'svp' : 'senior vice president'
        ,'gphr' : 'global professional in human resources','hris' : 'human resources information system'
        , 'csr' : 'corporate social responsibility', 'sphr' : 'strategic and policy-making certification'
        , 'hr' : 'human resources'}}, regex=True, inplace=True)
print('shape of the df_v3 dataframe: ', df_v3.shape)
print('df_v3 dataframe: ', df_v3)

shape of the df_v3 dataframe:  (53, 3)
df_v3 dataframe:                                               job_title  \
id                                                       
1    ct bauer college business graduate magna cum l...   
2    native english teacher epik english program korea   
3                  aspire human resources professional   
4                  people development coordinator ryan   
5         advisory board member celal bayar university   
6                    aspire human resources specialist   
7    student humber college aspire human resources ...   
8                    human resources senior specialist   
10   seek human resources human resources informati...   
11                          student chapman university   
12   senior vice president chief human resources of...   
13   human resources coordinator intercontinental b...   
27   aspire human resources management student seek...   
28                  seek human resources opportunities   
66   experience

In [16]:
from gensim.models import Word2Vec

In [17]:
def get_tokenized_sentences(df_or_keyword, feature_name=''):
    if(type(df_or_keyword) == str):
        tokenized = [word_tokenize(word) for word in [get_preprocessed_text(df_or_keyword)]]
    else:
        text = [get_preprocessed_text(sentence) for sentence in df_or_keyword[feature_name].tolist()]
        tokenized =  [word_tokenize(sentence) for sentence in text]
    return tokenized

# t_s = get_tokenized_sentences(df_v3, 'job_title')
# print(t_s[:2])
# t_s2 = get_tokenized_sentences('Aspire Human Resources')
# print(t_s2[0])

In [18]:
# model = Word2Vec(t_s, min_count=1)
# print("The model is: ", model)
# print('The first five words from the model', list(model.wv.index_to_key)[:5])
# print("The word resources (first word from the model) is represented as vector like this: \n", model.wv.vectors[0])
# print('The firsTot five words from the text: ', t_s[:][0][:5])
# print(f'The word graduate (fifth word from the text: {t_s[:][0][4]}) is represented as a vector like this: \n {model.wv[t_s[:][0][4]]}')
# model.wv.most_similar("human")

In [19]:
import numpy as np

# def get_vector_avg_for_keyword(keyword):
#     tokenize_keyword = get_tokenized_sentences(keyword)
#     print(tokenize_keyword)
#     print(len(tokenize_keyword))
#     kw_model = Word2Vec(tokenize_keyword, min_count=1)
#     vector_array = np.array(0)
#     for vector in kw_model.wv.vectors:
#         vector_array = vector_array + vector
#     return (vector_array/len(kw_model.wv.vectors))
    
# vecs = get_vector_avg_for_keyword('aspire human resources')
# print(vecs)

In [20]:
def get_avg_vector(tokenized_sentences):
    model_v = Word2Vec(tokenized_sentences, min_count=1)
    vec_avg_array = []
    n = 0
    while n < len(tokenized_sentences):
        line_vec_array = np.array(0)
        for i in range(len(tokenized_sentences[n])):
            line_vec_array = line_vec_array + model_v.wv[tokenized_sentences[n][i]]
        vec_avg_array.append(line_vec_array/len(tokenized_sentences[n]))
        n = n+1
    return vec_avg_array

keyword = 'aspire Human resourseS'
tokenized_sentence_keyword = get_tokenized_sentences(keyword)
avg_vec_kw = np.array(get_avg_vector(tokenized_sentence_keyword))
tokenized_sentence_feature = get_tokenized_sentences(df_v3, 'job_title')
avg_vec_feature = np.array(get_avg_vector(tokenized_sentence_feature))

cosym3 = cosine_similarity(avg_vec_feature, avg_vec_kw.reshape(1, -1))
# # print(cosym3)
# # print(df_v3.shape)
# # print(len(cosym3))
df_v3['fit_w2v'] = cosym3
df_v3.sort_values('fit_w2v', ascending=False, inplace=True)
np.set_printoptions(suppress=True)

df_v3    

Unnamed: 0_level_0,job_title,location,connection,fit_w2v
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,aspire human resources professional,"Raleigh-Durham, North Carolina Area",44,0.87101
97,aspire human resources professional,"Kokomo, Indiana Area",71,0.87101
6,aspire human resources specialist,Greater New York City Area,1,0.865894
73,aspire human resources manager seek internship...,"Houston, Texas Area",7,0.789793
74,human resources professional,Greater Boston Area,16,0.714817
10,seek human resources human resources informati...,Greater Philadelphia Area,500+,0.714178
100,aspire human resources manager graduate may se...,"Cape Girardeau, Missouri",103,0.705349
66,experience retail manager aspire human resourc...,"Austin, Texas Area",57,0.672847
7,student humber college aspire human resources ...,Kanada,61,0.665674
79,liberal arts major aspire human resources analyst,"Baton Rouge, Louisiana Area",7,0.65314


#### GloVe

In [21]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [22]:
word2vec_output_file = "glove"+'.word2vec'
# glove2word2vec("glove.6B.100d.txt", word2vec_output_file)

In [23]:
df_v4 = df_no_duplicates_fitFeature.copy()
df_v4['job_title'] = df_v4.job_title.apply(get_preprocessed_text)
df_v4.replace({'job_title' : { 'chro' : 'chief human resources officer', 'svp' : 'senior vice president'
        ,'gphr' : 'global professional in human resources','hris' : 'human resources information system'
        , 'csr' : 'corporate social responsibility', 'sphr' : 'strategic and policy-making certification'
        , 'hr' : 'human resources'}}, regex=True, inplace=True)
print('shape of the df_v4 dataframe: ', df_v4.shape)
print('df_v4 dataframe: ', df_v4)

shape of the df_v4 dataframe:  (53, 3)
df_v4 dataframe:                                               job_title  \
id                                                       
1    ct bauer college business graduate magna cum l...   
2    native english teacher epik english program korea   
3                  aspire human resources professional   
4                  people development coordinator ryan   
5         advisory board member celal bayar university   
6                    aspire human resources specialist   
7    student humber college aspire human resources ...   
8                    human resources senior specialist   
10   seek human resources human resources informati...   
11                          student chapman university   
12   senior vice president chief human resources of...   
13   human resources coordinator intercontinental b...   
27   aspire human resources management student seek...   
28                  seek human resources opportunities   
66   experience

In [24]:
keyword = 'aspire human resources'
t_s_kw= get_tokenized_sentences(keyword)
t_s_df= get_tokenized_sentences(df_v4, 'job_title')
t_s_kw

[['aspire', 'human', 'resources']]

In [25]:
def get_avg_vector_glove(tokenized_sentences):
    model_glove = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
#     print(model_glove.similar_by_word('aspire'))
    vec_avg_array = []
    n = 0
    while n < len(tokenized_sentences):
        line_vec_array = np.array(0)
        for i in range(len(tokenized_sentences[n])):
            try:
                line_vec_array = line_vec_array + model_glove.get_vector(tokenized_sentences[n][i])
            except:
                line_vec_array = line_vec_array
        vec_avg_array.append(line_vec_array/len(tokenized_sentences[n]))
        n = n+1
    return vec_avg_array

In [26]:
avg_vec_kw = np.array(get_avg_vector_glove(t_s_kw))
avg_vec_df = np.array(get_avg_vector_glove(t_s_df))

In [27]:
cosym4 = cosine_similarity(avg_vec_df, avg_vec_kw.reshape(1, -1))
df_v4['fit_gloVe'] = cosym3
df_v4.sort_values('fit_gloVe', ascending=False, inplace=True)
np.set_printoptions(suppress=True)

df_v4    

Unnamed: 0_level_0,job_title,location,connection,fit_gloVe
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,aspire human resources professional,"Raleigh-Durham, North Carolina Area",44,0.87101
97,aspire human resources professional,"Kokomo, Indiana Area",71,0.87101
6,aspire human resources specialist,Greater New York City Area,1,0.865894
73,aspire human resources manager seek internship...,"Houston, Texas Area",7,0.789793
74,human resources professional,Greater Boston Area,16,0.714817
10,seek human resources human resources informati...,Greater Philadelphia Area,500+,0.714178
100,aspire human resources manager graduate may se...,"Cape Girardeau, Missouri",103,0.705349
66,experience retail manager aspire human resourc...,"Austin, Texas Area",57,0.672847
7,student humber college aspire human resources ...,Kanada,61,0.665674
79,liberal arts major aspire human resources analyst,"Baton Rouge, Louisiana Area",7,0.65314


#### GloVe Another approach

In [28]:
df_v5 = df_no_duplicates_fitFeature.copy()
df_v5['job_title'] = df_v5.job_title.apply(get_preprocessed_text)
df_v5.replace({'job_title' : { 'chro' : 'chief human resources officer', 'svp' : 'senior vice president'
        ,'gphr' : 'global professional in human resources','hris' : 'human resources information system'
        , 'csr' : 'corporate social responsibility', 'sphr' : 'strategic and policy-making certification'
        , 'hr' : 'human resources'}}, regex=True, inplace=True)
print('shape of the df_v5 dataframe: ', df_v5.shape)
print('df_v5 dataframe: ', df_v5)

shape of the df_v5 dataframe:  (53, 3)
df_v5 dataframe:                                               job_title  \
id                                                       
1    ct bauer college business graduate magna cum l...   
2    native english teacher epik english program korea   
3                  aspire human resources professional   
4                  people development coordinator ryan   
5         advisory board member celal bayar university   
6                    aspire human resources specialist   
7    student humber college aspire human resources ...   
8                    human resources senior specialist   
10   seek human resources human resources informati...   
11                          student chapman university   
12   senior vice president chief human resources of...   
13   human resources coordinator intercontinental b...   
27   aspire human resources management student seek...   
28                  seek human resources opportunities   
66   experience

In [29]:
c_t_s_df = t_s_df
c_t_s_df.append(t_s_kw[0])
c_t_s_df

[['ct',
  'bauer',
  'college',
  'business',
  'graduate',
  'magna',
  'cum',
  'laude',
  'aspire',
  'human',
  'resources',
  'professional'],
 ['native', 'english', 'teacher', 'epik', 'english', 'program', 'korea'],
 ['aspire', 'human', 'resources', 'professional'],
 ['people', 'development', 'coordinator', 'ryan'],
 ['advisory', 'board', 'member', 'celal', 'bayar', 'university'],
 ['aspire', 'human', 'resources', 'specialist'],
 ['student',
  'humber',
  'college',
  'aspire',
  'human',
  'resources',
  'generalist'],
 ['human', 'resources', 'senior', 'specialist'],
 ['seek',
  'human',
  'resources',
  'human',
  'resources',
  'information',
  'system',
  'generalist',
  'position'],
 ['student', 'chapman', 'university'],
 ['senior',
  'vice',
  'president',
  'chief',
  'human',
  'resources',
  'officer',
  'market',
  'communications',
  'corporate',
  'social',
  'responsibility',
  'officer',
  'engie',
  'houston',
  'woodlands',
  'energy',
  'global',
  'professional'

In [30]:
def sentense_vector_glove(corpus):
    glove =  KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
    line_vec=np.array(0)
    l=list()
    for i in corpus:
        a=0
        for j in i:
            a += 1
            try:
                a=a+1
                line_vec=line_vec+glove.get_vector(j)
            except:
                line_vec=line_vec
        l.append(line_vec/a)
    return l

In [31]:
avg_vecs_all = sentense_vector_glove(c_t_s_df)

In [32]:
avg_vecs_all_array = np.array(avg_vecs_all)

In [33]:
avg_vecs_all_array.shape

(54, 100)

In [34]:
avg_vecs_all_array[1]

array([ 0.4466211 ,  0.47182527,  0.06191029, -0.02479523, -0.06759001,
       -0.05950569,  0.24243975, -0.02293098, -0.07452219,  0.6309534 ,
       -0.18751201, -0.36395055,  0.4165965 ,  0.7031492 , -0.26035154,
       -0.3294459 ,  0.76978076,  0.03635737, -0.17786191,  0.41572043,
       -0.37812668,  0.1536633 ,  0.22512643, -0.01103844,  0.12832057,
       -0.09495573, -0.15104477, -0.6472967 , -0.04570744, -0.03060014,
       -0.73989433,  0.89988333, -0.50165254,  0.10967638, -0.0794732 ,
        0.11480696, -0.19023858,  0.2734347 , -0.38280073, -0.03170557,
       -0.6781289 ,  0.05340324, -0.566459  , -0.21168293, -0.02746588,
        0.00073488,  0.3082872 ,  0.06082916,  0.14910899, -0.12362785,
       -0.22174878, -0.20483981,  0.03707885,  0.23487571, -0.01782588,
       -1.8380002 ,  0.00903405, -0.5435758 ,  1.279343  ,  0.15138392,
        0.05821714,  0.2330191 , -0.4883435 , -0.32412672,  0.42199138,
       -0.00539658,  0.30376264,  0.46628782,  0.7061189 ,  0.49

In [35]:
num_of_vectors = avg_vecs_all_array.shape[0]
X, y = avg_vecs_all_array[: num_of_vectors -1, :], avg_vecs_all_array[num_of_vectors -1, :]
cosym5 = cosine_similarity(X, y.reshape(1, -1))
df_v5['fit_gloVe2'] = cosym3
df_v5.sort_values('fit_gloVe2', ascending=False, inplace=True)
np.set_printoptions(suppress=True)

df_v5    

Unnamed: 0_level_0,job_title,location,connection,fit_gloVe2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,aspire human resources professional,"Raleigh-Durham, North Carolina Area",44,0.87101
97,aspire human resources professional,"Kokomo, Indiana Area",71,0.87101
6,aspire human resources specialist,Greater New York City Area,1,0.865894
73,aspire human resources manager seek internship...,"Houston, Texas Area",7,0.789793
74,human resources professional,Greater Boston Area,16,0.714817
10,seek human resources human resources informati...,Greater Philadelphia Area,500+,0.714178
100,aspire human resources manager graduate may se...,"Cape Girardeau, Missouri",103,0.705349
66,experience retail manager aspire human resourc...,"Austin, Texas Area",57,0.672847
7,student humber college aspire human resources ...,Kanada,61,0.665674
79,liberal arts major aspire human resources analyst,"Baton Rouge, Louisiana Area",7,0.65314


#### Fasttext