In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def load_text(path, encoding=None):
    """Read the whole text file at ``path`` and return its contents as a string.

    Parameters
    ----------
    path : str or path-like
        Location of the text file to read.
    encoding : str, optional
        Text encoding passed to ``open``. The default ``None`` keeps the
        platform's locale encoding (the behaviour before this parameter
        existed); pass e.g. ``'utf-8'`` to make decoding independent of the
        machine the notebook runs on.

    Returns
    -------
    str
        The full contents of the file.
    """
    with open(path, 'r', encoding=encoding) as file:
        return file.read()

def load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    Thin wrapper around ``pd.read_pickle``.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    return pd.read_pickle(path)

def process_df(df, col, resume):
    """Append ``resume`` as a new last row and flatten ``col`` to plain strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Column that receives ``resume`` in the new row and is then flattened.
    resume : str or iterable of str
        Value stored in ``col`` for the appended row. Any other columns of
        that row are left missing.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index whose last row holds ``resume``
        and whose ``col`` values have each been passed through ``"".join``.
    """
    # DataFrame.append was removed in pandas 2.0; pd.concat with a one-row
    # frame is the supported equivalent of append({col: resume}, ignore_index=True).
    resume_row = pd.DataFrame([{col: resume}])
    df = pd.concat([df, resume_row], ignore_index=True)
    # "".join leaves a str unchanged and concatenates a list of strings with
    # no separator.
    # NOTE(review): if cells are token lists without trailing spaces, adjacent
    # tokens get fused -- confirm against the upstream step.
    df[col] = df[col].apply("".join)
    return df

def tfidf_conversion(df, col):
    """Fit a TF-IDF vectorizer on ``df[col]`` and return the weights densely.

    A ``TfidfVectorizer`` configured with English stop words is fitted on the
    values of ``df[col]``; the resulting matrix is converted with
    ``.toarray()`` and returned.
    """
    sparse_weights = TfidfVectorizer(stop_words='english').fit_transform(df[col])
    # Densify so callers can index rows and use ``@`` with plain arrays.
    return sparse_weights.toarray()

def cosine_similarity(df, col, tfidf_mat):
    """Score every row against the last row and return the rest, best first.

    The last row of ``tfidf_mat`` is treated as the query vector. Each row's
    score is its dot product with that vector.
    NOTE(review): this equals cosine similarity only if the rows are
    L2-normalised -- presumably true for the TF-IDF matrix built in this
    notebook; confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows line up positionally with ``tfidf_mat``. It is not
        modified; scores are written to a copy.
    col : hashable
        Name of the column that receives the scores.
    tfidf_mat : numpy.ndarray
        2-D array with one row per row of ``df``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without its last (query) row, with scores in ``col``,
        sorted by ``col`` in descending order.
    """
    # Take the query by position: df.index[-1] is a label and only coincides
    # with the matrix row number when df has a default 0..n-1 index.
    query_vec = tfidf_mat[-1]
    scored = df.copy()
    scored[col] = tfidf_mat @ query_vec
    # Remove the query row explicitly instead of assuming it sorts to the top,
    # which fails on ties or when the query vector is all zeros.
    scored = scored.iloc[:-1]
    return scored.sort_values(col, ascending=False)

def pickle_data(df, path):
    """Serialize ``df`` to a pickle file at ``path``; returns nothing."""
    pd.to_pickle(df, path)

In [2]:
# Pipeline: rank the rows of the step-1 frame against the resume text and
# persist the ten best matches for the next step.

# Inputs: the raw resume text and the frame pickled at output/step1_df.pk
# (presumably written by an earlier notebook/step -- not visible here).
resume = load_text('data/Liveproject Resume.txt')
df = load_pickle('output/step1_df.pk')
# Append the resume as the last row so it is vectorised together with the rows.
df = process_df(df, 'skills', resume)
tfidf_mat = tfidf_conversion(df, 'skills')
# Scores are written to a 'cosine' column; rows come back sorted best-first.
df_sim = cosine_similarity(df, 'cosine', tfidf_mat)
# Keep the first ten rows of the ranked frame.
df_new = df_sim[:10]
pickle_data(df_new, 'output/step2_df.pk')