## Import

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## Data Load

In [3]:
# Directory that holds the input CSV files, relative to this notebook.
DATA_DIR = '../data'

# Application log: one (resume_seq, recruitment_seq) pair per row.
apply_train_data = pd.read_csv(f'{DATA_DIR}/apply_train.csv')
company_data = pd.read_csv(f'{DATA_DIR}/company.csv')
recruitment_data = pd.read_csv(f'{DATA_DIR}/recruitment.csv')

# The base resume table must come from its own file; reading
# resume_certificate.csv here would just duplicate resume_certificate_data.
# NOTE(review): assumes resume.csv sits next to the other files — confirm.
resume_data = pd.read_csv(f'{DATA_DIR}/resume.csv')
resume_certificate_data = pd.read_csv(f'{DATA_DIR}/resume_certificate.csv')
resume_education_data = pd.read_csv(f'{DATA_DIR}/resume_education.csv')
resume_language_data = pd.read_csv(f'{DATA_DIR}/resume_language.csv')

## User-Item Matrix / Similarity / Score

In [4]:
# Inspect the raw (resume_seq, recruitment_seq) application pairs loaded from apply_train.csv.
apply_train_data

Unnamed: 0,resume_seq,recruitment_seq
0,U05833,R03838
1,U06456,R02144
2,U07807,R01877
3,U04842,R02463
4,U08336,R00112
...,...,...
57941,U02270,R03430
57942,U02640,R04987
57943,U08238,R01342
57944,U01296,R06363


In [5]:
# Build the user-item matrix: 1 if the job seeker applied to the posting, 0 otherwise.
# Counts above 1 (repeated applications to the same posting) are capped at 1.
user_item_matrix = (
    apply_train_data
    .groupby(['resume_seq', 'recruitment_seq'])
    .size()
    .unstack(fill_value=0)
    .clip(upper=1)
)

# Pairwise cosine similarity between users (rows of the matrix).
user_similarity = cosine_similarity(user_item_matrix)

# Recommendation score: similarity-weighted sum of all users' applications,
# divided row-wise by each user's total absolute similarity.
similarity_totals = np.abs(user_similarity).sum(axis=1, keepdims=True)
user_predicted_scores = user_similarity.dot(user_item_matrix) / similarity_totals

In [11]:
# Inspect the 0/1 user-item matrix (rows: resume_seq, columns: recruitment_seq).
user_item_matrix

recruitment_seq,R00001,R00002,R00003,R00004,R00005,R00006,R00007,R00008,R00009,R00010,...,R06686,R06687,R06688,R06689,R06690,R06691,R06692,R06693,R06694,R06695
resume_seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U00001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U00002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U00003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U00004,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U00005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U08478,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U08479,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U08480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U08481,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Inspect the predicted score array (one row per user, one column per posting).
user_predicted_scores

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Prediction

In [12]:
# Recommend postings, skipping the ones each user has already applied to.
TOP_K = 5  # number of postings recommended per user

# Loop-invariant lookups, computed once instead of per user:
# the posting ids in column order and a boolean "already applied" mask.
job_ids = user_item_matrix.columns.to_numpy()
already_applied = user_item_matrix.to_numpy() == 1

recommendations = []
for idx, user in enumerate(user_item_matrix.index):
    # Column positions ordered from highest to lowest predicted score.
    sorted_job_indices = user_predicted_scores[idx].argsort()[::-1]

    # Drop postings this user already applied to, then keep the best TOP_K.
    unseen_indices = sorted_job_indices[~already_applied[idx, sorted_job_indices]]
    recommended_jobs = job_ids[unseen_indices[:TOP_K]]

    recommendations.extend([user, job] for job in recommended_jobs)

In [13]:
# Inspect the generated [resume_seq, recruitment_seq] recommendation pairs.
# NOTE(review): this echoes the entire list; consider slicing (e.g. recommendations[:10]) to keep the output small.
recommendations

[['U00001', 'R01528'],
 ['U00001', 'R03811'],
 ['U00001', 'R06276'],
 ['U00001', 'R00165'],
 ['U00001', 'R02888'],
 ['U00002', 'R02412'],
 ['U00002', 'R04074'],
 ['U00002', 'R01081'],
 ['U00002', 'R05574'],
 ['U00002', 'R04070'],
 ['U00003', 'R00588'],
 ['U00003', 'R04808'],
 ['U00003', 'R04650'],
 ['U00003', 'R03470'],
 ['U00003', 'R03605'],
 ['U00004', 'R02224'],
 ['U00004', 'R00609'],
 ['U00004', 'R05358'],
 ['U00004', 'R03015'],
 ['U00004', 'R02072'],
 ['U00005', 'R05792'],
 ['U00005', 'R04069'],
 ['U00005', 'R04108'],
 ['U00005', 'R03782'],
 ['U00005', 'R04744'],
 ['U00006', 'R00359'],
 ['U00006', 'R03094'],
 ['U00006', 'R00072'],
 ['U00006', 'R01501'],
 ['U00006', 'R01692'],
 ['U00007', 'R06620'],
 ['U00007', 'R05360'],
 ['U00007', 'R06131'],
 ['U00007', 'R00012'],
 ['U00007', 'R03801'],
 ['U00008', 'R02105'],
 ['U00008', 'R04409'],
 ['U00008', 'R00559'],
 ['U00008', 'R02238'],
 ['U00008', 'R04771'],
 ['U00009', 'R05856'],
 ['U00009', 'R02072'],
 ['U00009', 'R00200'],
 ['U00009',

## Submission

In [5]:
# Assemble the recommendations into a DataFrame shaped like sample_submission.csv.
submission_columns = ['resume_seq', 'recruitment_seq']
top_recommendations = pd.DataFrame(data=recommendations, columns=submission_columns)

# Write the submission file without the DataFrame index column.
top_recommendations.to_csv(path_or_buf='./baseline_submit.csv', index=False)