In [1]:
import warnings
warnings.filterwarnings(action='ignore')

from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS
from implicit.bpr import BayesianPersonalizedRanking as BPR

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from pathlib import Path

In [4]:
DATA_DIR = Path("data")

In [5]:
TRAIN_DIR = DATA_DIR/"train_job"

In [6]:
test_df = pd.read_csv(DATA_DIR/"test_job.csv")

In [7]:
test_df.head(3)

Unnamed: 0,userID,jobID
0,ebaee1af0c501f22ddfe242fc16dae53,352407221afb776e3143e8a1a0577885
1,9ab05403ac7808cbfba3da26665f7a9c,96b9bff013acedfb1d140579e2fbeb63
2,33349e909eba71677299d2fc97e158b7,58d4d1e7b1e97b258c9ed0b37e02d087


In [8]:
train_df = pd.read_csv(TRAIN_DIR/"train.csv")

In [9]:
number_of_unique_jobID = len(train_df["jobID"].value_counts().index)

In [34]:
number_of_unique_userID = len(train_df["userID"].value_counts().index)

In [35]:
print(f"Number of train_df: {len(train_df)}")
print(f"Unique jobs: {number_of_unique_jobID}")
print(f"Unique users: {number_of_unique_userID}")

Number of train_df: 6000
Unique jobs: 708
Unique users: 196


# Change userID and jobID to idx

In [11]:
train_df.head(1)

Unnamed: 0,userID,jobID,applied
0,fe292163d06253b716e9a0099b42031d,15de21c670ae7c3f6f3f1f37029303c9,0


In [12]:
unique_users = list(train_df["userID"].value_counts().index)

In [13]:
assert len(unique_users) == len(set(unique_users)), print("duplicated keys")

In [14]:
user_to_idx = dict()
idx_to_user = dict()

idx = 0

for unique_user in unique_users:
    if unique_user not in user_to_idx:
        user_to_idx[unique_user] = idx
        idx_to_user[idx] = unique_user
        idx += 1

In [15]:
unique_jobs = list(train_df["jobID"].value_counts().index)

In [16]:
job_to_idx = dict()
idx_to_job = dict()

idx = 0 

for job in unique_jobs:
    if job not in job_to_idx:
        job_to_idx[job] = idx
        idx_to_job[idx] = job
        idx += 1

In [17]:
num_user = len(user_to_idx)
num_job = len(job_to_idx)

# Make CSR_matrix

In [60]:
csr_list = [[0 for _ in range(num_job)] for _ in range(num_user)]

In [21]:
print(f"Shape of csr_list: {len(csr_list)} X {len(csr_list[0])}")

Shape of csr_list: 196 X 708


In [23]:
assert sum(map(sum, csr_list)) == 0, print("all should be 0")

In [58]:
from tqdm import tqdm

## Update csr_list value

In [61]:
for user in tqdm(unique_users):
    for job in unique_jobs:
        user_idx = user_to_idx[user]
        job_idx = job_to_idx[job]
        
        tmp_df = train_df[(train_df["userID"] == user) & (train_df["jobID"] == job)]
        
        if not tmp_df.empty:
            if tmp_df["applied"].values == 0: continue
            csr_list[user_idx][job_idx] = tmp_df["applied"].values[0]

100%|██████████| 196/196 [02:30<00:00,  1.30it/s]


In [69]:
for row in csr_list:
    for col in row:
        if col != 0 and col != 1:
            raise ValueError("should be 0 or 1")

In [72]:
assert sum(map(sum,csr_list)) == train_df["applied"].sum(), valueError("cTypeError has wrong number of applied")

In [73]:
csr_np = np.asarray(csr_list)

In [74]:
print(f"Shape of csr_np: {csr_np.shape[0]} X {csr_np.shape[1]}")

Shape of csr_np: 196 X 708


In [75]:
csr = csr_matrix(csr_np)

# Build model

## args
- factors
- regularization

In [213]:
factors = 1024
regularization = 0.08

In [214]:
als_model = ALS(factors=factors, regularization=regularization)
als_model.fit(csr.T * 15.0)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [215]:
als_model.user_factors.shape

(196, 1024)

In [216]:
item_factors = als_model.item_factors

# Test testcase

In [217]:
test_df = pd.read_csv(DATA_DIR/"test_job.csv")

In [218]:
test_df.head(13)

Unnamed: 0,userID,jobID
0,ebaee1af0c501f22ddfe242fc16dae53,352407221afb776e3143e8a1a0577885
1,9ab05403ac7808cbfba3da26665f7a9c,96b9bff013acedfb1d140579e2fbeb63
2,33349e909eba71677299d2fc97e158b7,58d4d1e7b1e97b258c9ed0b37e02d087
3,ac985a9db5faeb44c94a334430ccc241,ccb0989662211f61edae2e26d58ea92f
4,d41e0e6f6f1e29098d9d152511503ab2,4a213d37242bdcad8e7300e202e7caa4
5,078dd365ebc98784af7f8df76a98f53a,285f89b802bcb2651801455c86d78f2a
6,8ec72a61b2adad097a1c3aa06751e8c4,0f840be9b8db4d3fbd5ba2ce59211f55
7,0cc8f7bf8a8d56980414a6e4bc69cdc6,839ab46820b524afda05122893c2fe8e
8,d083e4e5172a4b14ae887b3eee370664,d7322ed717dedf1eb4e6e52a37ea7bcd
9,a31cedae93e1b076254f28d4a9b71319,c8ba76c279269b1c6bc8a07e38e78fa4


In [223]:
threshold = 0.03

In [224]:
ret = []

for idx, value in test_df.iterrows():
    user, job = value["userID"], value["jobID"]
    user_idx = user_to_idx[user]
    
    if job not in job_to_idx:
        ret.append(0)
        continue

    job_recs = als_model.recommend(user_to_idx[user], csr, N = 700)
    
    for job_rec in job_recs:
        if job_rec[0] == job_idx:
            if job_rec[1] > threshold:
                ret.append(1)
            else:
                ret.append(0)
            break
    else:
        ret.append(0)


In [225]:
with open("output.csv","w", newline="\n") as f:
    f.write("applied\n")
    for r in ret:
        f.write(str(r)+'\n')

In [226]:
!mv output.csv ~/Desktop