# JobRec

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the raw user table, then narrow it to the profile columns used below.
user_job = pd.read_table("../raw_data/job-recommendation/users.tsv")[
    ["UserID", "Major", "DegreeType", "TotalYearsExperience"]
]

In [None]:
# Raw application records (tab-separated); displayed for a quick look.
apps = pd.read_table(
    "../raw_data/job-recommendation/apps.tsv",
)
apps

In [None]:
# Full job table with every column; malformed lines are reported as warnings
# instead of aborting the read.
df_jobs_full = pd.read_table(
    "../raw_data/job-recommendation/jobs.tsv",
    on_bad_lines='warn',
)
df_jobs_full

In [None]:
# Slim job table: only the id and title columns are read from disk.
df_jobs = pd.read_table(
    "../raw_data/job-recommendation/jobs.tsv",
    on_bad_lines='warn',
    usecols=["JobID", "Title"],
)
df_jobs

In [None]:
# Number of rows in users.tsv -- different from the one in apps.tsv
len(user_job)

In [None]:
# Reported unique users: shape of the array of distinct UserID values in apps.
apps["UserID"].unique().shape

# Clean data

In [None]:
from preprocessor import filter_data, preprocess_column

In [None]:
# Run the shared column preprocessing over the three raw tables, in the same
# order as before (apps, users, jobs), and rebind each name to its result.
apps, user_job, df_jobs = (
    preprocess_column(table) for table in (apps, user_job, df_jobs)
)

In [None]:
# Filter the interaction data and split it; filter_data comes from the local
# preprocessor module and returns the three splits plus the filtered user and
# item tables.
(df_train, df_val, df_test, filtered_user, filtered_item) = filter_data(
    "jobs",
    apps,
    user_job,
    df_jobs,
    core_filter=5,
    min_inter=2,
)

In [None]:
# Keep only the user attributes needed for grouping.
# .copy() makes this an independent frame: the following cells add and
# overwrite columns on it, which would otherwise raise
# SettingWithCopyWarning (and is ambiguous about what gets modified).
filtered_user = filtered_user[
    [
        "user_id",
        "DegreeType",
        "Major",
        "TotalYearsExperience",
    ]
].copy()
filtered_user

In [None]:
# Flag every user whose id also appears in the test split.
test_user_ids = df_test["user_id"]
filtered_user["in_test"] = filtered_user["user_id"].isin(test_user_ids)
filtered_user

# User grouping

In [None]:
# Collapse the raw degree types into three coarse education levels.
# Degree types missing from this mapping become NaN after .map().
map_degree = {
    **dict.fromkeys(("Bachelor's", "Master's", "PhD"), "University"),
    "Associate's": "College",
    "High School": "High School",
    "Vocational": "College",
}
filtered_user["map_degree"] = filtered_user["DegreeType"].map(map_degree)

In [None]:
# Cast years of experience to integers so they can be binned below.
filtered_user["TotalYearsExperience"] = filtered_user["TotalYearsExperience"].astype(int)

In [None]:
# Observed range of years of experience.
min_year, max_year = (
    filtered_user["TotalYearsExperience"].min(),
    filtered_user["TotalYearsExperience"].max(),
)

In [None]:
# Labels for the three experience buckets. The raw string keeps the
# backslash of the LaTeX-style "\leq" without relying on Python passing an
# invalid escape sequence through (which emits a warning on recent versions).
experience_years = [r"$\leq$5", ">5--10", ">10"]

# Right-closed bins: (-1, 5], (5, 10], (10, inf).
# The upper edge is open-ended rather than max_year, so the bin edges stay
# strictly increasing even when nobody has more than 10 years of experience
# (pd.cut raises ValueError on non-monotonic bins).
filtered_user["map_experience"] = pd.cut(
    filtered_user["TotalYearsExperience"],
    [-1, 5, 10, np.inf],
    labels=experience_years,
)

In [None]:
# Test-split users only. Copied because later cells add/overwrite the
# "map_major" column on this frame; mutating a boolean-mask slice would
# trigger SettingWithCopyWarning.
filtered_test_user = filtered_user[filtered_user.in_test].copy()

## Major analysis

In [None]:
# Count distinct majors across all filtered users, ignoring letter case.
num_unique_major = filtered_user["Major"].str.lower().nunique()
num_unique_major

In [None]:
# Same case-insensitive count, restricted to the test users.
num_unique_major_test = filtered_test_user["Major"].str.lower().nunique()
num_unique_major_test

In [None]:
# Number of distinct majors that occur only among non-test users.
num_unique_major - num_unique_major_test

In [None]:
# Distinct lower-cased majors overall and among the test users.
all_major = filtered_user["Major"].str.lower().unique()
test_user_major = filtered_test_user["Major"].str.lower().unique()

# Majors that never occur for a test user. Both inputs come from .unique(),
# so the de-duplication step of setdiff1d can be skipped.
unannotated_major = np.setdiff1d(
    all_major,
    test_user_major,
    assume_unique=True,
)

In [None]:
# Total number of filtered users.
len(filtered_user)

In [None]:
# How many non-high-school users have a major that no test user shares.
(
    filtered_user.query("map_degree!='High School'")
    .Major.str.lower()
    .isin(unannotated_major)
    .sum()
)

In [None]:
# Directory that receives the major-frequency CSVs to be annotated by hand.
annotate_path = "to_annotate"

# The export below is kept commented out so that re-running the notebook does
# not overwrite the file that was already exported for annotation.

# pd.DataFrame(filtered_test_user.query("map_degree=='High School'").Major.str.lower().value_counts())\
#                                                                                         .to_csv(f"{annotate_path}/test_user_major_HS.csv")

In [None]:
# pd.DataFrame(filtered_test_user.query("map_degree!='High School'").Major.str.lower().value_counts())\
#                                                                                     .to_csv(f"{annotate_path}/test_user_major_nonHS.csv")

In [None]:
# pd.DataFrame(filtered_user.query("(~in_test) &( map_degree!='High School')").Major.str.lower().value_counts())\
#                                                                 .to_csv(f"{annotate_path}/list_major_nonHS_users_not_in_test.csv")

# After annotation

In [None]:
# Load the manually annotated major -> category table.
annotated_major = pd.read_csv("annotated/annotated_test_user_major_nonHS.csv")
# .copy() so the column assignment below modifies an independent frame
# instead of a slice of the freshly read table.
annotated_major = annotated_major[["Major", "category"]].copy()
# Normalise category labels: " &" becomes ",", then the health category gets
# an escaped ampersand (LaTeX-style "\&").
# regex=False is explicit because both patterns are plain text; in regex mode
# the replacement "\&" is not a valid `re` template escape, and the default
# of `regex` differs between pandas versions.
annotated_major["category"] = (
    annotated_major["category"]
    .str.replace(" &", ",", regex=False)
    .str.replace("Health, Medical", r"Health \& Medical", regex=False)
)

In [None]:
# Parallel columns of the annotation table: raw major and assigned category.
list_major, list_category = annotated_major["Major"], annotated_major["category"]

In [None]:
# Lookup table from annotated major to its category; if a major is listed
# more than once, the last row wins.
map_major = dict(zip(list_major.values, list_category.values))

In [None]:
# Attach the annotated category to every user by lower-cased major.
# Majors absent from map_major are left as NaN.
filtered_user["map_major"] = (
    filtered_user["Major"].str.lower().map(map_major)
)

filtered_test_user["map_major"] = (
    filtered_test_user["Major"].str.lower().map(map_major)
)

In [None]:
# High-school users get the placeholder category "-" regardless of the major
# they entered; apply this to the full table and to the test subset.
for frame in (filtered_user, filtered_test_user):
    frame.loc[frame.DegreeType == "High School", "map_major"] = "-"

In [None]:
# User count before vs. after removing the majors annotated as 'Drop'.
len(filtered_user), len(filtered_user.query("map_major!='Drop'"))

In [None]:
# Remove users whose major was annotated as 'Drop' (invalid/generic majors)
# from the full table and from the test subset alike.
filtered_user, filtered_test_user = (
    frame.query("map_major!='Drop'")
    for frame in (filtered_user, filtered_test_user)
)

In [None]:
# Sanity check: every remaining test user must have a major category.
n_unmapped_test_users = filtered_test_user["map_major"].isna().sum()
assert n_unmapped_test_users == 0

In [None]:
from stats import print_stats

In [None]:
# Frequency of each group value among the test users, one table per grouping
# attribute (degree level, experience bucket, major category).
val_count_degree = filtered_test_user.value_counts("map_degree")
val_count_exp = filtered_test_user.value_counts("map_experience")
val_count_major = filtered_test_user.value_counts("map_major")

In [None]:
# Report on the degree-level counts (print_stats comes from the local stats module).
print_stats(val_count_degree)

In [None]:
# Report on the experience-bucket counts.
print_stats(val_count_exp)

In [None]:
# Report on the major-category counts.
print_stats(val_count_major)

In [None]:
# The three derived grouping columns, in the order expected by `groupings`
# (degree, experience, major). They are named explicitly instead of taking
# the last three columns, which would silently pick the wrong ones if a
# column were added or the cells were run in a different order.
col_names = ["map_degree", "map_experience", "map_major"]

In [None]:
# Display names for the three grouping columns (same order as col_names).
groupings = ["degree", "experience (years)", "major"]

# Test-user counts per (degree, experience, major) group, restricted to the
# high-school rows: degree "High School", every experience bucket in display
# order, and the placeholder major "-".
# The count column is named with a raw string so the backslash of the
# LaTeX-escaped "\#user" is explicit instead of an invalid escape sequence;
# the resulting name is the same one that is looked up as "\\#user" later.
filtered_user_grouping_HS = (
    filtered_test_user
    .rename(columns=dict(zip(col_names, groupings)))
    .groupby(groupings)[["user_id"]]
    .count()
    .rename(columns={"user_id": r"\#user"})
    .reindex(["High School"], level=0)
    .reindex(experience_years, level=1)
    .reindex(["-"], level=2)
)

In [None]:
# Major categories for the non-high-school table, ordered by frequency.
# The high-school placeholder "-" is excluded by value; slicing off the first
# entry would only be correct while "-" happens to be the most frequent
# category.
major_index = val_count_major.index[val_count_major.index != "-"]
major_index

In [None]:
# Test-user counts per (degree, experience, major) group for the College and
# University rows, with experience buckets and major categories in their
# display order.
# The count column is named with a raw string so the backslash of the
# LaTeX-escaped "\#user" is explicit instead of an invalid escape sequence.
filtered_user_grouping_non_HS = (
    filtered_test_user
    .rename(columns=dict(zip(col_names, groupings)))
    .groupby(groupings)[["user_id"]]
    .count()
    .rename(columns={"user_id": r"\#user"})
    .reindex(["College", "University"], level=0)
    .reindex(experience_years, level=1)
    .reindex(major_index, level=2)
)
filtered_user_grouping_non_HS

In [None]:
# Stack the high-school and non-high-school tables, then keep only the
# groups that actually contain users.
filtered_user_grouping = pd.concat(
    [filtered_user_grouping_HS, filtered_user_grouping_non_HS]
)
has_users = filtered_user_grouping["\\#user"] > 0
nonzero_filtered_user_grouping = filtered_user_grouping[has_users]
nonzero_filtered_user_grouping

In [None]:
# Emit the non-empty group table as LaTeX source.
print(nonzero_filtered_user_grouping.to_latex())

In [None]:
# Group-size summary: number of groups, smallest, median and largest group,
# shown as integers.
(
    nonzero_filtered_user_grouping
    .describe()
    .loc[["count", "min", "50%", "max"]]
    .astype(int)
)

# Handle non-test users with no "map_major" annotation
This ends up not being used for LLMRecs, as we only prompt for the test users.

In [None]:
# Users that still have no major category after applying the annotations.
# Copied because a "category" column is added to this frame further down;
# writing to a slice would trigger SettingWithCopyWarning.
filtered_user_no_major = filtered_user[filtered_user.map_major.isna()][["user_id", "Major"]].copy()
filtered_user_no_major

In [None]:
from rapidfuzz import fuzz, utils, process

def fuzzy_match(unannotated_major):
    """Find the annotated major that best matches ``unannotated_major``.

    Candidates are the ``Major`` column of the notebook-level
    ``annotated_major`` table. Strings are normalised with rapidfuzz's
    default processor and compared with the token-set ratio.

    Returns whatever ``process.extractOne`` returns; the notebook unpacks it
    positionally as (matched major, score, key of the match in
    ``annotated_major``).
    NOTE(review): extractOne can presumably return None (e.g. for a missing
    query) -- confirm that the inputs never contain NaN.
    """
    best_match = process.extractOne(
        unannotated_major,
        annotated_major.Major,
        scorer=fuzz.token_set_ratio,
        processor=utils.default_process,
    )
    return best_match

In [None]:
# One row per distinct unannotated major. reset_index() keeps the original
# row label in an "index" column and renumbers the rows from 0.
unique_no_major = (
    filtered_user_no_major[["Major"]]
    .drop_duplicates("Major")
    .reset_index()
)
unique_no_major

In [None]:
# Fuzzy-match every distinct unannotated major, then split each result into
# its three positional parts: matched major, score, and key of the match.
unique_no_major["match_result"] = unique_no_major.Major.apply(fuzzy_match)
unique_no_major["match"] = [res[0] for res in unique_no_major["match_result"]]
unique_no_major["match_score"] = [res[1] for res in unique_no_major["match_result"]]
unique_no_major["match_idx"] = [res[2] for res in unique_no_major["match_result"]]

In [None]:
# The raw match results are no longer needed once split into columns.
del unique_no_major["match_result"]

In [None]:
# Minimum fuzzy-match score for accepting a match; majors scoring below this
# are assigned the category "Others" further down.
thresh = 75

In [None]:
# Distribution of match scores, and how many distinct majors fall below the
# acceptance threshold.
val_count_match_score = unique_no_major.value_counts("match_score")
below_thresh = val_count_match_score.index < thresh
val_count_match_score[below_thresh].sum()

In [None]:
# Inspect borderline matches: scores from the threshold up to 80, lowest first.
(
    unique_no_major
    .query("@thresh<= match_score <= 80")
    .sort_values(by="match_score")
)

In [None]:
# Category per distinct unannotated major: "Others" when the best match is
# below the threshold, otherwise the category of the matched annotated major.
unique_no_major["category"] = [
    "Others" if score < thresh else annotated_major.at[idx, "category"]
    for score, idx in zip(unique_no_major["match_score"], unique_no_major["match_idx"])
]

In [None]:
# Propagate the per-major category to every user without an annotation.
# unique_no_major holds one row per distinct major, so a dict lookup replaces
# the per-user scan over the whole table.
major_to_category = dict(zip(unique_no_major["Major"], unique_no_major["category"]))
filtered_user_no_major["category"] = filtered_user_no_major["Major"].map(major_to_category)

In [None]:
# Fill in the fuzzy-matched category for users whose map_major is still
# missing; users that already have a category are left untouched.
# drop_duplicates keeps the first row per user_id, mirroring the previous
# "first match" lookup. A masked assignment replaces the row-wise apply that
# scanned filtered_user_no_major once per user.
user_to_category = dict(
    zip(
        filtered_user_no_major.drop_duplicates("user_id")["user_id"],
        filtered_user_no_major.drop_duplicates("user_id")["category"],
    )
)
missing_major = filtered_user["map_major"].isna()
filtered_user.loc[missing_major, "map_major"] = (
    filtered_user.loc[missing_major, "user_id"].map(user_to_category)
)

In [None]:
# Fuzzy matching can also land on the 'Drop' category; remove those users too.
is_kept = filtered_user["map_major"] != "Drop"
filtered_user = filtered_user[is_kept]

In [None]:
# Final distribution of major categories over all remaining users.
filtered_user.value_counts("map_major")

# Save and stats

In [None]:
from stats import get_df_stats
from preprocessor import prepare_to_save, general_save

In [None]:
# Hand the splits and the user/item tables to prepare_to_save (local
# preprocessor module) and rebind the splits and item table to its results.
# NOTE(review): filtered_user is passed in but not reassigned -- confirm the
# helper is not expected to return an updated user table.
df_train, df_val, df_test, filtered_item = prepare_to_save(df_train, df_val, df_test, filtered_user, filtered_item)

In [None]:
# Dataset statistics for the final splits and tables (helper from the local stats module).
get_df_stats(df_train, df_val, df_test, filtered_user, filtered_item)

In [None]:
# general save
# data_name is the output location passed to general_save (local preprocessor
# module) together with the splits and the user/item tables.
data_name = "../cleaned_data/jobrec"
general_save(df_train, df_val, df_test, filtered_user, filtered_item, data_name)