In [29]:
import pandas as pd
import re

In [30]:
df = pd.read_csv("data/combined_scraped_data.csv")
df.head()

Unnamed: 0,Position Title,Date,Apply,Work Model,Location,Company,Company Size,Company Industry,Salary,Qualifications,H1b Sponsored,Is New Grad,category
0,Kids Club Associate,2025-11-25,https://jobright.ai/jobs/info/6911b63dbb519377...,Hybrid,"Multi Location\nKissimmee, FL: E Osceola Pkwy ...",EōS Fitness,1001-5000,"Fitness,Health Care",$15-$16 /hr,"1. Previous experience in childcare, daycare, ...",not sure,,Education
1,Board Certified Behavior Analyst (BCBA),2025-11-25,https://jobright.ai/jobs/info/69265a85f0beb072...,On Site,"Long Beach, CA",Butterfly Effects,501-1000,Family,$85000-$95000 /yr,1. Master's degree in Applied Behavior Analysi...,not sure,,Education
2,Clinical Research Coordinator I - Heart Instit...,2025-11-25,https://jobright.ai/jobs/info/69265981f0beb072...,On Site,"147 S Robertson Blvd, Los Angeles, CA, 90048, US",Cedars-Sinai,10000+,"Communities,Health Care",$23.39-$39.76 /hr,1. Independent study coordination including sc...,not sure,,Education
3,"Home Health SLP, Speech Therapist",2025-11-25,https://jobright.ai/jobs/info/6926557f27bf2f41...,On Site,"Foxfield, CO 80016 | 39.589685509 | -104.72748045",BAYADA Home Health Care,10000+,"Health Care,Medical",$55-$65 /yr,1. A current license as a Speech Language Path...,not sure,,Education
4,Direct Service Professional - Batavia (Day Ser...,2025-11-25,https://jobright.ai/jobs/info/69265436f0beb072...,On Site,"Batavia, OH",Ohio Valley Goodwill Industries,501-1000,"Employment,Home Services",$18-$18 /hr,1. Twenty-One years or older\n2. Minimum of Hi...,not sure,,Education


In [31]:
df["text"] = (
    df["Position Title"] + " " +
    df["Qualifications"] + " " +
    df["Company Industry"] + " " + 
    df["Work Model"]
)

In [32]:
df.dtypes

Position Title      object
Date                object
Apply               object
Work Model          object
Location            object
Company             object
Company Size        object
Company Industry    object
Salary              object
Qualifications      object
H1b Sponsored       object
Is New Grad         object
category            object
text                object
dtype: object

In [33]:
def clean_text_columns(df):
    text_cols = df.select_dtypes(include="object").columns
    df[text_cols] = df[text_cols].apply(lambda col: col.fillna("").str.lower().str.strip())
    return df


In [34]:
df = clean_text_columns(df).drop_duplicates()
df = clean_text_columns(df)
df.head()

Unnamed: 0,Position Title,Date,Apply,Work Model,Location,Company,Company Size,Company Industry,Salary,Qualifications,H1b Sponsored,Is New Grad,category,text
0,kids club associate,2025-11-25,https://jobright.ai/jobs/info/6911b63dbb519377...,hybrid,"multi location\nkissimmee, fl: e osceola pkwy ...",eōs fitness,1001-5000,"fitness,health care",$15-$16 /hr,"1. previous experience in childcare, daycare, ...",not sure,,education,kids club associate 1. previous experience in ...
1,board certified behavior analyst (bcba),2025-11-25,https://jobright.ai/jobs/info/69265a85f0beb072...,on site,"long beach, ca",butterfly effects,501-1000,family,$85000-$95000 /yr,1. master's degree in applied behavior analysi...,not sure,,education,board certified behavior analyst (bcba) 1. mas...
2,clinical research coordinator i - heart instit...,2025-11-25,https://jobright.ai/jobs/info/69265981f0beb072...,on site,"147 s robertson blvd, los angeles, ca, 90048, us",cedars-sinai,10000+,"communities,health care",$23.39-$39.76 /hr,1. independent study coordination including sc...,not sure,,education,clinical research coordinator i - heart instit...
3,"home health slp, speech therapist",2025-11-25,https://jobright.ai/jobs/info/6926557f27bf2f41...,on site,"foxfield, co 80016 | 39.589685509 | -104.72748045",bayada home health care,10000+,"health care,medical",$55-$65 /yr,1. a current license as a speech language path...,not sure,,education,"home health slp, speech therapist 1. a current..."
4,direct service professional - batavia (day ser...,2025-11-25,https://jobright.ai/jobs/info/69265436f0beb072...,on site,"batavia, oh",ohio valley goodwill industries,501-1000,"employment,home services",$18-$18 /hr,1. twenty-one years or older\n2. minimum of hi...,not sure,,education,direct service professional - batavia (day ser...


In [35]:
feature_list = df.drop(columns = ["Position Title", "Date", "Apply", "Qualifications", "Company Industry", "H1b Sponsored", "Is New Grad", "Work Model", "Salary"])

feature_list.head()

Unnamed: 0,Location,Company,Company Size,category,text
0,"multi location\nkissimmee, fl: e osceola pkwy ...",eōs fitness,1001-5000,education,kids club associate 1. previous experience in ...
1,"long beach, ca",butterfly effects,501-1000,education,board certified behavior analyst (bcba) 1. mas...
2,"147 s robertson blvd, los angeles, ca, 90048, us",cedars-sinai,10000+,education,clinical research coordinator i - heart instit...
3,"foxfield, co 80016 | 39.589685509 | -104.72748045",bayada home health care,10000+,education,"home health slp, speech therapist 1. a current..."
4,"batavia, oh",ohio valley goodwill industries,501-1000,education,direct service professional - batavia (day ser...


In [36]:
def parse_salary(s):
    """
    Convert salary strings like "$15-$16 /hr" or "$85000-$95000 /yr"
    into average yearly salary.
    
    Rules:
    - Hourly: average(range) * 40 hours/week * 45 working weeks/year
    - Yearly: average(range)
    - Null or malformed → return None
    """
    if pd.isna(s):
        return None  # Handle nulls
    
    s = s.strip().lower()
    if s == "":
        return None

    # Extract numbers (handles commas, $)
    nums = re.findall(r"\$?([\d,]+)", s)
    if not nums:
        return None
    
    # Convert to floats
    nums = [float(n.replace(",", "")) for n in nums]

    # Average of range (or single number)
    avg_val = sum(nums) / len(nums)

    # Hourly case
    if "hr" in s or "/hr" in s:
        return avg_val * 40 * 45  # 40 hours/week × 45 working weeks
    
    # Yearly case
    if "yr" in s or "/yr" in s or "year" in s:
        return avg_val

    return None  # If unclear format


# ---- Apply to your DataFrame ----
df["Yearly_Salary"] = df["Salary"].apply(parse_salary)
feature_list["salary"] = df["Yearly_Salary"]


In [37]:
feature_list.head()

Unnamed: 0,Location,Company,Company Size,category,text,salary
0,"multi location\nkissimmee, fl: e osceola pkwy ...",eōs fitness,1001-5000,education,kids club associate 1. previous experience in ...,27900.0
1,"long beach, ca",butterfly effects,501-1000,education,board certified behavior analyst (bcba) 1. mas...,90000.0
2,"147 s robertson blvd, los angeles, ca, 90048, us",cedars-sinai,10000+,education,clinical research coordinator i - heart instit...,79650.0
3,"foxfield, co 80016 | 39.589685509 | -104.72748045",bayada home health care,10000+,education,"home health slp, speech therapist 1. a current...",60.0
4,"batavia, oh",ohio valley goodwill industries,501-1000,education,direct service professional - batavia (day ser...,32400.0


In [38]:
import joblib

#to tf-idf the text column I need to use the same tf-idf vectorizer from when the model was created
tfidf = joblib.load("tfidf.pkl")


In [40]:
text_tf_idf = tfidf.transform(feature_list["text"])
