In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv("../data/combined_scraped_data.csv")
df.head()

Unnamed: 0,Position Title,Date,Apply,Work Model,Location,Company,Company Size,Company Industry,Salary,Qualifications,H1b Sponsored,Is New Grad,category
0,Kids Club Associate,2025-11-25,https://jobright.ai/jobs/info/6911b63dbb519377...,Hybrid,"Multi Location\nKissimmee, FL: E Osceola Pkwy ...",EōS Fitness,1001-5000,"Fitness,Health Care",$15-$16 /hr,"1. Previous experience in childcare, daycare, ...",not sure,,Education
1,Board Certified Behavior Analyst (BCBA),2025-11-25,https://jobright.ai/jobs/info/69265a85f0beb072...,On Site,"Long Beach, CA",Butterfly Effects,501-1000,Family,$85000-$95000 /yr,1. Master's degree in Applied Behavior Analysi...,not sure,,Education
2,Clinical Research Coordinator I - Heart Instit...,2025-11-25,https://jobright.ai/jobs/info/69265981f0beb072...,On Site,"147 S Robertson Blvd, Los Angeles, CA, 90048, US",Cedars-Sinai,10000+,"Communities,Health Care",$23.39-$39.76 /hr,1. Independent study coordination including sc...,not sure,,Education
3,"Home Health SLP, Speech Therapist",2025-11-25,https://jobright.ai/jobs/info/6926557f27bf2f41...,On Site,"Foxfield, CO 80016 | 39.589685509 | -104.72748045",BAYADA Home Health Care,10000+,"Health Care,Medical",$55-$65 /yr,1. A current license as a Speech Language Path...,not sure,,Education
4,Direct Service Professional - Batavia (Day Ser...,2025-11-25,https://jobright.ai/jobs/info/69265436f0beb072...,On Site,"Batavia, OH",Ohio Valley Goodwill Industries,501-1000,"Employment,Home Services",$18-$18 /hr,1. Twenty-One years or older\n2. Minimum of Hi...,not sure,,Education


In [3]:
df["text"] = (
    df["Position Title"] + " " +
    df["Qualifications"] + " " +
    df["Company Industry"] + " " + 
    df["Work Model"] + " " + 
    df["Company"]
)

In [4]:
df.dtypes

Position Title      object
Date                object
Apply               object
Work Model          object
Location            object
Company             object
Company Size        object
Company Industry    object
Salary              object
Qualifications      object
H1b Sponsored       object
Is New Grad         object
category            object
text                object
dtype: object

In [5]:
def clean_text_columns(df):
    text_cols = df.select_dtypes(include="object").columns
    df[text_cols] = df[text_cols].apply(lambda col: col.fillna("").str.lower().str.strip())
    return df


In [6]:
df = clean_text_columns(df).drop_duplicates()
df = clean_text_columns(df)
df.head()

Unnamed: 0,Position Title,Date,Apply,Work Model,Location,Company,Company Size,Company Industry,Salary,Qualifications,H1b Sponsored,Is New Grad,category,text
0,kids club associate,2025-11-25,https://jobright.ai/jobs/info/6911b63dbb519377...,hybrid,"multi location\nkissimmee, fl: e osceola pkwy ...",eōs fitness,1001-5000,"fitness,health care",$15-$16 /hr,"1. previous experience in childcare, daycare, ...",not sure,,education,kids club associate 1. previous experience in ...
1,board certified behavior analyst (bcba),2025-11-25,https://jobright.ai/jobs/info/69265a85f0beb072...,on site,"long beach, ca",butterfly effects,501-1000,family,$85000-$95000 /yr,1. master's degree in applied behavior analysi...,not sure,,education,board certified behavior analyst (bcba) 1. mas...
2,clinical research coordinator i - heart instit...,2025-11-25,https://jobright.ai/jobs/info/69265981f0beb072...,on site,"147 s robertson blvd, los angeles, ca, 90048, us",cedars-sinai,10000+,"communities,health care",$23.39-$39.76 /hr,1. independent study coordination including sc...,not sure,,education,clinical research coordinator i - heart instit...
3,"home health slp, speech therapist",2025-11-25,https://jobright.ai/jobs/info/6926557f27bf2f41...,on site,"foxfield, co 80016 | 39.589685509 | -104.72748045",bayada home health care,10000+,"health care,medical",$55-$65 /yr,1. a current license as a speech language path...,not sure,,education,"home health slp, speech therapist 1. a current..."
4,direct service professional - batavia (day ser...,2025-11-25,https://jobright.ai/jobs/info/69265436f0beb072...,on site,"batavia, oh",ohio valley goodwill industries,501-1000,"employment,home services",$18-$18 /hr,1. twenty-one years or older\n2. minimum of hi...,not sure,,education,direct service professional - batavia (day ser...


In [7]:
feature_list = df.drop(columns = ["Position Title", "Date", "Apply", "Qualifications", "Company Industry", "H1b Sponsored", "Is New Grad", "Work Model", "Salary", "Company"])

feature_list.head()

Unnamed: 0,Location,Company Size,category,text
0,"multi location\nkissimmee, fl: e osceola pkwy ...",1001-5000,education,kids club associate 1. previous experience in ...
1,"long beach, ca",501-1000,education,board certified behavior analyst (bcba) 1. mas...
2,"147 s robertson blvd, los angeles, ca, 90048, us",10000+,education,clinical research coordinator i - heart instit...
3,"foxfield, co 80016 | 39.589685509 | -104.72748045",10000+,education,"home health slp, speech therapist 1. a current..."
4,"batavia, oh",501-1000,education,direct service professional - batavia (day ser...


In [8]:
def parse_salary(s):
    """
    Convert salary strings like "$15-$16 /hr" or "$85000-$95000 /yr"
    into average yearly salary.
    
    Rules:
    - Hourly: average(range) * 40 hours/week * 45 working weeks/year
    - Yearly: average(range)
    - Null or malformed → return None
    """
    if pd.isna(s):
        return -1  # Handle nulls
    
    s = s.strip().lower()
    if s == "":
        return -1

    # Extract numbers (handles commas, $)
    nums = re.findall(r"\$?([\d,]+)", s)
    if not nums:
        return -1
    
    # Convert to floats
    nums = [float(n.replace(",", "")) for n in nums]

    # Average of range (or single number)
    avg_val = sum(nums) / len(nums)

    # Hourly case
    if "hr" in s or "/hr" in s:
        return avg_val * 40 * 45  # 40 hours/week × 45 working weeks
    
    # Yearly case
    if "yr" in s or "/yr" in s or "year" in s:
        return avg_val

    return -1  # If unclear format


# ---- Apply to your DataFrame ----
df["Yearly_Salary"] = df["Salary"].apply(parse_salary)
feature_list["salary_avg"] = df["Yearly_Salary"]


In [9]:
feature_list.head()

Unnamed: 0,Location,Company Size,category,text,salary_avg
0,"multi location\nkissimmee, fl: e osceola pkwy ...",1001-5000,education,kids club associate 1. previous experience in ...,27900.0
1,"long beach, ca",501-1000,education,board certified behavior analyst (bcba) 1. mas...,90000.0
2,"147 s robertson blvd, los angeles, ca, 90048, us",10000+,education,clinical research coordinator i - heart instit...,79650.0
3,"foxfield, co 80016 | 39.589685509 | -104.72748045",10000+,education,"home health slp, speech therapist 1. a current...",60.0
4,"batavia, oh",501-1000,education,direct service professional - batavia (day ser...,32400.0


In [10]:
#ready to normalize 
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Apply Min-Max scaler
feature_list["salary_avg_norm"] = scaler.fit_transform(feature_list[["salary_avg"]])

In [11]:
feature_list.isnull().sum()

Location           0
Company Size       0
category           0
text               0
salary_avg         0
salary_avg_norm    0
dtype: int64

In [12]:
import joblib

#to tf-idf the text column I need to use the same tf-idf vectorizer from when the model was created
tfidf = joblib.load("tfidf_vectorizer.pkl")


In [13]:
text_tf_idf = tfidf.transform(feature_list["text"])

In [14]:
joblib.dump(text_tf_idf, "scraped_tfidf_matrix.pkl")

['scraped_tfidf_matrix.pkl']

Location:

In [15]:
def score_location_simple(raw_loc):
    if pd.isna(raw_loc):
        return 1
    
    loc = str(raw_loc).strip().lower()
    if loc == "":
        return 1

    score = 1
    tokens = loc.split()

    # -------------------------------
    # 1. Comma structure (very strong signal)
    # -------------------------------
    if "," in loc:
        score += 2   # keep this

    # -------------------------------
    # 2. Known countries / codes (strong)
    # -------------------------------
    known_countries = ["us", "usa", "uk", "ca", "nz", "au", "india", "in", "gb"]
    if any(tok in known_countries for tok in tokens):
        score += 1

    # -------------------------------
    # 3. Real U.S. states (2 letters) — but use whitelist
    # -------------------------------
    us_states = {
        "al","ak","az","ar","ca","co","ct","de","fl","ga","hi","id","il","in",
        "ia","ks","ky","la","me","md","ma","mi","mn","ms","mo","mt","ne","nv",
        "nh","nj","nm","ny","nc","nd","oh","ok","or","pa","ri","sc","sd","tn",
        "tx","ut","vt","va","wa","wv","wi","wy"
    }

    if any(tok in us_states for tok in tokens):
        score += 2

    # -------------------------------
    # 4. REMOVE: rewarding "north ___" (fraudsters abuse this)
    # Instead: Penalize if 2-word name looks like a first + first name
    # -------------------------------
    direction_words = ["north", "south", "east", "west"]
    if tokens[0] in direction_words:
        # DON'T reward
        # Instead: check if the second word looks like a human name
        second = tokens[1] if len(tokens) > 1 else ""
        # simple check — names often capitalized but lowercase here
        if second in ["jeffrey", "jackson", "david", "john", "michael"]:
            score -= 1

    # -------------------------------
    # 5. Long single-word strings → suspicious
    # -------------------------------
    if len(tokens) == 1 and len(loc) > 10:
        score -= 1

    # -------------------------------
    # 6. Fake-looking suffixes
    # -------------------------------
    fake_suffixes = ["furt", "bury", "view", "mouth", "shire", "stad", "side"]
    if any(loc.endswith(sfx) for sfx in fake_suffixes):
        score -= 1

    # -------------------------------
    # 7. NEW: contains digits → more real (zip codes, coordinates)
    # -------------------------------
    if any(tok.isdigit() for tok in tokens):
        score += 1

    # -------------------------------
    # 8. NEW: if the string looks like "First Last" → suspicious
    # -------------------------------
    first_last_pattern = r"^[a-z]+ [a-z]+$"
    if re.match(first_last_pattern, loc):
        score -= 2

    # clamp
    return max(1, min(score, 5))


In [16]:
# --------------------------------------------------------
# STEP 1 — Compute location features using original df
# --------------------------------------------------------

# Legitimacy score (simple version)
feature_list["location_legitimacy"] = df.loc[feature_list.index, "Location"].apply(score_location_simple)

# Structural features
def fe_location(loc):
    loc = str(loc).lower()
    return pd.Series({
        "loc_len": len(loc),
        "loc_word_count": len(loc.split()),
        "has_us_prefix": int(loc.startswith("us,")),
        "has_state_code": int(bool(re.search(r"\b[a-z]{2}\b", loc))),
        "starts_with_direction": int(loc.split()[0] in ["north", "south", "east", "west"]),
        "contains_digits": int(any(char.isdigit() for char in loc)),
    })

loc_train = df.loc[feature_list.index, "Location"].apply(fe_location)

# Attach features
feature_list = pd.concat([feature_list, loc_train], axis=1)

# --------------------------------------------------------
# STEP 2 — BUILD PREVIEW WITH ASSOCIATED VALUES
# --------------------------------------------------------

preview = pd.concat(
    [
        df.loc[feature_list.index, ["Location"]],   # original locations
        feature_list[[
            "location_legitimacy",
            "loc_len",
            "loc_word_count",
            "has_us_prefix",
            "has_state_code",
            "starts_with_direction",
            "contains_digits"
        ]]
    ],
    axis=1
)

# Look at a random sample of 10 rows
preview.sample(10, random_state=42)

Unnamed: 0,Location,location_legitimacy,loc_len,loc_word_count,has_us_prefix,has_state_code,starts_with_direction,contains_digits
7083,"raleigh, nc",5,11,2,0,1,0,0
239,"hackensack, nj",5,14,2,0,1,0,0
2864,"multi location\nnew york, united states\nnew y...",5,51,9,0,1,0,0
5398,"multi location\nla jolla, ca\nla jolla",5,36,7,0,1,0,0
2987,"multi location\nbuffalo, ny\nnew york city, ny",5,44,8,0,1,0,0
349,"berkeley, ca",5,12,2,0,1,0,0
927,"multi location\ncincinnati, oh\ncharleston, sc...",5,102,16,0,1,0,0
5782,"rosemont, il",5,12,2,0,1,0,0
4463,"bentonville, ar",5,15,2,0,1,0,0
4897,"multi location\nnew york, ny\nsouth jordan, ut",5,44,8,0,1,0,0


In [17]:
feature_list.head()

Unnamed: 0,Location,Company Size,category,text,salary_avg,salary_avg_norm,location_legitimacy,loc_len,loc_word_count,has_us_prefix,has_state_code,starts_with_direction,contains_digits
0,"multi location\nkissimmee, fl: e osceola pkwy ...",1001-5000,education,kids club associate 1. previous experience in ...,27900.0,0.0003443917,5,536,85,0,1,0,1
1,"long beach, ca",501-1000,education,board certified behavior analyst (bcba) 1. mas...,90000.0,0.001110914,5,14,3,0,1,0,0
2,"147 s robertson blvd, los angeles, ca, 90048, us",10000+,education,clinical research coordinator i - heart instit...,79650.0,0.00098316,5,48,9,0,1,0,1
3,"foxfield, co 80016 | 39.589685509 | -104.72748045",10000+,education,"home health slp, speech therapist 1. a current...",60.0,7.529442e-07,5,49,7,0,1,0,1
4,"batavia, oh",501-1000,education,direct service professional - batavia (day ser...,32400.0,0.0003999368,5,11,2,0,1,0,0


In [18]:
feature_list["category"].value_counts()

category
accountingandfinance    2765
technology              1793
marking                  987
legalandcompliance       939
hr                       939
education                804
Name: count, dtype: int64

Need to one-hot encode the abovve columns, but it needs to match the column names of the feature names that was used to train the model. 

In [19]:
kaggle = joblib.load("X_train.pkl")
kaggle.columns

Index(['industry_group_Education', 'industry_group_Engineering/Construction',
       'industry_group_Finance', 'industry_group_Government/Nonprofit',
       'industry_group_Healthcare', 'industry_group_Manufacturing/Industrial',
       'industry_group_Other', 'industry_group_Other Business/Services',
       'industry_group_Retail/Hospitality', 'industry_group_Technology',
       'industry_group_Transportation', 'salary_avg_norm',
       'location_legitimacy', 'loc_len', 'loc_word_count', 'has_us_prefix',
       'has_state_code', 'starts_with_direction', 'contains_digits'],
      dtype='object')

In [20]:
# Step 1: mapping from your categories -> desired industry groups
mapping = {
    "education": "industry_group_Education",
    "technology": "industry_group_Technology",
    "accountingandfinance": "industry_group_Finance",
    "marking": "industry_group_Other Business/Services",
    "legalandcompliance": "industry_group_Other Business/Services",  # or Government/Nonprofit
    "hr": "industry_group_Other Business/Services"
}

# Step 2: Apply mapping
feature_list["industry_group"] = feature_list["category"].map(mapping)

# Step 3: One-hot encode the mapped column
onehot = pd.get_dummies(feature_list["industry_group"])

# Step 4: Ensure ALL required columns exist (even if all zeros)
final_cols = [
    'industry_group_Education', 
    'industry_group_Engineering/Construction',
    'industry_group_Finance', 
    'industry_group_Government/Nonprofit',
    'industry_group_Healthcare', 
    'industry_group_Manufacturing/Industrial',
    'industry_group_Other', 
    'industry_group_Other Business/Services',
    'industry_group_Retail/Hospitality', 
    'industry_group_Technology',
    'industry_group_Transportation'
]

# Add any missing columns as all zeros
for col in final_cols:
    if col not in onehot.columns:
        onehot[col] = 0

# Reorder columns
onehot = onehot[final_cols]

# Step 5: concat back to df (optional)
feature_list = pd.concat([feature_list, onehot], axis=1)

feature_list.head()


Unnamed: 0,Location,Company Size,category,text,salary_avg,salary_avg_norm,location_legitimacy,loc_len,loc_word_count,has_us_prefix,...,industry_group_Engineering/Construction,industry_group_Finance,industry_group_Government/Nonprofit,industry_group_Healthcare,industry_group_Manufacturing/Industrial,industry_group_Other,industry_group_Other Business/Services,industry_group_Retail/Hospitality,industry_group_Technology,industry_group_Transportation
0,"multi location\nkissimmee, fl: e osceola pkwy ...",1001-5000,education,kids club associate 1. previous experience in ...,27900.0,0.0003443917,5,536,85,0,...,0,False,0,0,0,0,False,0,False,0
1,"long beach, ca",501-1000,education,board certified behavior analyst (bcba) 1. mas...,90000.0,0.001110914,5,14,3,0,...,0,False,0,0,0,0,False,0,False,0
2,"147 s robertson blvd, los angeles, ca, 90048, us",10000+,education,clinical research coordinator i - heart instit...,79650.0,0.00098316,5,48,9,0,...,0,False,0,0,0,0,False,0,False,0
3,"foxfield, co 80016 | 39.589685509 | -104.72748045",10000+,education,"home health slp, speech therapist 1. a current...",60.0,7.529442e-07,5,49,7,0,...,0,False,0,0,0,0,False,0,False,0
4,"batavia, oh",501-1000,education,direct service professional - batavia (day ser...,32400.0,0.0003999368,5,11,2,0,...,0,False,0,0,0,0,False,0,False,0


In [21]:
feature_list.drop(columns = ["category", "text", "Location", "Company Size", "salary_avg", "industry_group"], inplace = True)
feature_list.shape

(8227, 19)

In [22]:
print(feature_list.shape)
print(kaggle.shape)

(8227, 19)
(17889, 19)


In [23]:
feature_list.columns

Index(['salary_avg_norm', 'location_legitimacy', 'loc_len', 'loc_word_count',
       'has_us_prefix', 'has_state_code', 'starts_with_direction',
       'contains_digits', 'industry_group_Education',
       'industry_group_Engineering/Construction', 'industry_group_Finance',
       'industry_group_Government/Nonprofit', 'industry_group_Healthcare',
       'industry_group_Manufacturing/Industrial', 'industry_group_Other',
       'industry_group_Other Business/Services',
       'industry_group_Retail/Hospitality', 'industry_group_Technology',
       'industry_group_Transportation'],
      dtype='object')

The difference here is the employment_group column in the kaggle dataset that our scraped data does not have... will have to remove that data from the original X_train and retrain all models. 

In [24]:
# Get column sets
feature_cols = set(feature_list.columns)
kaggle_cols = set(kaggle.columns)

# Differences
in_feature_not_kaggle = feature_cols - kaggle_cols
in_kaggle_not_feature = kaggle_cols - feature_cols

print("\nColumns in feature_list but NOT in kaggle:")
for col in sorted(in_feature_not_kaggle):
    print("  -", col)

print("\nColumns in kaggle but NOT in feature_list:")
for col in sorted(in_kaggle_not_feature):
    print("  -", col)



Columns in feature_list but NOT in kaggle:

Columns in kaggle but NOT in feature_list:


both lists are empty as they should be, created that for debugging purposes

In [25]:
cols_to_convert = feature_list.select_dtypes(include=["bool", "int64"]).columns
feature_list[cols_to_convert] = feature_list[cols_to_convert].astype("int8")

In [26]:
feature_list.dtypes

salary_avg_norm                            float64
location_legitimacy                           int8
loc_len                                       int8
loc_word_count                                int8
has_us_prefix                                 int8
has_state_code                                int8
starts_with_direction                         int8
contains_digits                               int8
industry_group_Education                      int8
industry_group_Engineering/Construction       int8
industry_group_Finance                        int8
industry_group_Government/Nonprofit           int8
industry_group_Healthcare                     int8
industry_group_Manufacturing/Industrial       int8
industry_group_Other                          int8
industry_group_Other Business/Services        int8
industry_group_Retail/Hospitality             int8
industry_group_Technology                     int8
industry_group_Transportation                 int8
dtype: object

In [27]:
joblib.dump(feature_list, "scraped_feature_list.pkl")

['scraped_feature_list.pkl']