In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
print("package loaded")

package loaded


In [95]:
print("kernel connected")

kernel connected


In [None]:
# only try to process a single file emp_title now

df=pd.read_csv("/Users/justinjiang/Desktop/emp_title.csv")
print("ok")
df.head()

ok


Unnamed: 0.1,Unnamed: 0,emp_title_clean
0,0,
1,1,ryder
2,2,
3,3,air resources board
4,4,university medical group


In [107]:
# only deal with first 10 obs for openai trials
sample_df = df[['emp_title_clean']].dropna().head(10).copy()
sample_df = sample_df.reset_index(drop=True)
sample_df

Unnamed: 0,emp_title_clean
0,ryder
1,air resources board
2,university medical group
3,veolia transportaton
4,southern star photography
5,mkc accounting
6,starbucks
7,southwest rural metro
8,ucla
9,va dept of conservationrecreation


In [None]:
# classify the emp_title column

def classify_title_local(title):
    title = str(title).lower().strip()

    job_keywords = [
        "manager", "engineer", "developer", "designer", "analyst", "consultant",
        "specialist", "technician", "supervisor", "director", "coordinator",
        "officer", "executive", "associate", "assistant", "intern", "clerk", "nurse", "teacher"
    ]

    # but this is too simple because some companies may contain teacher/associate/consultants...
    for keyword in job_keywords:
        if keyword in title:
            return "job"

    #default as company
    return "company"

sample_df['title_type'] = sample_df['emp_title_clean'].apply(classify_title_local)
import pandas as pd
pd.set_option('display.max_rows', 500)  
sample_df

Unnamed: 0,emp_title_clean,title_type
0,ryder,company
1,air resources board,company
2,university medical group,company
3,veolia transportaton,company
4,southern star photography,company
5,mkc accounting,company
6,starbucks,company
7,southwest rural metro,company
8,ucla,company
9,va dept of conservationrecreation,company


In [None]:
# openai classify industry (can do web scraping to save cost)
from openai import OpenAI
import time

client = OpenAI(api_key="API_KEY")

def classify_industry(company_name):
    prompt = f"""What industry does the company "{company_name}" belong to?
Choose from the following categories ONLY:
Technology, Healthcare, Finance, Retail, Education, Manufacturing, Transportation, Energy, Government, Real Estate, Hospitality, Construction, Legal, Food & Beverage, Other.
Only return the category name."""
    
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error for {company_name}: {e}")
        return "unknown"


sample_df['industry'] = sample_df['emp_title_clean'].apply(classify_industry)

In [None]:
sample_df

Unnamed: 0,emp_title_clean,title_type
0,ryder,company
1,air resources board,company
2,university medical group,company
3,veolia transportaton,company
4,southern star photography,company
5,mkc accounting,company
6,starbucks,company
7,southwest rural metro,company
8,ucla,company
9,va dept of conservationrecreation,company


In [None]:
# Google search to get company official site

def get_official_site(company_name):
    query = f"{company_name} company official site"
    try:
        urls = list(search(query, num_results=7))
        # exclude Yelp、Google Maps、Tripadvisor 
        filtered_urls = [url for url in urls if not any(
            banned in url for banned in ['yelp.com', 'google.com/maps', 'tripadvisor.com']
        )]
        # 优先 com/org/gov/edu
        for domain in ['.gov', '.com', '.edu']:
            for url in filtered_urls:
                if domain in url:
                    return url
        return filtered_urls[0] if filtered_urls else "Not found"
    except Exception as e:
        return "error"


company_urls = []
for i, row in sample_df.iterrows():
    if row['title_type'] == 'company':
        company = row['emp_title_clean']
        url = get_official_site(company)
        #print(f"{i+1}. {company} → {url}")
        company_urls.append(url)
    else:
        company_urls.append("N/A") 

    time.sleep(1.5)  

# add new col
sample_df['company_url'] = company_urls
sample_df

Unnamed: 0,emp_title_clean,title_type,company_url
0,ryder,company,https://www.ryder.com/en-us
1,air resources board,company,https://ww2.arb.ca.gov/homepage
2,university medical group,company,https://data.cms.gov/tools/medicare-revalidati...
3,veolia transportaton,company,https://www.veolianorthamerica.com/who-we-serv...
4,southern star photography,company,https://www.cience.com/company/southern-star-p...
5,mkc accounting,company,https://www.mkingcfo.com/services
6,starbucks,company,https://www.starbucks.com/
7,southwest rural metro,company,https://www.nlrb.gov/case/28-CA-164048
8,ucla,company,https://www.uclastore.com/?srsltid=AfmBOor9QeJ...
9,va dept of conservationrecreation,company,https://www.dcr.virginia.gov/


In [None]:
# openai to get company size
def classify_company_size(company_name):
    prompt = f"""Based on the company name "{company_name}", what is its likely company size?
Choose from only one of the following: Small, Medium, or Large.
Assume Small = <100 employees, Medium = 100-1000, Large = >1000.
If uncertain, make the best guess based on name/known brand."""
    
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error for {company_name}: {e}")
        return "unknown"

sample_df['company_size'] = sample_df.apply(
    lambda row: classify_company_size(row['emp_title_clean']) if row['title_type'] == 'company' else "N/A",
    axis=1
)

# will incorporate Wikipedia summary as input to GPT

In [111]:
sample_df

Unnamed: 0,emp_title_clean,title_type,company_url,company_size
0,ryder,company,https://www.ryder.com/en-us,Large
1,air resources board,company,https://ww2.arb.ca.gov/homepage,Large
2,university medical group,company,https://data.cms.gov/tools/medicare-revalidati...,Large
3,veolia transportaton,company,https://www.veolianorthamerica.com/who-we-serv...,Large
4,southern star photography,company,https://www.cience.com/company/southern-star-p...,Small
5,mkc accounting,company,https://www.mkingcfo.com/services,Small
6,starbucks,company,https://www.starbucks.com/,Large
7,southwest rural metro,company,https://www.nlrb.gov/case/28-CA-164048,Medium
8,ucla,company,https://www.uclastore.com/?srsltid=AfmBOor9QeJ...,Large
9,va dept of conservationrecreation,company,https://www.dcr.virginia.gov/,Large


In [97]:
%pip install tqdm

7119.52s - pydevd: Sending message related to process being replaced timed-out after 5 seconds



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
from openai import OpenAI
import os

client = OpenAI(api_key="API_KEY")

def classify_title(title):
    try:
        prompt = f"Is '{title}' more likely to be a company name or a job title? Just reply with 'company' or 'job'."
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{'role': 'user', 'content': prompt}],
            temperature=0,
        )
        result = response.choices[0].message.content.strip().lower()
        return result
    except Exception as e:
        print(f"Error with title '{title}': {e}")
        return "unknown"

sample_df['title_type'] = None
sample_df.head()


Unnamed: 0,emp_title_clean,title_type
0,ryder,
1,air resources board,


In [30]:
print(openai.api_key)

None


In [9]:
df['emp_title'].value_counts().head(1000)  # 查看最常见的前10个
unique_ratio = df['emp_title'].nunique() / df['emp_title'].count() # 只有22%是unique

In [12]:
# process emp_title
df['emp_title_clean'] = (
    df['emp_title']
    .str.lower()  # 全部小写
    .str.replace(r'[^\w\s]', '', regex=True)  # 移除标点
    .str.strip()
)

emp_title_clean = df["emp_title_clean"]
# saved this as emp_title

In [None]:

job_keywords = [
    'manager', 'teacher', 'driver', 'nurse', 'engineer', 'supervisor', 'assistant', 'technician',
    'officer', 'consultant', 'director', 'project', 'accountant', 'attorney', 'server', 'president',
    'sales', 'clerk', 'analyst', 'cashier', 'owner', 'ceo', 'cto', 'cfo', 'coo', 'student', 'retired',
    'unemployed', 'receptionist', 'maintenance', 'foreman', 'welder', 'operator', 'cook', 'chef',
    'mechanic', 'laborer', 'bartender', 'waiter', 'janitor', 'custodian', 'housekeeper', 'security',
    'delivery', 'marketing', 'business', 'self-employed', 'contractor', 'trainer', 'developer',
    'plumber', 'electrician', 'carpenter', 'paralegal', 'pharmacist', 'dentist', 'doctor', 'physician',
    'surgeon', 'psychologist', 'scientist', 'advisor', 'auditor', 'banker', 'insurance', 'real estate',
    'loan officer', 'mortgage', 'dispatcher', 'truck', 'warehouse', 'machinist', 'data', 'it', 'hr',
    'human resources'
]

# 只保留不含职业关键词的
mask = ~df['emp_title_clean'].str.contains('|'.join(job_keywords), na=False)
df_possible_companies = df[mask]

emp_title_clean
rn                                 21706
customer service                    6513
superintendent                      4988
controller                          4978
administrator                       4510
account executive                   4323
principal                           4124
lpn                                 4087
cna                                 3854
secretary                           3771
social worker                       3649
professor                           3615
associate                           3271
bookkeeper                          2970
firefighter                         2814
flight attendant                    2773
respiratory therapist               2740
tech                                2693
management                          2596
partner                             2565
realtor                             2547
letter carrier                      2422
team leader                         2407
accounting                          2397
