In [14]:
import pandas as pd
import numpy as np
# Read emscad_v1.csv (path relative to the working directory) into the
# frame that the following cells transform.
data = pd.read_csv(filepath_or_buffer='emscad_v1.csv')

In [15]:
# Remove the salary_range column. errors='ignore' keeps this cell re-runnable:
# on a second execution the column is already gone, and the frame is returned
# unchanged instead of raising KeyError.
data = data.drop(columns='salary_range', errors='ignore')

In [16]:
# Recode the 't'/'f' flag values to 1/0 and cast each flag column to int.
columns_to_convert = ['telecommuting', 'has_company_logo', 'has_questions', 'fraudulent', 'in_balanced_dataset']
flag_codes = {'t': 1, 'f': 0}
for col in columns_to_convert:
    recoded = data[col].replace(flag_codes)
    data[col] = recoded.astype(int)

# Give missing values in the categorical columns an explicit placeholder label.
missing_labels = {
    'employment_type': 'Other',
    'required_experience': 'Not Applicable',
    'required_education': 'Unspecified',
    'industry': 'Other',
    'function': 'Other',
}
for col, placeholder in missing_labels.items():
    data[col] = data[col].replace(np.nan, placeholder)

In [17]:
import nltk
import ssl
# NOTE(review): the block below swaps the ssl module's default HTTPS context
# factory for the unverified one, so certificate checks are skipped for
# downloads made afterwards in this kernel. Prefer fixing the local
# certificate store and removing this workaround.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl module has no unverified-context helper; keep the default.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Resources used by preprocess(): the stop-word list, the WordNet data for the
# lemmatizer, and the Punkt sentence model that word_tokenize loads.
# Older NLTK releases look the model up as 'punkt', newer ones as 'punkt_tab';
# fetching both keeps a fresh environment from failing with LookupError.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')
# nltk.download('sentiwordnet')
# from nltk.corpus import sentiwordnet as swn
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import re
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelEncoder




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreayeo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andreayeo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [24]:
# Show the frame; for a long frame pandas renders only the first and last rows.
data

Unnamed: 0,title,location,department,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,in_balanced_dataset
0,marketing intern,usa ny new york,marketing,food52 created groundbreaking award winning co...,food52 fast growing james beard award winning ...,experience content management system major plu...,,0,1,0,Other,Internship,Unspecified,,marketing,0,0
1,customer service cloud video production,nz auckland,success,90 second world cloud video production service...,organised focused vibrant awesome passion cust...,expect key responsibility communicate client 9...,get u part 90 second team gain experience work...,0,1,0,Full-time,Not Applicable,Unspecified,marketing advertising,customer service,0,0
2,commissioning machinery assistant cma,usa ia wever,,valor service provides workforce solution meet...,client located houston actively seeking experi...,implement pre commissioning commissioning proc...,,0,1,0,Other,Not Applicable,Unspecified,,,0,0
3,account executive washington dc,usa dc washington,sale,passion improving quality life geography heart...,company esri environmental system research ins...,education bachelor master gi business administ...,culture anything corporate collaborative creat...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,computer software,sale,0,0
4,bill review manager,usa fl fort worth,,spotsource solution llc global human capital m...,job title itemization review manager location ...,qualification rn license state texas diploma b...,full benefit offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,hospital health care,health care provider,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,account director distribution,ca toronto,sale,vend looking awesome new talent come join u wo...,case first time visited website vend award win...,ace role eat comprehensive statement work brea...,expect u open culture openly share result inpu...,0,1,1,Full-time,Mid-Senior level,Unspecified,computer software,sale,0,0
17876,payroll accountant,usa pa philadelphia,accounting,weblinc e commerce platform service provider f...,payroll accountant focus primarily payroll fun...,b b accounting desire fun love genuine passion...,health wellness medical plan prescription drug...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,internet,accounting auditing,0,0
17877,project cost control staff engineer cost contr...,usa tx houston,,provide full time permanent position many medi...,experienced project cost control staff enginee...,least 12 year professional experience ability ...,,0,0,0,Full-time,Not Applicable,Unspecified,,,0,0
17878,graphic designer,ng la lagos,,,nemsia studio looking experienced visual graph...,1 must fluent latest version corel adobe cc es...,competitive salary compensation based experien...,0,0,1,Contract,Not Applicable,Professional,graphic design,design,0,0


In [19]:
# Rename the standalone token US to USA. The word boundaries matter:
#   * words that merely contain the letters "US" are left untouched, and
#   * re-running the cell is harmless, because "USA" no longer matches
#     (a plain substring replace would turn it into "USAA").
data['location'] = data['location'].str.replace(r'\bUS\b', 'USA', regex=True)

In [22]:
def remove_html_tags(text):
    """Strip HTML markup from ``text`` and collapse whitespace.

    Args:
        text: Raw cell value. ``None`` and float NaN are treated as empty
            text; any other non-string value is converted with ``str``.

    Returns:
        str: The text without tags, with every run of whitespace reduced to
        a single space and no leading or trailing whitespace. Missing input
        yields ``''``.
    """
    # A missing value must not be stringified into the literal word "nan".
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(str(text), 'html.parser')

    # Join the text nodes with a space so that words sitting in adjacent
    # elements (e.g. "<li>a</li><li>b</li>") are not fused into one token.
    text_without_tags = soup.get_text(separator=' ')

    # Remove any extra whitespace and newline characters
    return re.sub(r'\s+', ' ', text_without_tags).strip()

from functools import lru_cache


@lru_cache(maxsize=1)
def _preprocess_resources():
    """Build the English stop-word set and the lemmatizer once, then reuse them."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess(text):
    """Normalise one free-text value into a space-separated token string.

    Steps, in order: strip HTML with ``remove_html_tags``, replace every
    character outside ``a-z``, ``A-Z`` and ``0-9`` with a space, lower-case,
    tokenize with ``word_tokenize``, drop English stop words, and lemmatize
    each remaining token with ``WordNetLemmatizer``.

    Args:
        text: Raw cell value; it is passed to ``remove_html_tags`` first.

    Returns:
        str: The surviving lemmatized tokens joined by single spaces
        (``''`` when no token survives).
    """
    # The stop-word list and lemmatizer do not depend on the input, so they are
    # fetched from a cache instead of being rebuilt for every cell of the frame.
    stop, lemmatizer = _preprocess_resources()

    text = remove_html_tags(text)
    text = re.sub(pattern='[^a-zA-Z0-9]', repl=' ', string=text).lower()

    tokens = [word for word in word_tokenize(text) if word not in stop]
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

In [23]:
# Run preprocess() over every free-text column, one column at a time.
columns_to_clean = [
    'title', 'location', 'department', 'company_profile', 'description',
    'requirements', 'benefits', 'industry', 'function',
]
for col in columns_to_clean:
    data[col] = data[col].apply(preprocess)



In [25]:
label_encoder = LabelEncoder()
columns_to_encode = ['employment_type', 'required_experience', 'required_education']
for col in columns_to_encode:
    # The encoder is refitted on each column and the column is replaced by
    # its integer codes.
    data[col] = label_encoder.fit_transform(data[col])
    labels = label_encoder.classes_
    # Print the column name, then one "code: label" line per class.
    print(col)
    for code, label in enumerate(labels):
        print(f'{code}: {label}')

employment_type
0: Contract
1: Full-time
2: Other
3: Part-time
4: Temporary
required_experience
0: Associate
1: Director
2: Entry level
3: Executive
4: Internship
5: Mid-Senior level
6: Not Applicable
required_education
0: Associate Degree
1: Bachelor's Degree
2: Certification
3: Doctorate
4: High School or equivalent
5: Master's Degree
6: Professional
7: Some College Coursework Completed
8: Some High School Coursework
9: Unspecified
10: Vocational
11: Vocational - Degree
12: Vocational - HS Diploma


In [27]:
# Blank out cells whose whole value is the string 'nan'. Series.replace with
# a scalar matches entire values, so longer texts containing "nan" are kept.
columns_with_nan_text = [
    'title', 'location', 'department', 'company_profile', 'description',
    'requirements', 'benefits', 'industry', 'function',
]
for col in columns_with_nan_text:
    data[col] = data[col].replace('nan', '')

In [181]:
# Write the processed frame to disk; index=False leaves the row index out.
data.to_csv(path_or_buf='preprocessed_data.csv', index=False)