In [1]:
import numpy as np
import pandas as pd

In [2]:
file_url = "https://raw.githubusercontent.com/wongwara/Jobseeker_Baymax/main/dataset/listings2019_2022_salary_adjust01052023.csv"

In [3]:
df = pd.read_csv(file_url)

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3902 entries, 0 to 3901
Data columns (total 52 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   jobId                  3902 non-null   int64  
 1   jobTitle               3902 non-null   object 
 2   jobClassification      3902 non-null   object 
 3   jobSubClassification   3902 non-null   object 
 4   advertiserName         3902 non-null   object 
 5   advertiserId           3902 non-null   int64  
 6   companyId              1067 non-null   float64
 7   companyName            1476 non-null   object 
 8   companyRating          1067 non-null   float64
 9   listingDate            3902 non-null   object 
 10  expiryDate             3902 non-null   object 
 11  teaser                 3374 non-null   object 
 12  nation                 3902 non-null   object 
 13  state                  3902 non-null   object 
 14  city                   3902 non-null   object 
 15  area

In [6]:
df = df[['jobClassification', 'state','teaser','nation','workType','salary_string','isRightToWorkRequired','desktopAdTemplate',
         'Python','SQL','R','Tableau','SAS','Matlab','Hadoop','Spark','Java', 'Scala','recruiter']]

## Clean data

### 1. select only in Australia

In [7]:
df = df[df['nation'].str.contains('Australia')]

In [8]:
df['state'] = df['state'].replace({'Northern Territories': 'Northern Territory'})

In [9]:
df['state'].unique()

array(['New South Wales', 'Australian Capital Territory', 'Victoria',
       'Western Australia', 'Queensland', 'Northern Territory',
       'South Australia', 'Tasmania'], dtype=object)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3897 entries, 0 to 3901
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   jobClassification      3897 non-null   object
 1   state                  3897 non-null   object
 2   teaser                 3369 non-null   object
 3   nation                 3897 non-null   object
 4   workType               3897 non-null   object
 5   salary_string          1512 non-null   object
 6   isRightToWorkRequired  3411 non-null   object
 7   desktopAdTemplate      3203 non-null   object
 8   Python                 3897 non-null   int64 
 9   SQL                    3897 non-null   int64 
 10  R                      3897 non-null   int64 
 11  Tableau                3897 non-null   int64 
 12  SAS                    3897 non-null   int64 
 13  Matlab                 3897 non-null   int64 
 14  Hadoop                 3897 non-null   int64 
 15  Spark                

In [11]:
df = df.drop(['nation'],axis =1)

### 2. clean salary (target feature)

In [12]:
df["salary_string"].describe

<bound method NDFrame.describe of 0                                               NaN
1                                             Super
2                        $90000 - $120000 per annum
3                             $90000 - $110000 p.a.
4                                               NaN
                           ...                     
3897    Open to Quote (Sydney or Canberra Location)
3898                                            NaN
3899                                Desirable Rates
3900                                            NaN
3901                                            NaN
Name: salary_string, Length: 3897, dtype: object>

In [13]:
import re

def extract_salary(text):    
    range_pattern = r'([\d\.]+) *- *\$?([\d\.]+)'
    range_matches = re.search(range_pattern, text.replace(",", ""))
    if range_matches:
        lo, hi = range_matches.groups()
        salary_range = (float(lo), float(hi))
    else:
        salary_range = None
    
    return salary_range

In [14]:
df = df[df['salary_string'].notna()]

In [15]:
df["salary_string"] = df["salary_string"].apply(extract_salary)

In [16]:
df = df[df['salary_string'] != (None,None)]

In [17]:
df = df[df['salary_string'].notna()]

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 817 entries, 2 to 3893
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   jobClassification      817 non-null    object
 1   state                  817 non-null    object
 2   teaser                 695 non-null    object
 3   workType               817 non-null    object
 4   salary_string          817 non-null    object
 5   isRightToWorkRequired  704 non-null    object
 6   desktopAdTemplate      695 non-null    object
 7   Python                 817 non-null    int64 
 8   SQL                    817 non-null    int64 
 9   R                      817 non-null    int64 
 10  Tableau                817 non-null    int64 
 11  SAS                    817 non-null    int64 
 12  Matlab                 817 non-null    int64 
 13  Hadoop                 817 non-null    int64 
 14  Spark                  817 non-null    int64 
 15  Java                  

### 3. Convert categorical

In [19]:
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder,OneHotEncoder

#### 3.1 isRightToWorkRequired

In [20]:
df['isRightToWorkRequired'].unique()

array(['f', 't', nan], dtype=object)

In [21]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.stats import mode

# Define a function to get mode of the column
def get_mode(column):
    return mode(column, nan_policy='omit')[0][0]

# Convert the column to string type
df['isRightToWorkRequired'] = df['isRightToWorkRequired'].astype(str)

# Replace the NaN values with the mode of the column
mode_val = get_mode(df['isRightToWorkRequired'])
df['isRightToWorkRequired'] = df['isRightToWorkRequired'].replace('nan', mode_val)

# Create the LabelEncoder object
le = LabelEncoder()

# Fit the LabelEncoder object to the column
le.fit(df['isRightToWorkRequired'])

# Transform the column to label encoding
column = le.transform(df['isRightToWorkRequired'])

In [22]:
df['isRightToWorkRequired'].unique()

array(['f', 't'], dtype=object)

In [23]:
df['isRightToWorkRequired'] = df['isRightToWorkRequired'].replace({'f': 0, 't': 1})

#### 3.2 workType

In [24]:
df['workType'].unique()

array(['Full Time', 'Contract/Temp', 'Part Time', 'Casual/Vacation',
       'Full time', 'Part time'], dtype=object)

In [25]:
df['workType'] = df['workType'].replace({'Full time': 'Full Time', 'Part time': 'Part Time'})

In [26]:
df['workType'].unique()

array(['Full Time', 'Contract/Temp', 'Part Time', 'Casual/Vacation'],
      dtype=object)

In [27]:
workType_cats = [['Full Time', 'Contract/Temp', 'Part Time', 'Casual/Vacation']]
Label = LabelEncoder()
# Use our trained encoder to transform this column
df['workType_encoded'] = Label.fit_transform(df[['workType']])

# display the columns
df[['workType','workType_encoded']]

df['workType']= df['workType_encoded']

#### 3.3 jobClassification

In [28]:
jobClassification_cats = [['Information & Communication Technology',
       'Banking & Financial Services', 'Science & Technology',
       'Education & Training', 'Government & Defence',
       'Consulting & Strategy', 'Healthcare & Medical',
       'Human Resources & Recruitment', 'Marketing & Communications',
       'Retail & Consumer Products', 'Administration & Office Support',
       'Accounting', 'Insurance & Superannuation',
       'Mining, Resources & Energy', 'Real Estate & Property',
       'Manufacturing, Transport & Logistics', 'Engineering']]
jobClassification_cats_enc =OrdinalEncoder(categories=jobClassification_cats)
# Use our trained encoder to transform this column
df['jobClassification_encoded'] = jobClassification_cats_enc.fit_transform(df[['jobClassification']])

# display the columns
df[['jobClassification','jobClassification_encoded']]

df['jobClassification'] = df['jobClassification_encoded']

#### 3.4 State

In [29]:
# Import OrdinalEncoder from sklearn.preprocessing
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder,OneHotEncoder

state_cats = [['Australian Capital Territory', 'South Australia',
       'Western Australia']]
Label = LabelEncoder()
# Use our trained encoder to transform this column
df['state_encoded'] = Label.fit_transform(df[['state']])

# display the columns
df[['state','state_encoded']]

df['state']= df['state_encoded']

### 4. Clean text features

In [30]:
df['teaser']= df['teaser'].fillna('')

In [31]:
df['desktopAdTemplate']= df['desktopAdTemplate'].fillna('')

In [32]:
# import re 
# # Define pipeline function to perform multiple cleaning steps on a given text
# def clean_text(text):
#     text = re.sub(r'[^\w]', ' ', str(text)) # Remove symbols
#     text = re.sub(r'[ ]{2,}', ' ', str(text)) # Remove extra spaces
#     text = re.sub(r'[ \t]+$', '', str(text)) # Remove trailing white spaces
#     text = re.sub(r'\s+', ' ', text) # Remove newline (\n)
#     text = re.sub(r"\'", "", text) # Remove quotes
#     return text

In [33]:
# # "jobTitle", "teaser", "mobileAdTemplate" and "desktopAdTemplate"
# df["teaser_cleaned"] = df["teaser"].apply(clean_text)
# df["desktopAdTemplate_cleaned"] = df["desktopAdTemplate"].apply(clean_text)

In [34]:
# df["teaser_cleaned"] = df["teaser_cleaned"].str.lower()
# df["desktopAdTemplate_cleaned"] = df["desktopAdTemplate_cleaned"].str.lower()

In [35]:
# df["teaser_cleaned"] = df["teaser"].to_string()
# df["desktopAdTemplate_cleaned"] = df["desktopAdTemplate"].to_string()

In [36]:
# # Function to remove stopwords
# import string 
# from nltk.corpus import stopwords
# # Function to remove stopwords
# stop_words = stopwords.words('english')

# def remove_stopwords(text):
#     useful_words = []
#     for i in text.split():
#         if i.strip().lower() not in stop_words:
#             useful_words.append(i.strip())
#     return " ".join(useful_words)

In [37]:
# df["teaser_cleaned"] = df["teaser_cleaned"].apply(remove_stopwords)
# df["desktopAdTemplate_cleaned"] = df["desktopAdTemplate_cleaned"].apply(remove_stopwords)

In [38]:
# df['teaser'] = df['teaser_cleaned']
# df['desktopAdTemplate'] = df["desktopAdTemplate_cleaned"]

In [39]:
# df.columns

In [40]:
# df.head()

In [41]:
# df = df.drop(['workType_encoded', 'jobClassification_encoded', 'teaser_cleaned', 'desktopAdTemplate_cleaned', 'state_encoded'], axis=1)# 

In [42]:
# df.columns

In [43]:
# df['salary_string']

In [44]:
# df.to_csv('/Users/thiradatiamklang/Desktop/NLP/AT2/df_cleaned.csv')

In [45]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer


# Clean the text data
df['teaser'] = df['teaser'].str.replace('[^\w\s]', '') # Remove punctuation
df['desktopAdTemplate'] = df['desktopAdTemplate'].str.replace('[^\w\s]', '') # Remove punctuation
df['teaser'] = df['teaser'].str.replace('\d+', '') # Remove digits
df['desktopAdTemplate'] = df['desktopAdTemplate'].str.replace('\d+', '') # Remove digits

# Normalize the text data
stop_words = set(stopwords.words('english'))
df['teaser'] = df['teaser'].apply(lambda x: ' '.join([word.lower() for word in x.split() if word.lower() not in stop_words]))
df['desktopAdTemplate'] = df['desktopAdTemplate'].apply(lambda x: ' '.join([word.lower() for word in x.split() if word.lower() not in stop_words]))

# Tokenize the text data
df['teaser'] = df['teaser'].apply(lambda x: word_tokenize(x))
df['desktopAdTemplate'] = df['desktopAdTemplate'].apply(lambda x: word_tokenize(x))

# Apply stemming
stemmer = PorterStemmer()
df['teaser'] = df['teaser'].apply(lambda x: [stemmer.stem(word) for word in x])
df['desktopAdTemplate'] = df['desktopAdTemplate'].apply(lambda x: [stemmer.stem(word) for word in x])

# # Create TF-IDF vectors
# vectorizer = TfidfVectorizer()
# teaser_tfidf = vectorizer.fit_transform(df['teaser'].apply(lambda x: ' '.join(x)))
# desktopAdTemplate_tfidf = vectorizer.fit_transform(df['desktopAdTemplate'].apply(lambda x: ' '.join(x)))

# # Concatenate the TF-IDF vectors with the original dataframe
# df = pd.concat([df.drop(['teaser', 'desktopAdTemplate'], axis=1), pd.DataFrame(teaser_tfidf.toarray()), pd.DataFrame(desktopAdTemplate_tfidf.toarray())], axis=1)

# # Display the resulting dataframe
# print(df.head())


In [46]:
df.head()

Unnamed: 0,jobClassification,state,teaser,workType,salary_string,isRightToWorkRequired,desktopAdTemplate,Python,SQL,R,...,SAS,Matlab,Hadoop,Spark,Java,Scala,recruiter,workType_encoded,jobClassification_encoded,state_encoded
2,0.0,1,"[fantast, organis, seek, experienc, insight, a...",2,"(90000.0, 120000.0)",0,"[insight, analyst, onlin, video, stream, cut, ...",0,1,1,...,0,0,0,0,0,0,1,2,0.0,1
3,1.0,1,"[role, requir, individu, strong, credit, risk,...",2,"(90000.0, 110000.0)",0,"[credit, risk, analyst, respons, design, devel...",0,1,1,...,1,0,0,0,0,0,1,2,1.0,1
7,1.0,1,"[one, australia, lead, financi, servic, provid...",2,"(110000.0, 120000.0)",1,"[data, analyt, recruit, solut, data, scientist...",1,1,1,...,1,0,0,0,0,0,1,2,1.0,1
10,2.0,0,"[postdoctor, research, molecular, evolut, phyl...",2,"(71509.0, 90215.0)",0,"[postdoctor, fellow, classif, academ, level, a...",0,0,0,...,0,0,0,0,1,0,0,2,2.0,0
11,3.0,0,"[postdoctor, research, molecular, evolut, phyl...",2,"(71509.0, 90215.0)",0,"[postdoctor, fellow, classif, academ, level, a...",0,0,0,...,0,0,0,0,1,0,0,2,3.0,0
