In [75]:
import csv

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

LOCATION_INDEX = {'admin':3,'banking':3,'ceo':3}
POSTTIME_INDEX = {'admin':4,'banking':5,'ceo':5}
CLASSIFIACTION_INDEX = {'admin':5,'banking':4,'ceo':4}
SALARY_INDEX = {'admin':5,'banking':6,'ceo':3}

LOCATION_LENGTH = 9
CLASSIFIACTION_LENGTH = 15

In [61]:
def deduplication(x):
    if not x:
        return 'unkown'
    trim = x.strip()
    index = (trim+trim).find(trim,1)
    if index != -1:
        return trim[:index]
    else:
        return 'unknown'
def apply_classification(x):
    if 'classification' in x:
        return x[CLASSIFIACTION_LENGTH:]
    else:
        return 'unknown'
def apply_salary(x):
    if 'classification:' in x:
        return 'unknown'
    else:
        return x
def apply_posted_time(x):
    if 'd' in x:
        return -int(x[:-1])
    if 'm' in x:
        return -(int(x[:-1])*30)
    else:
        return 0
def anually_salary(x):
    if x != 'unknown' and float(x) < 100:
        return float(x) * 8 * 200
    else:
        return x

In [96]:
def clean_location(df,data):
    df[LOCATION_INDEX[data]] = df[LOCATION_INDEX[data]].str.split(',', expand=True)[0]
    df[['location','area']] = df[LOCATION_INDEX[data]].str.split('area:',expand = True)
    df['location'] = df['location'].apply(lambda x:x[LOCATION_LENGTH:])
    df['location'] = df['location'].apply(deduplication)
    df['area'] = df['area'].apply(deduplication)
    df.drop(LOCATION_INDEX[data], axis = 1, inplace = True)
    return df

In [93]:
def clean_classification(df,data):
    df[['classification','subclassification']] = df[CLASSIFIACTION_INDEX[data]].str.split('subClassification:',expand = True)
    df['classification'] = df['classification'].apply(apply_classification)
    df['subclassification'] = df['subclassification'].apply(deduplication)
 
    df['salary'] = df[SALARY_INDEX[data]].apply(apply_salary)
    df.drop([CLASSIFIACTION_INDEX[data]],axis = 1, inplace = True)
    return df

In [64]:
def clean_posttime(df,data):
    df.rename(columns ={0:'Job title',1:'url',2:'company'}, inplace = True )
    df[['posted time','featured']] = df[POSTTIME_INDEX[data]].str.split(',', expand = True)[[0,2]]
    df['featured'].fillna('unknown',inplace = True)
    df.drop([POSTTIME_INDEX[data]],axis = 1, inplace = True)
    df['posted time']=df['posted time'].str.extract('(\d+[a-z])')
    df['posted time'].fillna('unknown', inplace = True)
    df['posted time']=df['posted time'].apply(apply_posted_time)
    df['company'].fillna(df['featured'], inplace = True)
    return df

In [65]:
def clean_salary(df):
    df['salary']=df[df['salary'].str.contains('\d', na=False)]['salary'].str.replace(' to ','-',regex=True)
    df['salary']=df[df['salary'].str.contains('\d', na=False)]['salary'].str.replace('\d\%','',regex=True)
    df['salary']=df[df['salary'].str.contains('\d', na=False)]['salary'].str.replace(',','',regex=True)
    df['salary'].fillna('unknown')
    df[['low salary','high salary']] = df['salary'].str.split('-', n=1, expand = True).fillna('unknown')
    df['low salary'] = df['low salary'].str.extract('(\d+\.\d+|\d+ \d+|\d+k|\d+)')
    df['high salary'] = df['high salary'].str.extract('(\d+\.\d+|\d+ \d+|\d+k|\d+)')
    df['low salary'] = df[df['low salary'].str.contains('\d', na=False)]['low salary'].str.replace('k', '000', regex=True)
    df['high salary'] = df[df['high salary'].str.contains('\d', na=False)]['high salary'].str.replace('k', '000', regex=True)
    df['low salary'] = df[df['low salary'].str.contains('\d', na=False)]['low salary'].str.replace(' ', '', regex=True)
    df['high salary'] = df[df['high salary'].str.contains('\d', na=False)]['high salary'].str.replace(' ', '', regex=True)
    df['low salary']=df['low salary'].fillna("unknown")
    df['high salary']=df['high salary'].fillna("unknown")
    df.loc[df['low salary'] == 'unknown', 'low salary'] = df['high salary']
    df.loc[df['high salary'] == 'unknown', 'high salary'] = df['low salary']
    df['low salary'] = df['low salary'].apply(anually_salary)
    df['high salary'] = df['high salary'].apply(anually_salary)
    df.drop('salary',axis = 1, inplace = True)
    return df

In [66]:
def clean_seek(df,data):
    
    df = clean_location(df,data)
    df = clean_classification(df,data)
    df = clean_posttime(df,data)
    df = clean_salary(df)
    return df

In [67]:
df_admin = pd.read_csv ('NZ_Admin_JOBS.xlsx - sheet1.csv',header = None)
df_admin = df_admin.drop(0)
df_admin = clean_seek(df_admin,'admin')
df_admin

Unnamed: 0,Job title,url,company,location,area,classification,subclassification,posted time,featured,low salary,high salary
1,Administrator,https://www.seek.co.nz/job/50582301?type=promo...,Private Advertiser,Bay of Plenty,Tauranga,Administration & Office SupportAdministration...,Office Management,0,Private Advertiser,unknown,unknown
2,Receptionist,https://www.seek.co.nz/job/50620889?type=promo...,Avenues Orthodontics,Bay of Plenty,Tauranga,Administration & Office SupportAdministration...,Receptionists,0,unknown,unknown,unknown
3,Prosecutions Support Officer,https://www.seek.co.nz/job/50622169?type=stand...,New Zealand Police,Auckland,unkown,Administration & Office SupportAdministration...,Other,-4,unknown,unknown,unknown
4,Early Childhood Centre Administrator,https://www.seek.co.nz/job/50639620?type=stand...,Kew Pacific Island Early Learning Centre,Southland,Invercargill,Administration & Office SupportAdministration...,Administrative Assistants,0,unknown,unknown,unknown
5,Business Support Administrator,https://www.seek.co.nz/job/50622432?type=stand...,Private Advertiser,Canterbury,Christchurch,Administration & Office SupportAdministration...,Client & Sales Administration,-4,Private Advertiser,unknown,unknown
...,...,...,...,...,...,...,...,...,...,...,...
2704,Key Account Manager,https://www.seek.co.nz/job/50490062?type=stand...,Hays Talent Solutions,Auckland,Auckland Central,Administration & Office SupportAdministration...,Client & Sales Administration,-27,unknown,unknown,unknown
2705,Executive Assistant,https://www.seek.co.nz/job/50488000?type=stand...,one eighty recruitment,Wellington,Wellington Central,unknown,unkown,-27,unknown,unknown,unknown
2706,Temporary Office Roles,https://www.seek.co.nz/job/50524865?type=stand...,Asset Recruitment Ltd,Waikato,Hamilton,unknown,unkown,-20,unknown,unknown,unknown
2707,Temporary Office Roles,https://www.seek.co.nz/job/50477118?type=stand...,Asset Recruitment Ltd,Waikato,Hamilton,unknown,unkown,-28,unknown,unknown,unknown


In [97]:
df_banking = pd.read_excel(r"NZ_Banking_JOBS.xlsx", header = None)
df_banking = df_banking.drop(0)
df_banking = clean_seek(df_banking,'banking')
df_banking.drop(6,axis = 1, inplace = True)
df_banking

Unnamed: 0,Job title,url,company,location,area,classification,subclassification,posted time,featured,low salary,high salary
1,Accounts Receivable,https://www.seek.co.nz/job/50568753?type=promo...,at MTF Finance Mt Wellington,Auckland,Auckland Central,Banking & Financial ServicesBanking & Financi...,Credit,0,unknown,unknown,unknown
2,Internal Audit Manager,https://www.seek.co.nz/job/50556333?type=promo...,at Industrial and Commercial Bank of China (Ne...,Auckland,Auckland Central,Banking & Financial ServicesBanking & Financi...,Compliance & Risk,0,unknown,unknown,unknown
3,Client Services Officer,https://www.seek.co.nz/job/50638706?type=stand...,at NZ Funds Management Limited,Auckland,Auckland Central,Banking & Financial ServicesBanking & Financi...,Client Services,0,unknown,unknown,unknown
4,Customer Banking Consultant - Invercargill,https://www.seek.co.nz/job/50637958?type=stand...,at Westpac,Southland,Invercargill,Banking & Financial ServicesBanking & Financi...,Banking - Retail/Branch,-1,unknown,unknown,unknown
5,Private Wealth Assistant - Queenstown,https://www.seek.co.nz/job/50617226?type=stand...,at Craigs Investment Partners,Otago,Queenstown & Wanaka,Banking & Financial ServicesBanking & Financi...,Client Services,-5,unknown,unknown,unknown
...,...,...,...,...,...,...,...,...,...,...,...
3880,Financial Advisor,https://www.seek.co.nz/job/50497173?type=stand...,at NetYourJob,Auckland,Rodney & North Shore,Banking & Financial ServicesBanking & Financi...,Financial Planning,-26,unknown,unknown,unknown
3881,Senior Private Wealth Specialist,https://www.seek.co.nz/job/50526368?type=stand...,at Debbie Graham & Associates Limited,Auckland,unkown,Banking & Financial ServicesBanking & Financi...,Financial Planning,-20,unknown,unknown,unknown
3882,Financial Adviser (Auckland),https://www.seek.co.nz/job/50501103?type=stand...,at Tyler Wren,Auckland,Auckland Central,Banking & Financial ServicesBanking & Financi...,Financial Planning,-25,unknown,unknown,unknown
3883,New Ventures Manager,https://www.seek.co.nz/job/50487878?type=stand...,at Matthew Wood Search,Auckland,Auckland Central,Banking & Financial ServicesBanking & Financi...,Corporate Finance & Investment Banking,-27,unknown,unknown,unknown
