In [1]:
# import related libraries
import csv
import json
import numpy as np
import os
import pandas as pd
from joblib import dump, load
from sklearn import model_selection
import matplotlib.pyplot as plt

# seed every random number generator used by this notebook so that a
# fresh "Restart & Run All" always produces the same results
import random

SEED = 5500
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)

In [87]:
# load the labeled dataset from disk into a DataFrame
RAW_DATA_PATH = "data/Train_rev1.csv"
data = pd.read_csv(RAW_DATA_PATH)

In [51]:
# process missing values
def anyMissingValues(X):
    """Return True if at least one element of X is missing.

    Parameters
    ----------
    X : Series, DataFrame, list or array
        Values to inspect; "missing" is whatever ``pd.isna`` reports
        (NaN, None, NaT).

    Returns
    -------
    bool
        True when any element is missing, False otherwise.
    """
    # Reduce on the boolean mask itself instead of the builtin any():
    # iterating a DataFrame yields its column labels (always truthy), and
    # iterating a long Series element by element is needlessly slow.
    return bool(np.asarray(pd.isna(X)).any())

def numOfMissingValues(X):
    """Count the missing elements of X.

    Parameters
    ----------
    X : Series, DataFrame, list or array
        Values to inspect; "missing" is whatever ``pd.isna`` reports
        (NaN, None, NaT).

    Returns
    -------
    int
        Number of missing elements (over all cells for a DataFrame).
    """
    # Sum the boolean mask with numpy rather than the builtin sum(): the
    # builtin iterates column labels for a DataFrame (TypeError) and loops
    # in Python for a Series.
    return int(np.asarray(pd.isna(X)).sum())
    
# quick yes/no check on the target column, then one missing-value count
# per column (printed in the order listed below)
print(anyMissingValues(data["Category"]))

columns_to_check = (
    "Title",
    "FullDescription",
    "LocationRaw",
    "LocationNormalized",
    "ContractType",
    "ContractTime",
    "Company",
    "Category",
    "SalaryRaw",
    "SalaryNormalized",
    "SourceName",
)
for column_name in columns_to_check:
    print(numOfMissingValues(data[column_name]))

False
1
0
0
0
0
63905
32430
0
0
0
1


In [91]:
# inspect the record at positional (0-based) index 224434:
# first the cells at column positions 1, 2 and 3, then the whole row
row_pos = 224434
for col_pos in (1, 2, 3):
    print(data.iloc[row_pos, col_pos])
print(data.iloc[row_pos])

Senior C++ Engineer / C++ Performance Optimisation Coder / C F Framework Developer, Front Office  Elite Team "
 Tier One Investment Bank"
Senior C++ Engineer / C++ Performance Optimisation Coder / C Elite Team , Tier One Investment Bank Introduction: This is by far one of the most exciting mandates that we have been engaged on this year. Our clients are seeking a bestinclass , industry leading Senior VP level hands on C++ Engineer with skills in C and F to join an elite Quant Strategies Technology team to work with the business on trading tech solutions. The Organisation: A top Investment Bank, regarded by many as the best to work for. The Team: The Quant Strategies Team working within Fixed Income. The Project: An interesting semiindividual project to start with; there is a large amount of C++ code in the form of libraries used for quant models. Following the build of a new strategic risk platform, these are no longer scalable, and must be optimised to improve performance, all within 

In [None]:
# data cleaning: remove anomalies in the dataset
# list the positions of the rows whose SourceName is missing (only one is expected)
missing_source_rows = [
    pos for pos, is_missing in enumerate(data.SourceName.isna()) if is_missing
]
print(missing_source_rows)
print(data.iloc[224434, :])

# Show where the company of that record publishes its other adverts.
# The author's finding: this company advertises through a single source,
# so that source is used to fill the missing SourceName (column position 11).
montash_sources = data.loc[data.Company == "Montash Limited", "SourceName"]
print(montash_sources)
data.iloc[224434, 11] = "eFinancialCareers"

def _text_length(value):
    """Return the character length of a text cell; a missing cell counts as 0."""
    # Missing cells are float NaN, and len(NaN) raises TypeError, so they
    # must be handled before calling len().
    return 0 if pd.isna(value) else len(value)


# character length of every cell in the free-text columns
# (Title contains a missing value, hence the NaN-safe helper)
Title_lengths = [_text_length(x) for x in data.Title]
FullDescription_lengths = [_text_length(x) for x in data.FullDescription]
LocationRaw_lengths = [_text_length(x) for x in data.LocationRaw]
LocationNormalized_lengths = [_text_length(x) for x in data.LocationNormalized]

# Draw each distribution on its own axes: two plt.boxplot calls in the same
# cell would otherwise be overlaid on one shared axes and become unreadable.
fig, (ax_description, ax_location) = plt.subplots(1, 2, figsize=(10, 4))

ax_description.boxplot(FullDescription_lengths)
ax_description.set_title("FullDescription length")
ax_description.set_ylabel("characters")
print(min(FullDescription_lengths)) # 26, anomaly?

ax_location.boxplot(LocationRaw_lengths)
ax_location.set_title("LocationRaw length")
ax_location.set_ylabel("characters")
print(max(LocationRaw_lengths)) # 1682, anomaly

# locate the record(s) whose LocationRaw has the anomalous length of 1682 characters
overlong_rows = [i for i, x in enumerate(LocationRaw_lengths) if x == 1682]
print(overlong_rows) # 224434
print(data.iloc[224434, ])

# In that record the values are misplaced (the original author calls it a
# typo in the data): column position 3 holds the text that belongs in
# position 2, and position 4 holds the value that belongs in position 3.
# Apply the shift only while the anomaly is still present, so re-running this
# cell does not shift the already-repaired row a second time.
if 224434 in overlong_rows:
    data.iloc[224434, 2] = data.iloc[224434, 3]
    data.iloc[224434, 3] = data.iloc[224434, 4]

In [71]:
# number of unique values
def numOfUniqueValues(X):
    """Count the distinct values in X, treating all missing values as one value.

    Parameters
    ----------
    X : Series, Index or any iterable of hashable values
        Values to inspect.

    Returns
    -------
    int
        Number of distinct values; NaN/None together contribute at most 1.
    """
    # len(set(X)) is unreliable with NaN: NaN != NaN, and iterating a float
    # Series creates a new NaN object per element, so every missing value
    # would be counted as its own distinct value. nunique(dropna=False)
    # counts them once.
    values = X if isinstance(X, (pd.Series, pd.Index)) else pd.Series(list(X))
    return int(values.nunique(dropna=False))

# print the number of distinct values of each categorical-looking column,
# in the order listed
categorical_columns = (
    "LocationRaw",
    "LocationNormalized",
    "ContractType",
    "ContractTime",
    "Company",
    "Category",
    "SourceName",
)
for column_name in categorical_columns:
    print(numOfUniqueValues(data[column_name]))

20986
2732
3
3
20813
29
167


In [42]:
# data imputation: replace missing ContractType / ContractTime with the
# placeholder category "other".
# Assign the filled column back instead of calling
# `data.<col>.fillna(..., inplace=True)`: that form is a chained in-place
# operation on a temporary Series, which warns in recent pandas and no
# longer modifies `data` once Copy-on-Write is enabled.
for column_name in ("ContractType", "ContractTime"):
    data[column_name] = data[column_name].fillna("other")

0         other
1         other
2         other
3         other
4         other
          ...  
244763    other
244764    other
244765    other
244766    other
244767    other
Name: ContractType, Length: 244768, dtype: object

In [None]:
# split dataset into train set (70%) and test set (30%), stratified by Category
# random_state is pinned to the notebook seed so the split does not depend on
# how much of numpy's global random stream earlier cells (or re-runs of this
# cell) have already consumed.
train, test = model_selection.train_test_split(
    data,
    test_size=0.3,
    stratify=data["Category"],
    random_state=5500,
)
print(data.shape)
print(train.shape)
print(test.shape)

In [9]:
# code to extract a column of pandas dataframe
# Two equivalent ways of selecting the Category column are shown; only the
# last expression of a cell is displayed, so the first line produces no output.
data["Category"]  # bracket (key) access
data.Category  # attribute access; this is the value the cell displays

0         Engineering Jobs
1         Engineering Jobs
2         Engineering Jobs
3         Engineering Jobs
4         Engineering Jobs
                ...       
244763       Teaching Jobs
244764       Teaching Jobs
244765       Teaching Jobs
244766       Teaching Jobs
244767       Teaching Jobs
Name: Category, Length: 244768, dtype: object

In [11]:
# preview the first rows of the training split (rows keep their original index labels)
train.head()

Unnamed: 0,Id,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SalaryNormalized,SourceName
28243,67947854,Quality Manager,An opportunity has arisen for a Quality Manage...,Midlands,East Midlands,,permanent,Michael Page Engineering Manufacture,Other/General Jobs,"40,000-45,000 + Car/al",42500,jobsite.co.uk
65564,68710869,Contract Hire Coordinator,Due to the succesfull capture of a new contrac...,Coventry West Midlands (County) West Midlands,UK,,permanent,Flex Recruitment,Other/General Jobs,Negotiable c.16k,16000,totaljobs.com
197901,71845714,Food and Beverage Service Assistant De Vere V...,Food and Beverage Service Assistant De Vere V...,"Leatherhead, Surrey, UK, Surrey",Leatherhead,,,i resourcer,Hospitality & Catering Jobs,From 6.19 to 6.19 per hour,11884,jobs.catererandhotelkeeper.com
74446,68999742,Supply Chain Analyst Automotive ****K Shrew...,This organisation is a business which has seen...,"Shrewsbury, Shropshire, England, Shropshire",Shrewsbury,,contract,Jonathan Lee Recruitment Ltd,Other/General Jobs,28000 - 30000/annum,29000,cv-library.co.uk
22017,67378832,General Manager Gatwick,GENERAL MANAGER We are currently recruiting fo...,"Gatwick, West Sussex West Sussex South East",UK,,,Cherryred Recruitment,Hospitality & Catering Jobs,"From 30,000 to 35,000 per annum",32500,caterer.com


In [12]:
# preview the first rows of the held-out split, before it is divided into validation and test
test.head()

Unnamed: 0,Id,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SalaryNormalized,SourceName
199611,71851370,Teacher of English in A PRU,We are looking for a dynamic and positive teac...,Dartford Kent South East,Dartford,,contract,Teaching Appointments,Teaching Jobs,"Up to 35,000 per annum",35000,totaljobs.com
68342,68784262,"DIRECT SALES EXECUTIVE, LEEDS NO EXP NECESSARY","Direct Sales Executive, Leeds . Our client is ...",West Yorkshire Yorkshire,West Yorkshire,,permanent,Hot Recruitment Consultants,Sales Jobs,25k - 50k per year,37500,salestarget.co.uk
124208,69967348,Assistant Quantity Surveyor High Spec Housing,Assistant Quantity Surveyor London City **** ...,City London South East,London,,permanent,Project Resource,Trade & Construction Jobs,23000 - 28000 per annum + private health,25500,careerstructure.com
125258,69989596,GENERAL MANAGER HOSPITALITY AND ENTERTAINMENT...,Fantastic General Manager opportunity to work ...,Macclesfield Cheshire North West,Macclesfield,,permanent,Detail2Leisure,Hospitality & Catering Jobs,"From 30,000 to 30,000 per annum + Bonus",30000,totaljobs.com
116213,69782688,Education Training and Employment Advisor,Swim are looking to appoint an Education Train...,"Merton, Greater London, South London",London,,,Swim Recruitment,Social work Jobs,20.70 per hour,39744,jobs.communitycare.co.uk


In [11]:
# split the held-out set into a validation set and a test set (50/50),
# stratified by Category
test_data = test

# `test` is rebound below, so running this cell a second time would silently
# split the already-halved test set again. Fail loudly instead.
if len(train) + len(test_data) != len(data):
    raise RuntimeError(
        "`test` no longer holds the full held-out split (was this cell already run?); "
        "re-run the train/test split cell first."
    )

# random_state is pinned so the split is reproducible regardless of the
# state of numpy's global random stream
valid, test = model_selection.train_test_split(
    test_data,
    test_size=0.5,
    stratify=test_data["Category"],
    random_state=5500,
)
print(test_data.shape)
print(valid.shape)
print(test.shape)

(73431, 12)
(36715, 12)
(36716, 12)


In [12]:
# write each subset to its own local csv file, without the index column
subset_files = (
    (train, "data/train.csv"),
    (valid, "data/valid.csv"),
    (test, "data/test.csv"),
)
for subset, csv_path in subset_files:
    subset.to_csv(csv_path, index=False)