### Naive Bayes Classifier for job postings.

### In this dataset we try to classify whether a job posting is fraudulent or not based on some features of the data.

In [1]:
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import numpy as np
#import itertools

In [2]:
# Load the job postings spreadsheet into a DataFrame (the path is relative to the working directory).
df = pd.read_excel('job_postings_excel.xlsx')


### features and the target class of the dataset.

In [3]:

# Every column between the first one and the last one is treated as a feature.
features = df.columns[1:-1].to_numpy()
print('features:', '\n', features, '\n')

# The last column holds the label we want to predict.
target_class = df.columns[-1]
print('target class:', '\n', target_class)


features: 
 ['title' 'location' 'department' 'salary_range' 'company_profile'
 'description' 'requirements' 'benefits' 'telecommuting'
 'has_company_logo' 'has_questions' 'employment_type'
 'required_experience' 'required_education' 'industry' 'function'] 

target class: 
 fraudulent


### compute priors of any job posting being fraudulent and not being fraudulent.

In [4]:

# np.unique returns the sorted distinct labels together with how often each one occurs,
# so index 0 of the counts belongs to the smallest label (reported below as "IS NOT fraudulent").
# NOTE(review): the priors are estimated from the full `df`, i.e. before the
# train/validation/test split made in a later cell - confirm this is intended.
class_lables, class_lables_counts = np.unique(df['fraudulent'], return_counts=True)

prior_prob_fraud_false = class_lables_counts[0] / np.sum(class_lables_counts)
# Assumes exactly two classes, so the second prior is the complement of the first.
prior_prob_fraud_true = 1 - prior_prob_fraud_false

print('class lables: ', class_lables, '\n', 'class labels counts: ', class_lables_counts, '\n')
print('prior probability job posting IS NOT fraudulent: ', prior_prob_fraud_false, '\n')
print('prior probability job posting IS fraudulent: ', prior_prob_fraud_true)


class lables:  [0 1] 
 class labels counts:  [17014   866] 

prior probability job posting IS NOT fraudulent:  0.9515659955257271 

prior probability job posting IS fraudulent:  0.04843400447427293


### split dataset into training, validation and test datasets.

In [5]:

# Sequential split by row label. `.loc` slices are label-based and include both
# end points, so the three ranges below are contiguous and do not overlap.
# NOTE(review): rows are taken in file order without shuffling - confirm the
# file is not ordered in a way that biases the split.
training_data = df.loc[:10728]
validation_data = df.loc[10729:16450]
test_data = df.loc[16451:]

# Show the first training row as an example of what one posting looks like.
print('training data example item:','\n\n', training_data.loc[0])


training data example item: 

 job_id                                                                 1
title                                                   Marketing Intern
location                                                   US,NY,NewYork
department                                                     Marketing
salary_range                                                         NaN
company_profile        We're Food52, and we've created a groundbreaki...
description            Food52, a fast-growing, James Beard Award-winn...
requirements           Experience with content management systems a m...
benefits                                                             NaN
telecommuting                                                          0
has_company_logo                                                       1
has_questions                                                          0
employment_type                                                    Other
required_experience 

### function to compute fraudulent counts of each feature.

In [6]:
# For now this function is only applied to the title feature.
# By default only the first 50 unique values are tallied (as before); pass
# max_unique_values=None to tally all of them.

def compute_fraud_counts(class_labels, feature_full_data, feature_unique_data, max_unique_values=50):
    """Count non-fraudulent and fraudulent rows for the leading unique feature values.

    Parameters
    ----------
    class_labels : sequence
        One label per row; 0 means not fraudulent, 1 means fraudulent.
        Rows with any other label are ignored.
    feature_full_data : sequence
        The feature value of every row, aligned position by position with
        ``class_labels``.
    feature_unique_data : indexable sequence
        Candidate feature values; only the first ``max_unique_values`` are used.
    max_unique_values : int or None, default 50
        Upper bound on how many unique values are tallied. The bound is
        clamped to ``len(feature_unique_data)``, so passing fewer unique
        values than the bound no longer raises ``IndexError``. ``None``
        tallies every unique value.

    Returns
    -------
    dict
        Maps each considered value to ``[count_label_0, count_label_1]``.
        A NaN value never compares equal to anything, so a NaN key always
        maps to ``[0, 0]``.
    """
    n_unique = len(feature_unique_data)
    if max_unique_values is not None:
        n_unique = min(n_unique, max_unique_values)

    # Seed one [not fraudulent, fraudulent] tally per considered value,
    # keeping the order of feature_unique_data.
    feature_unique_data_fraud_counts = {}
    for j in range(n_unique):
        feature_unique_data_fraud_counts[feature_unique_data[j]] = [0, 0]

    # One pass over the rows instead of one full scan per unique value.
    for label, value in zip(class_labels, feature_full_data):

        # NaN != NaN, so a missing value must not be matched to a NaN key
        # (a dictionary lookup could otherwise match it by object identity).
        if isinstance(value, float) and value != value:
            continue

        tally = feature_unique_data_fraud_counts.get(value)
        if tally is None:
            continue

        if label == 0:
            tally[0] += 1
        elif label == 1:
            tally[1] += 1

    return feature_unique_data_fraud_counts
    

### function to compute fraudulent counts for features with binary data (0, 1).

In [7]:
# Unlike the function above, this one tallies every unique value it is given:
# a binary (0/1) feature has so few unique values that there is no need to
# limit how many of them are considered.

def compute_fraud_counts_binary_data(class_labels, feature_full_data, feature_unique_data):
    """Tally class labels for every unique value of a feature.

    For each value in ``feature_unique_data`` the rows whose feature value
    equals it are counted separately for label 0 and label 1. Rows with any
    other label are ignored.

    Returns a dict mapping value -> ``[count_label_0, count_label_1]``.
    """
    row_total = class_labels.shape[0]
    tallies = {}

    for candidate in feature_unique_data:
        not_fraud_rows = 0
        fraud_rows = 0

        for row in range(row_total):
            label = class_labels[row]
            same_value = feature_full_data[row] == candidate

            if same_value and label == 0:
                not_fraud_rows += 1
            elif same_value and label == 1:
                fraud_rows += 1

        tallies[candidate] = [not_fraud_rows, fraud_rows]

    return tallies


### function to compute fraudulent counts for features employment_type, required_education and required_experience.

In [8]:
# Used for employment_type, required_education and required_experience: these
# features have few enough unique values that tallying all of them does not
# slow the computation down.

def compute_fraud_counts_feature(class_labels, feature_full_data, feature_unique_data):
    """Tally class labels for every unique value of a feature, NaN included.

    A value counts as missing when its string form is ``'nan'``. Missing
    rows are tallied under the missing unique value; all other rows are
    tallied under the unique value they are equal to. Only labels 0 and 1
    are counted.

    Returns a dict mapping value -> ``[count_label_0, count_label_1]``.
    """
    row_total = class_labels.shape[0]
    tallies = {}

    for value in feature_unique_data:
        value_is_missing = str(value) == 'nan'
        not_fraud_rows = 0
        fraud_rows = 0

        for row in range(row_total):
            row_value = feature_full_data[row]
            row_is_missing = str(row_value) == 'nan'

            if row_is_missing or value_is_missing:
                # A missing entry only ever matches another missing entry.
                matched = row_is_missing and value_is_missing
            else:
                matched = value == row_value

            if not matched:
                continue

            label = class_labels[row]
            if label == 0:
                not_fraud_rows += 1
            elif label == 1:
                fraud_rows += 1

        tallies[value] = [not_fraud_rows, fraud_rows]

    return tallies


### function to compute the probabilities of each unique value of a feature with respect to each class label, applying Laplacian smoothing when necessary.

In [9]:

def compute_probabilities(feature_values_fraud_counts):
    """Turn ``[count_label_0, count_label_1]`` tallies into proportions.

    For every key the two counts are divided by their sum. When either
    count is zero, Laplacian smoothing is applied first: one fake datapoint
    is added to each of the two counts (so the denominator grows by 2).

    Returns a dict mapping each key to ``[prob_label_0, prob_label_1]``.
    """
    probabilities = {}

    for value, counts in feature_values_fraud_counts.items():
        not_fraud_rows = counts[0]
        fraud_rows = counts[1]

        # Smooth only when one of the classes was never observed for this value.
        pseudo_count = 1 if (not_fraud_rows == 0 or fraud_rows == 0) else 0
        denominator = sum(counts) + 2 * pseudo_count

        probabilities[value] = [
            (not_fraud_rows + pseudo_count) / denominator,
            (fraud_rows + pseudo_count) / denominator,
        ]

    return probabilities


### function for printing dictionary values in formatted output.

In [10]:

def print_dictionary(dictionary):
    """Print each entry of ``dictionary`` on its own line as ``key value``."""
    line_template = "{} {}"
    for entry in dictionary.items():
        print(line_template.format(*entry))
        

### title feature.

compute fraud counts of each unique title with respect to the class labels.

In [11]:

# Lower-case the titles so that differently-cased spellings of a title are tallied together.
training_data_titles = training_data['title'].str.lower()
unique_titles = np.array(training_data_titles.drop_duplicates())

# Tally [not fraudulent, fraudulent] counts per title; compute_fraud_counts
# only looks at the first 50 unique titles.
titles_and_fraud_counts = compute_fraud_counts(training_data['fraudulent'], training_data_titles, unique_titles)

print('title [title_fraud_false, title_fraud_true]', '\n')
print_dictionary(titles_and_fraud_counts)


title [title_fraud_false, title_fraud_true] 

marketing intern [11, 0]
customer service - cloud video production [1, 0]
commissioning machinery assistant (cma) [1, 0]
account executive - washington dc [1, 0]
bill review manager [1, 0]
accounting clerk [1, 1]
head of content (m/f) [1, 0]
lead guest service specialist [1, 0]
hp bsm sme [2, 0]
customer service associate - part time [64, 0]
asp.net developer job opportunity at united states,new jersey [1, 0]
talent sourcer (6 months fixed-term contract) [1, 0]
applications developer, digital [1, 0]
installers [2, 0]
account executive - sydney [1, 0]
vp of sales - vault dragon [1, 0]
hands-on qa leader [1, 0]
southend-on-sea traineeships under nas 16-18 year olds only [1, 0]
visual designer [6, 0]
process controls engineer - dcs plc ms office - pa [1, 0]
marketing assistant [8, 0]
front end developer [26, 0]
engagement manager [4, 0]
vice president, sales and sponsorship (businessfriend.com) [1, 0]
customer service [2, 0]
h1b sponsor for l1

compute probabilities of each unique title with respect to the class labels.

In [12]:

# Convert the per-title counts into per-title class proportions
# (smoothed by compute_probabilities when one of the two counts is zero).
titles_and_fraud_prob = compute_probabilities(titles_and_fraud_counts)

print('title [prob_title_fraud_false, prob_title_fraud_true]', '\n')
print_dictionary(titles_and_fraud_prob)


title [prob_title_fraud_false, prob_title_fraud_true] 

marketing intern [0.9230769230769231, 0.07692307692307693]
customer service - cloud video production [0.6666666666666666, 0.3333333333333333]
commissioning machinery assistant (cma) [0.6666666666666666, 0.3333333333333333]
account executive - washington dc [0.6666666666666666, 0.3333333333333333]
bill review manager [0.6666666666666666, 0.3333333333333333]
accounting clerk [0.5, 0.5]
head of content (m/f) [0.6666666666666666, 0.3333333333333333]
lead guest service specialist [0.6666666666666666, 0.3333333333333333]
hp bsm sme [0.75, 0.25]
customer service associate - part time [0.9848484848484849, 0.015151515151515152]
asp.net developer job opportunity at united states,new jersey [0.6666666666666666, 0.3333333333333333]
talent sourcer (6 months fixed-term contract) [0.6666666666666666, 0.3333333333333333]
applications developer, digital [0.6666666666666666, 0.3333333333333333]
installers [0.75, 0.25]
account executive - sydney [0.

### location feature.

compute fraud counts of each unique location with respect to the class labels.

In [13]:

training_data_locations = training_data['location'].str.lower()
locations = []
unique_locations = []

# Note: a NaN location may mean that the posting is more likely to be fraudulent, or simply
# that no location was provided. Either way we treat NaN as its own value meaning
# "job posting has no location". The same reasoning applies to the NaN values of later features.

# Get the locations: each row becomes either NaN or the list of its comma-separated parts.
for i in range(training_data_locations.shape[0]):

    if str(training_data_locations[i]) == 'nan':
        locations.append( training_data_locations[i] )

    else:
        locations.append( training_data_locations[i].split(',') )

# From the collected locations get the unique location parts (location=nan is also kept, for the
# reasons mentioned above). Because the dataset is too large for the computation that follows,
# only the first 20 rows of the locations list are used to collect unique parts.
# Note: it would be hard to decide which parts are an abbreviation and a full name of the same place,
# so they are treated as different (i.e. if locations[j] = [us,ny,newyork], ny and newyork are distinct).

for j in range(20):
    locations_subset = locations[j]

    if str(locations_subset) == 'nan':
        if locations_subset not in unique_locations:
            unique_locations.append(locations_subset)

    else:
        for k in range(len(locations_subset)):
            if locations_subset[k] not in unique_locations:
                unique_locations.append(locations_subset[k])

print('20 subsets of locations:', '\n', locations[:20], '\n')
print('unique locations:', '\n', unique_locations)


20 subsets of locations: 
 [['us', 'ny', 'newyork'], ['nz', 'auckland'], ['us', 'ia', 'wever'], ['us', 'dc', 'washington'], ['us', 'fl', 'fortworth'], ['us', 'md'], ['de', 'be', 'berlin'], ['us', 'ca', 'sanfrancisco'], ['us', 'fl', 'pensacola'], ['us', 'az', 'phoenix'], ['us', 'nj', 'jerseycity'], ['gb', 'lnd', 'london'], ['us', 'ct', 'stamford'], ['us', 'fl', 'orlando'], ['au', 'nsw', 'sydney'], ['sg', '01', 'singapore'], ['il', 'telaviv', 'israel'], ['gb', 'sos', 'southend-on-sea'], ['us', 'ny', 'newyork'], ['us', 'pa', 'usanortheast']] 

unique locations: 
 ['us', 'ny', 'newyork', 'nz', 'auckland', 'ia', 'wever', 'dc', 'washington', 'fl', 'fortworth', 'md', 'de', 'be', 'berlin', 'ca', 'sanfrancisco', 'pensacola', 'az', 'phoenix', 'nj', 'jerseycity', 'gb', 'lnd', 'london', 'ct', 'stamford', 'orlando', 'au', 'nsw', 'sydney', 'sg', '01', 'singapore', 'il', 'telaviv', 'israel', 'sos', 'southend-on-sea', 'pa', 'usanortheast']


In [14]:

# For each unique location part get the counts of fraudulent=0 and fraudulent=1.
# A row is tallied under a part when that part appears anywhere in the row's
# list of comma-separated location parts, so one row can contribute to several parts.

locations_and_fraud_counts = {}
location_fraud_false = 0
location_fraud_true = 0

for location in unique_locations:

    for i in range(training_data['fraudulent'].shape[0]):

        if str(locations[i]) == 'nan':

            # for locations[i] == nan value, we only have to consider whether the specific location is a nan value
            # and if it's not, continue with the next iteration on i

            if str(location) == 'nan':

                if training_data['fraudulent'].loc[i] == 0:
                    location_fraud_false += 1

                elif training_data['fraudulent'].loc[i] == 1:
                    location_fraud_true += 1

        else:
            # for valid locations we only have to consider a location with a non nan value.

            if str(location) != 'nan':

                if training_data['fraudulent'].loc[i] == 0 and location in locations[i]:
                    location_fraud_false += 1

                elif training_data['fraudulent'].loc[i] == 1 and location in locations[i]:
                    location_fraud_true += 1

    # Store the tallies for this part and reset the counters for the next one.
    locations_and_fraud_counts[location] = [location_fraud_false, location_fraud_true]
    location_fraud_false = 0
    location_fraud_true = 0

print('location [location_fraud_false, location_fraud_true]', '\n')
print_dictionary(locations_and_fraud_counts)


location [location_fraud_false, location_fraud_true] 

us [5856, 404]
ny [734, 32]
newyork [413, 5]
nz [198, 0]
auckland [143, 0]
ia [45, 2]
wever [1, 0]
dc [221, 1]
washington [213, 1]
fl [199, 16]
fortworth [18, 4]
md [53, 18]
de [234, 1]
be [228, 0]
berlin [150, 0]
ca [1396, 86]
sanfrancisco [261, 5]
pensacola [5, 1]
az [87, 1]
phoenix [43, 1]
nj [116, 2]
jerseycity [11, 0]
gb [1414, 11]
lnd [605, 3]
london [678, 3]
ct [82, 2]
stamford [12, 0]
orlando [17, 2]
au [93, 5]
nsw [48, 2]
sydney [30, 2]
sg [43, 0]
01 [23, 0]
singapore [14, 0]
il [282, 8]
telaviv [19, 0]
israel [1, 0]
sos [1, 0]
southend-on-sea [1, 0]
pa [179, 8]
usanortheast [1, 0]


compute the probabilities of each unique location with respect to the class labels.

In [15]:

# Convert the per-location counts into per-location class proportions
# (smoothed by compute_probabilities when one of the two counts is zero).
locations_and_fraud_prob = compute_probabilities(locations_and_fraud_counts)

print('location [prob_location_fraud_false, prob_location_fraud_true]', '\n')
print_dictionary(locations_and_fraud_prob)


location [prob_location_fraud_false, prob_location_fraud_true] 

us [0.9354632587859425, 0.0645367412140575]
ny [0.95822454308094, 0.04177545691906005]
newyork [0.9880382775119617, 0.011961722488038277]
nz [0.995, 0.005]
auckland [0.993103448275862, 0.006896551724137931]
ia [0.9574468085106383, 0.0425531914893617]
wever [0.6666666666666666, 0.3333333333333333]
dc [0.9954954954954955, 0.0045045045045045045]
washington [0.9953271028037384, 0.004672897196261682]
fl [0.9255813953488372, 0.07441860465116279]
fortworth [0.8181818181818182, 0.18181818181818182]
md [0.7464788732394366, 0.2535211267605634]
de [0.9957446808510638, 0.00425531914893617]
be [0.9956521739130435, 0.004347826086956522]
berlin [0.993421052631579, 0.006578947368421052]
ca [0.941970310391363, 0.058029689608636977]
sanfrancisco [0.981203007518797, 0.018796992481203006]
pensacola [0.8333333333333334, 0.16666666666666666]
az [0.9886363636363636, 0.011363636363636364]
phoenix [0.9772727272727273, 0.022727272727272728]
nj [0.

### department feature.

compute fraud counts of each unique department with respect to the class labels.

In [16]:

# Lower-case the departments so that differently-cased spellings are tallied together.
training_data_departments = training_data['department'].str.lower()
departments = np.array(training_data_departments.drop_duplicates())

# Again only a small set of departments is considered: the first 20 unique ones.
# compute_fraud_counts_feature applies the same NaN-aware matching this cell used
# to spell out inline, and slicing (instead of indexing 0..19) keeps the cell
# working when fewer than 20 unique departments exist.
departments_and_fraud_counts = compute_fraud_counts_feature(
    training_data['fraudulent'], training_data_departments, departments[:20]
)

print('20 unique departments:', '\n', departments[:20], '\n')
print('department [department_fraud_false, department_fraud_true]','\n')
print_dictionary(departments_and_fraud_counts)


20 unique departments: 
 ['marketing' 'success' nan 'sales' 'androidpit' 'hr' 'r&d' 'engagement'
 'businessfriend.com' 'medical' 'field' 'all' 'design' 'production' 'icm'
 'general services' 'engineering' 'it' 'business development'
 'human resources'] 

department [department_fraud_false, department_fraud_true] 

marketing [245, 1]
success [4, 0]
nan [6620, 258]
sales [345, 9]
androidpit [1, 0]
hr [36, 3]
r&d [23, 0]
engagement [12, 0]
businessfriend.com [1, 0]
medical [6, 2]
field [2, 0]
all [11, 0]
design [40, 0]
production [18, 0]
icm [1, 0]
general services [1, 0]
engineering [255, 45]
it [158, 1]
business development [21, 0]
human resources [19, 2]


compute probabilities of each unique department with respect to the class labels.

In [17]:

# Convert the per-department counts into per-department class proportions
# (smoothed by compute_probabilities when one of the two counts is zero).
departments_and_fraud_prob = compute_probabilities(departments_and_fraud_counts)

print('department [prob_department_fraud_false, prob_department_fraud_true]','\n')
print_dictionary(departments_and_fraud_prob)


department [prob_department_fraud_false, prob_department_fraud_true] 

marketing [0.9959349593495935, 0.0040650406504065045]
success [0.8333333333333334, 0.16666666666666666]
nan [0.9624890956673452, 0.03751090433265484]
sales [0.9745762711864406, 0.025423728813559324]
androidpit [0.6666666666666666, 0.3333333333333333]
hr [0.9230769230769231, 0.07692307692307693]
r&d [0.96, 0.04]
engagement [0.9285714285714286, 0.07142857142857142]
businessfriend.com [0.6666666666666666, 0.3333333333333333]
medical [0.75, 0.25]
field [0.75, 0.25]
all [0.9230769230769231, 0.07692307692307693]
design [0.9761904761904762, 0.023809523809523808]
production [0.95, 0.05]
icm [0.6666666666666666, 0.3333333333333333]
general services [0.6666666666666666, 0.3333333333333333]
engineering [0.85, 0.15]
it [0.9937106918238994, 0.006289308176100629]
business development [0.9565217391304348, 0.043478260869565216]
human resources [0.9047619047619048, 0.09523809523809523]


### salary_range feature.

compute fraud counts of each unique value in salary_range feature with respect to the class labels.

In [18]:

training_data_salary_ranges = training_data['salary_range'].str.lower()
salary_ranges = np.array(training_data_salary_ranges.drop_duplicates())

# Again only a small set of salary ranges is considered: the first 20 unique ones.
# compute_fraud_counts_feature applies the same NaN-aware matching this cell used
# to spell out inline, and slicing (instead of indexing 0..19) keeps the cell
# working when fewer than 20 unique salary ranges exist.
salary_range_and_fraud_counts = compute_fraud_counts_feature(
    training_data['fraudulent'], training_data_salary_ranges, salary_ranges[:20]
)

print('20 unique salary ranges:', '\n', salary_ranges[:20], '\n')
print('salary range [salary_range_fraud_false, salary_range_fraud_true]','\n')
print_dictionary(salary_range_and_fraud_counts)


20 unique salary ranges: 
 [nan '20000-28000' '100000-120000' '120000-150000' '50000-65000'
 '40000-50000' '60-80' '65000-70000' '75-115' '75000-110000' '17000-20000'
 '16000-28000' '95000-115000' '15000-18000' '50000-70000' '45000-60000'
 '30000-40000' '70000-90000' '10000-14000' '50-110'] 

salary range [salary_range_fraud_false, salary_range_fraud_true] 

nan [8698, 333]
20000-28000 [1, 0]
100000-120000 [9, 0]
120000-150000 [5, 0]
50000-65000 [12, 1]
40000-50000 [50, 1]
60-80 [1, 0]
65000-70000 [6, 0]
75-115 [1, 0]
75000-110000 [2, 0]
17000-20000 [3, 0]
16000-28000 [1, 0]
95000-115000 [1, 4]
15000-18000 [3, 0]
50000-70000 [19, 0]
45000-60000 [11, 0]
30000-40000 [30, 4]
70000-90000 [26, 0]
10000-14000 [2, 0]
50-110 [1, 0]


compute probabilities of each unique salary range value with respect to the class labels.

In [19]:

# Convert the per-salary-range counts into per-salary-range class proportions
# (smoothed by compute_probabilities when one of the two counts is zero).
salary_range_and_fraud_prob = compute_probabilities(salary_range_and_fraud_counts)

print('salary range [prob_salary_range_fraud_false, prob_salary_range_fraud_true]','\n')
print_dictionary(salary_range_and_fraud_prob)


salary range [prob_salary_range_fraud_false, prob_salary_range_fraud_true] 

nan [0.9631270069759716, 0.03687299302402835]
20000-28000 [0.6666666666666666, 0.3333333333333333]
100000-120000 [0.9090909090909091, 0.09090909090909091]
120000-150000 [0.8571428571428571, 0.14285714285714285]
50000-65000 [0.9230769230769231, 0.07692307692307693]
40000-50000 [0.9803921568627451, 0.0196078431372549]
60-80 [0.6666666666666666, 0.3333333333333333]
65000-70000 [0.875, 0.125]
75-115 [0.6666666666666666, 0.3333333333333333]
75000-110000 [0.75, 0.25]
17000-20000 [0.8, 0.2]
16000-28000 [0.6666666666666666, 0.3333333333333333]
95000-115000 [0.2, 0.8]
15000-18000 [0.8, 0.2]
50000-70000 [0.9523809523809523, 0.047619047619047616]
45000-60000 [0.9230769230769231, 0.07692307692307693]
30000-40000 [0.8823529411764706, 0.11764705882352941]
70000-90000 [0.9642857142857143, 0.03571428571428571]
10000-14000 [0.75, 0.25]
50-110 [0.6666666666666666, 0.3333333333333333]


### telecommuting feature.

compute fraud counts of each unique value in telecommuting feature with respect to the class labels.

In [20]:

# Collect the column and its distinct values; every distinct value is tallied.
training_data_telecommutings = training_data['telecommuting']
telecommutings_unique = np.unique(training_data['telecommuting'])

# Tally [not fraudulent, fraudulent] counts for each distinct telecommuting value.
telecommutings_and_fraud_counts = compute_fraud_counts_binary_data(training_data['fraudulent'], training_data_telecommutings, telecommutings_unique)

print('telecommutings unique values: ', telecommutings_unique, '\n')
print('telecommuting [telecomuting_fraud_false, telecomuting_fraud_true]','\n')
print_dictionary(telecommutings_and_fraud_counts)


telecommutings unique values:  [0 1] 

telecommuting [telecomuting_fraud_false, telecomuting_fraud_true] 

0 [9831, 421]
1 [437, 40]


compute probabilities of each unique telecommuting value with respect to the class labels.

In [21]:

# Convert the per-value counts into per-value class proportions
# (smoothed by compute_probabilities when one of the two counts is zero).
telecommutings_and_fraud_prob = compute_probabilities(telecommutings_and_fraud_counts)

print('telecommuting [prob_telecommuting_fraud_false, prob_telecommuting_fraud_true]','\n')
print_dictionary(telecommutings_and_fraud_prob)


telecommuting [prob_telecommuting_fraud_false, prob_telecommuting_fraud_true] 

0 [0.9589348419820523, 0.04106515801794772]
1 [0.9161425576519916, 0.08385744234800839]


### has_company_logo feature.

compute fraud counts of each unique value in has_company_logo feature with respect to the class labels.

In [22]:

# Collect the column and its distinct values; every distinct value is tallied.
training_data_has_company_logo = training_data['has_company_logo']
has_company_logo_unique = np.unique(training_data['has_company_logo'])

# Tally [not fraudulent, fraudulent] counts for each distinct has_company_logo value.
has_company_logo_and_fraud_counts = compute_fraud_counts_binary_data(training_data['fraudulent'], training_data_has_company_logo, has_company_logo_unique)

print('has company logo unique values: ', has_company_logo_unique, '\n')
print('has_company_logo [has_company_logo_fraud_false, has_company_logo_fraud_true]', '\n')
print_dictionary(has_company_logo_and_fraud_counts)


has company logo unique values:  [0 1] 

has_company_logo [has_company_logo_fraud_false, has_company_logo_fraud_true] 

0 [1873, 275]
1 [8395, 186]


compute probabilities of each unique has_company_logo feature value with respect to the class labels.

In [23]:

# Convert the per-value counts into per-value class proportions
# (smoothed by compute_probabilities when one of the two counts is zero).
has_company_logo_and_fraud_prob = compute_probabilities(has_company_logo_and_fraud_counts)

print('has_company_logo [prob_has_company_logo_fraud_false, prob_has_company_logo_fraud_true]', '\n')
print_dictionary(has_company_logo_and_fraud_prob)


has_company_logo [prob_has_company_logo_fraud_false, prob_has_company_logo_fraud_true] 

0 [0.8719739292364991, 0.12802607076350092]
1 [0.9783242046381541, 0.02167579536184594]


### has_questions feature.

compute fraud counts of each unique value in has_questions feature with respect to the class labels.

In [24]:

# Collect the column and its distinct values; every distinct value is tallied.
training_data_has_questions = training_data['has_questions']
has_questions_unique = np.unique(training_data['has_questions'])

# Tally [not fraudulent, fraudulent] counts for each distinct has_questions value.
has_questions_and_fraud_counts = compute_fraud_counts_binary_data(training_data['fraudulent'], training_data_has_questions, has_questions_unique)

print('has questions unique values: ', has_questions_unique, '\n')
print('has_questions [has_questions_fraud_false, has_questions_fraud_true]', '\n')
print_dictionary(has_questions_and_fraud_counts)


has questions unique values:  [0 1] 

has_questions [has_questions_fraud_false, has_questions_fraud_true] 

0 [4702, 310]
1 [5566, 151]


compute probabilities of each unique has_questions feature value with respect to the class labels.

In [25]:

# Convert the per-value counts into per-value class proportions
# (smoothed by compute_probabilities when one of the two counts is zero).
has_questions_and_fraud_prob = compute_probabilities(has_questions_and_fraud_counts)

print('has_questions [prob_has_questions_fraud_false, prob_has_questions_fraud_true]', '\n')
print_dictionary(has_questions_and_fraud_prob)


has_questions [prob_has_questions_fraud_false, prob_has_questions_fraud_true] 

0 [0.9381484437350359, 0.06185155626496409]
1 [0.9735875459156901, 0.026412454084309953]


### employment_type feature.

compute fraud counts of each unique employment_type feature value with respect to the class labels.

In [26]:

# Lower-case the values so that differently-cased spellings are tallied together;
# drop_duplicates keeps NaN as one of the unique values.
training_data_employment_types = training_data['employment_type'].str.lower()
unique_employment_types = np.array(training_data_employment_types.drop_duplicates())

# Tally [not fraudulent, fraudulent] counts for every unique employment type, NaN included.
employment_types_and_fraud_counts = compute_fraud_counts_feature(training_data['fraudulent'], training_data_employment_types, unique_employment_types)

print('employment types unique values:', '\n', unique_employment_types, '\n')
print('employment_type [employment_type_fraud_false, employment_type_fraud_true]', '\n')
print_dictionary(employment_types_and_fraud_counts)


employment types unique values: 
 ['other' 'full-time' nan 'part-time' 'contract' 'temporary'] 

employment_type [employment_type_fraud_false, employment_type_fraud_true] 

other [128, 4]
full-time [6405, 289]
nan [2098, 124]
part-time [445, 17]
contract [1040, 26]
temporary [152, 1]


compute probabilities of each unique employment_type feature value with respect to the class labels.

In [27]:

# Convert the per-value counts into per-value class proportions
# (smoothed by compute_probabilities when one of the two counts is zero).
employment_types_and_fraud_prob = compute_probabilities(employment_types_and_fraud_counts)

print('employment_type [prob_employment_type_fraud_false, prob_employment_type_fraud_true]', '\n')
print_dictionary(employment_types_and_fraud_prob)


employment_type [prob_employment_type_fraud_false, prob_employment_type_fraud_true] 

other [0.9696969696969697, 0.030303030303030304]
full-time [0.9568270092620257, 0.04317299073797431]
nan [0.9441944194419442, 0.0558055805580558]
part-time [0.9632034632034632, 0.0367965367965368]
contract [0.975609756097561, 0.024390243902439025]
temporary [0.9934640522875817, 0.006535947712418301]


### required_experience feature.

compute fraud counts of each unique required_experience feature value with respect to the class labels.

In [28]:

# Lower-case the values so that differently-cased spellings are tallied together;
# drop_duplicates keeps NaN as one of the unique values.
training_data_required_experience = training_data['required_experience'].str.lower()
required_experience_unique = np.array(training_data_required_experience.drop_duplicates())

# Tally [not fraudulent, fraudulent] counts for every unique experience level, NaN included.
required_experience_and_fraud_counts = compute_fraud_counts_feature(training_data['fraudulent'], training_data_required_experience, required_experience_unique)

print('required experience unique values:', '\n', required_experience_unique, '\n')
print('required experience [required_experience_fraud_false, required_experience_fraud_true]', '\n')
print_dictionary(required_experience_and_fraud_counts)


required experience unique values: 
 ['internship' 'not applicable' nan 'mid-senior level' 'associate'
 'entry level' 'executive' 'director'] 

required experience [required_experience_fraud_false, required_experience_fraud_true] 

internship [230, 6]
not applicable [593, 21]
nan [3987, 233]
mid-senior level [2157, 67]
associate [1379, 24]
entry level [1610, 91]
executive [80, 8]
director [232, 11]


compute probabilities of each unique required_experience feature value with respect to the class labels.

In [29]:

# Convert the per-value counts into per-value class proportions
# (smoothed by compute_probabilities when one of the two counts is zero).
required_experience_and_fraud_prob = compute_probabilities(required_experience_and_fraud_counts)

print('required_experience [prob_required_experience_fraud_false, prob_required_experience_fraud_true]', '\n')
print_dictionary(required_experience_and_fraud_prob)


required_experience [prob_required_experience_fraud_false, prob_required_experience_fraud_true] 

internship [0.9745762711864406, 0.025423728813559324]
not applicable [0.9657980456026058, 0.03420195439739414]
nan [0.9447867298578199, 0.055213270142180096]
mid-senior level [0.9698741007194245, 0.03012589928057554]
associate [0.9828937990021382, 0.017106200997861726]
entry level [0.9465020576131687, 0.053497942386831275]
executive [0.9090909090909091, 0.09090909090909091]
director [0.9547325102880658, 0.04526748971193416]


### required_education feature.

compute fraud counts of each unique required_education feature value with respect to the class labels.

In [30]:

# Lower-case the values so that differently-cased spellings are tallied together;
# drop_duplicates keeps NaN as one of the unique values.
training_data_required_education = training_data['required_education'].str.lower()
required_education_unique = np.array(training_data_required_education.drop_duplicates())

# Tally [not fraudulent, fraudulent] counts for every unique education level, NaN included.
required_education_and_fraud_counts = compute_fraud_counts_feature(training_data['fraudulent'], training_data_required_education, required_education_unique)

print('required education unique values:', '\n', required_education_unique, '\n')
print('required education [required_education_fraud_false, required_education_fraud_true]','\n')
print_dictionary(required_education_and_fraud_counts)


required education unique values: 
 [nan "bachelor's degree" "master's degree" 'high school or equivalent'
 'unspecified' 'some college coursework completed' 'vocational'
 'certification' 'associate degree' 'professional' 'doctorate'
 'some high school coursework' 'vocational - degree'
 'vocational - hs diploma'] 

required education [required_education_fraud_false, required_education_fraud_true] 

nan [4434, 221]
bachelor's degree [3192, 62]
master's degree [240, 21]
high school or equivalent [1158, 98]
unspecified [853, 25]
some college coursework completed [66, 1]
vocational [24, 0]
certification [80, 11]
associate degree [148, 5]
professional [39, 3]
doctorate [20, 0]
some high school coursework [6, 14]
vocational - degree [5, 0]
vocational - hs diploma [3, 0]


compute probabilities of each unique required_education feature value with respect to the class labels.

In [31]:

# Turn the per-value [fraud_false, fraud_true] counts of required_education into a
# pair of probabilities per value (compute_probabilities is defined outside this excerpt).
# NOTE(review): in the recorded output, values with a zero count are not reported as
# probability 0 (e.g. [24, 0] -> [0.9615, 0.0385], i.e. 25/26 and 1/26), which looks like
# add-one smoothing applied only to those rows -- confirm against the helper.
required_education_and_fraud_prob = compute_probabilities(required_education_and_fraud_counts)
    
# Header line, then one "value [prob_fraud_false, prob_fraud_true]" row per value.
print('required_education [prob_required_education_fraud_false, prob_required_education_fraud_true]','\n')
print_dictionary(required_education_and_fraud_prob)


required_education [prob_required_education_fraud_false, prob_required_education_fraud_true] 

nan [0.9525241675617615, 0.047475832438238455]
bachelor's degree [0.9809465273509527, 0.019053472649047325]
master's degree [0.9195402298850575, 0.08045977011494253]
high school or equivalent [0.9219745222929936, 0.07802547770700637]
unspecified [0.9715261958997722, 0.02847380410022779]
some college coursework completed [0.9850746268656716, 0.014925373134328358]
vocational [0.9615384615384616, 0.038461538461538464]
certification [0.8791208791208791, 0.12087912087912088]
associate degree [0.9673202614379085, 0.032679738562091505]
professional [0.9285714285714286, 0.07142857142857142]
doctorate [0.9545454545454546, 0.045454545454545456]
some high school coursework [0.3, 0.7]
vocational - degree [0.8571428571428571, 0.14285714285714285]
vocational - hs diploma [0.8, 0.2]


### industry feature.

compute fraud counts of each unique industry feature value with respect to the class labels.

In [32]:

# Lower-case the column so values that differ only in letter case are counted together.
training_data_industry = training_data['industry'].str.lower()
# Distinct industries in order of first appearance; a missing value (nan) is kept
# as its own entry.
industries_unique = np.array(training_data_industry.drop_duplicates())

# again we will consider only a small set of industries, 20 industries.
# The limit is applied with a slice, so a training set with fewer than 20 distinct
# industries is handled instead of raising IndexError.
MAX_INDUSTRIES = 20

# Class-label masks are computed once and reused for every industry, instead of
# looking up each row's label again for each of the selected industries.
is_fraud_false = training_data['fraudulent'] == 0
is_fraud_true = training_data['fraudulent'] == 1

# industry value -> [number of non-fraudulent rows, number of fraudulent rows]
industries_and_fraud_counts = {}
for industry in industries_unique[:MAX_INDUSTRIES]:
    if pd.isna(industry):
        # NaN never compares equal to anything, so missing rows are selected explicitly.
        industry_matches = training_data_industry.isna()
    else:
        industry_matches = training_data_industry == industry

    # int(...) keeps the stored counts plain Python ints, as the counters were before.
    industries_and_fraud_counts[industry] = [
        int((industry_matches & is_fraud_false).sum()),
        int((industry_matches & is_fraud_true).sum()),
    ]

print('industries unique:', '\n', industries_unique, '\n')
print('industry [industry_fraud_false, industry_fraud_true]','\n')
print_dictionary(industries_and_fraud_counts)


industries unique: 
 [nan 'marketing and advertising' 'computer software'
 'hospital & health care' 'online media'
 'information technology and services' 'financial services'
 'management consulting' 'events services' 'internet'
 'facilities services' 'consumer electronics' 'telecommunications'
 'consumer services' 'construction' 'oil & energy' 'education management'
 'building materials' 'banking' 'food & beverages' 'food production'
 'health, wellness and fitness' 'insurance' 'e-learning' 'cosmetics'
 'staffing and recruiting' 'venture capital & private equity'
 'leisure, travel & tourism' 'human resources' 'pharmaceuticals' 'farming'
 'legal services' 'luxury goods & jewelry' 'machinery' 'real estate'
 'mechanical or industrial engineering'
 'public relations and communications' 'consumer goods' 'medical practice'
 'electrical/electronic manufacturing' 'hospitality' 'music'
 'market research' 'automotive' 'philanthropy' 'utilities'
 'primary/secondary education' 'logistics and suppl

compute probabilities of each unique industry feature value with respect to the class labels.

In [33]:

# Turn the per-industry [fraud_false, fraud_true] counts into a pair of probabilities
# per industry (compute_probabilities is defined outside this excerpt). Only the
# industries present in industries_and_fraud_counts get an entry.
industries_and_fraud_prob = compute_probabilities(industries_and_fraud_counts)
    
# Header line, then one "industry [prob_fraud_false, prob_fraud_true]" row per industry.
print('industry [prob_industry_fraud_false, prob_industry_fraud_true]','\n')
print_dictionary(industries_and_fraud_prob)


industry [prob_industry_fraud_false, prob_industry_fraud_true] 

nan [0.9608240459304289, 0.03917595406957109]
marketing and advertising [0.9490835030549898, 0.05091649694501019]
computer software [0.9963235294117647, 0.003676470588235294]
hospital & health care [0.9047619047619048, 0.09523809523809523]
online media [0.9852941176470589, 0.014705882352941176]
information technology and services [0.9842436974789915, 0.015756302521008403]
financial services [0.9624765478424016, 0.0375234521575985]
management consulting [0.9305555555555556, 0.06944444444444445]
events services [0.9714285714285714, 0.02857142857142857]
internet [0.9983739837398374, 0.0016260162601626016]
facilities services [0.9830508474576272, 0.01694915254237288]
consumer electronics [0.9743589743589743, 0.02564102564102564]
telecommunications [0.9330143540669856, 0.06698564593301436]
consumer services [0.9521276595744681, 0.047872340425531915]
construction [0.9690721649484536, 0.030927835051546393]
oil & energy [0.589861

### function feature.

compute fraud counts of each unique function feature value with respect to the class labels.

In [34]:

# Lower-case the column so values that differ only in letter case are counted together.
training_data_function = training_data['function'].str.lower()
# Distinct values in order of first appearance; a missing value (nan) is kept as
# its own entry, as the recorded output of this cell shows.
functions_unique = np.array(training_data_function.drop_duplicates())

# For every distinct value, count the training rows per class label.
# compute_fraud_counts_feature is defined outside this excerpt; the printed result
# is a dict of value -> [fraud_false count, fraud_true count].
functions_and_fraud_counts = compute_fraud_counts_feature(training_data['fraudulent'], training_data_function, functions_unique)

print('function unique values:', '\n', functions_unique, '\n')
print('function [function_fraud_false, function_fraud_true]','\n')
print_dictionary(functions_and_fraud_counts)


function unique values: 
 ['marketing' 'customer service' nan 'sales' 'health care provider'
 'management' 'information technology' 'other' 'engineering'
 'administrative' 'design' 'production' 'education' 'supply chain'
 'business development' 'product management' 'financial analyst'
 'consulting' 'human resources' 'project management' 'manufacturing'
 'public relations' 'strategy/planning' 'advertising' 'finance'
 'general business' 'research' 'accounting/auditing' 'art/creative'
 'quality assurance' 'data analyst' 'business analyst' 'writing/editing'
 'distribution' 'science' 'training' 'purchasing' 'legal'] 

function [function_fraud_false, function_fraud_true] 

marketing [479, 4]
customer service [657, 41]
nan [3652, 155]
sales [854, 22]
health care provider [207, 0]
management [203, 6]
information technology [1036, 14]
other [184, 21]
engineering [681, 90]
administrative [347, 51]
design [191, 1]
production [68, 0]
education [281, 1]
supply chain [16, 0]
business development [14

compute probabilities of each unique function feature value with respect to the class labels.

In [35]:

# Turn the per-value [fraud_false, fraud_true] counts of the function feature into a
# pair of probabilities per value (compute_probabilities is defined outside this excerpt).
functions_and_fraud_prob = compute_probabilities(functions_and_fraud_counts)
    
# Header line, then one "value [prob_fraud_false, prob_fraud_true]" row per value.
print('function [prob_function_fraud_false, prob_function_fraud_true]','\n')
print_dictionary(functions_and_fraud_prob)


function [prob_function_fraud_false, prob_function_fraud_true] 

marketing [0.9917184265010351, 0.008281573498964804]
customer service [0.9412607449856734, 0.05873925501432665]
nan [0.9592855266614132, 0.04071447333858681]
sales [0.9748858447488584, 0.02511415525114155]
health care provider [0.9952153110047847, 0.004784688995215311]
management [0.9712918660287081, 0.028708133971291867]
information technology [0.9866666666666667, 0.013333333333333334]
other [0.8975609756097561, 0.1024390243902439]
engineering [0.8832684824902723, 0.11673151750972763]
administrative [0.871859296482412, 0.12814070351758794]
design [0.9947916666666666, 0.005208333333333333]
production [0.9857142857142858, 0.014285714285714285]
education [0.9964539007092199, 0.0035460992907801418]
supply chain [0.9444444444444444, 0.05555555555555555]
business development [0.9735099337748344, 0.026490066225165563]
product management [0.9861111111111112, 0.013888888888888888]
financial analyst [0.8235294117647058, 0.17647058

In [83]:

# Probability tables (feature value -> [prob_fraud_false, prob_fraud_true]), one per
# selected feature. The ORDER matters: the tables are paired positionally with
# `selected_features` below.
feature_values_prob = [
    titles_and_fraud_prob,
    locations_and_fraud_prob,
    departments_and_fraud_prob,
    salary_range_and_fraud_prob,
    telecommutings_and_fraud_prob,
    has_company_logo_and_fraud_prob,
    has_questions_and_fraud_prob,
    employment_types_and_fraud_prob,
    required_experience_and_fraud_prob,
    required_education_and_fraud_prob,
    industries_and_fraud_prob,
    functions_and_fraud_prob,
]

# Positions 4-7 of `features` (company_profile, description, requirements, benefits)
# have no table in the list above, so they are left out. `features` already excludes
# the target column, so no further filtering is needed.
selected_features = [
    feature
    for position, feature in enumerate(features)
    if position not in range(4, 8)
]

# Fail loudly instead of silently mispairing names and tables if either list changes.
assert len(selected_features) == len(feature_values_prob), (
    'selected_features and feature_values_prob must have the same length'
)

# feature name -> probability table; this is what the classifier reads.
data = dict(zip(selected_features, feature_values_prob))

print('selected features and unique feature values with their probabilities for each class label:','\n')
for key, value in data.items():
    print(key)
    print_dictionary(value)
    print()


selected features and unique feature values with their probabilities for each class label: 

title
marketing intern [0.9230769230769231, 0.07692307692307693]
customer service - cloud video production [0.6666666666666666, 0.3333333333333333]
commissioning machinery assistant (cma) [0.6666666666666666, 0.3333333333333333]
account executive - washington dc [0.6666666666666666, 0.3333333333333333]
bill review manager [0.6666666666666666, 0.3333333333333333]
accounting clerk [0.5, 0.5]
head of content (m/f) [0.6666666666666666, 0.3333333333333333]
lead guest service specialist [0.6666666666666666, 0.3333333333333333]
hp bsm sme [0.75, 0.25]
customer service associate - part time [0.9848484848484849, 0.015151515151515152]
asp.net developer job opportunity at united states,new jersey [0.6666666666666666, 0.3333333333333333]
talent sourcer (6 months fixed-term contract) [0.6666666666666666, 0.3333333333333333]
applications developer, digital [0.6666666666666666, 0.3333333333333333]
installers 

### Evaluate performance of model.

In [96]:

def _is_missing(value):
    """Return True when `value` is a scalar missing-value marker (NaN, None, NA)."""
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _lookup_class_probabilities(table, value):
    """Return the [prob_fraud_false, prob_fraud_true] entry for `value`, or None if absent."""
    if value in table:
        return table[value]
    if _is_missing(value):
        # NaN != NaN, so the plain dict lookup above only succeeds when the row holds
        # the very same NaN object as the table key. Fall back to matching "missing"
        # against the table's own missing-value key.
        for key in table:
            if _is_missing(key):
                return table[key]
    return None


def classify_job_postings(dataset, feature_probabilities=None, prior_false=None, prior_true=None):
    """Classify every row of `dataset` as fraudulent (1) or not (0) and print the result.

    For each row, the per-class probabilities of all features whose value is present
    in the corresponding table are multiplied together and by the class prior; the
    class with the larger product wins (ties go to class 0). Feature values that are
    not in a table contribute nothing to either product.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows to classify. Must contain a 'title' column (printed for each row) and a
        column for every key of `feature_probabilities`. Rows are visited in order,
        whatever the index labels are; an empty frame prints nothing.
    feature_probabilities : dict, optional
        feature name -> {feature value -> [prob_fraud_false, prob_fraud_true]}.
        Defaults to the notebook-level `data` dictionary.
    prior_false, prior_true : float, optional
        Class priors. Default to the notebook-level `prior_prob_fraud_false` and
        `prior_prob_fraud_true`.

    Returns
    -------
    None
        Results are only printed.

    NOTE(review): the probability tables printed earlier in this notebook look like
    counts normalised across the two classes for each value (e.g. [230, 6] ->
    [0.9746, 0.0254]), i.e. P(class | value), whereas Naive Bayes multiplies
    P(value | class). Confirm against compute_probabilities.
    """
    if feature_probabilities is None:
        feature_probabilities = data
    if prior_false is None:
        prior_false = prior_prob_fraud_false
    if prior_true is None:
        prior_true = prior_prob_fraud_true

    # Iterate over the rows themselves rather than reconstructing index labels from
    # job_id, which only worked for a contiguous index equal to job_id - 1.
    for _, datapoint in dataset.iterrows():

        datapoint_prob_0 = []
        datapoint_prob_1 = []

        # go through features in data
        for feature, table in feature_probabilities.items():

            value = datapoint[feature]
            if isinstance(value, str):
                # The tables are keyed by lower-cased strings.
                value = value.lower()
                if feature == 'location':
                    # NOTE: only the first comma-separated part of the location is
                    # used; data points that list several locations would need each
                    # location to be considered.
                    value = value.split(',')[0]

            probabilities = _lookup_class_probabilities(table, value)
            if probabilities is None:
                continue
            datapoint_prob_0.append(probabilities[0])
            datapoint_prob_1.append(probabilities[1])

        # np.prod of an empty list is 1.0, so a row with no known values falls back
        # to the priors alone.
        given_fraud_false = np.prod(datapoint_prob_0) * prior_false
        given_fraud_true = np.prod(datapoint_prob_1) * prior_true

        print('job posting: ', datapoint['title'])
        print('bayes numerator given job posting IS NOT fraudulent: ', given_fraud_false)
        print('bayes numerator given job posting IS fraudulent: ', given_fraud_true)

        if given_fraud_false >= given_fraud_true:
            print('classification is: ', 0, ' not fraudulent', '\n\n')
        else:
            print('classification is: ', 1, ' fraudulent', '\n\n')
    

In [98]:

#----------------------------------------------------------------
# Illustration only: the calls in this section run on training data, so they must not be used to judge the performance of the model.

# for training data slicing starts at 0 and ends at 10728
#classify_job_postings(training_data.loc[:20])

# NOTE: there is a fraudulent job posting in the training-data range below that is classified as 0 (non fraud).
# This can be due to a number of reasons, but before we dive into those we must ensure that the
# classify_job_postings function works correctly for multiple data points of each feature.

#classify_job_postings(training_data.loc[97:105])
#----------------------------------------------------------------

# To evaluate the performance of the model we use the validation and test data.

# for validation data slicing starts at 10729 and ends at 16450
# (only the first 20 validation rows are classified here)
classify_job_postings(validation_data.loc[10729:10748])

# for test data slicing starts at 16451 and runs to the end of the test data.
#classify_job_postings(test_data.loc[16451:16470])


job posting:  Senior Front-End Engineer
bayes numerator given job posting IS NOT fraudulent:  0.6273116597897598
bayes numerator given job posting IS fraudulent:  2.881831732158109e-17
classification is:  0  not fraudulent 


job posting:  Field Sales Representative
bayes numerator given job posting IS NOT fraudulent:  0.6306537476832029
bayes numerator given job posting IS fraudulent:  1.3864771398482194e-16
classification is:  0  not fraudulent 


job posting:  Field Supervisor
bayes numerator given job posting IS NOT fraudulent:  0.6120107109683527
bayes numerator given job posting IS fraudulent:  7.937900187126495e-18
classification is:  0  not fraudulent 


job posting:  Systems Engineers/Network Administrators/Tiers I-III
bayes numerator given job posting IS NOT fraudulent:  0.6862303467569415
bayes numerator given job posting IS fraudulent:  7.608869670342588e-16
classification is:  0  not fraudulent 


job posting:  Customer Service Associate
bayes numerator given job posting I