# Research hackathon year 1 - Fraudulent job advertisement detection

Data: http://emscad.samos.aegean.gr/

Aim: Engineering structured data to predict fraudulent job advertisement.

### Imports

In [28]:
import pandas as pd
import numpy as np

# For classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# For feature engineering
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import nltk.tokenize as nt
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thaole/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/thaole/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [66]:
# Load data
data = pd.read_csv("job_descriptions_dataset.csv")
data.head(3)

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,in_balanced_dataset
0,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, -winning online food c...",\nExperience with content management systems ...,,t,t,f,Other,Internship,,,Marketing,f,f
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - NAME_MASKED - Awesome!Do...,What we expect from you: \nYour key responsibi...,What you will get from us \nThrough being part...,t,t,f,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,f,f
2,Commissioning Machinery Assistant (CMA),"US, IA, NAME_MASKED",,,\nNAME_MASKED Services provides Workforce Sol...,"Our client, located in Houston, is actively se...",\nImplement pre-commissioning and commissioni...,,t,t,f,,,,,,f,f


### Oversampling

In [67]:
def over_sampler(data):
    data = data.replace(np.nan, "", regex=True)
    
    #Make a balanced dataset by oversampling
    data_Fraud = data[data['fraudulent']=='t']
    data_NonFraud =  data[data['fraudulent']=='f']    
    df_fraud_example = pd.concat([data_Fraud, data_Fraud, data_Fraud, data_Fraud, 
                                  data_Fraud, data_Fraud, data_Fraud, data_Fraud, 
                                  data_Fraud, data_Fraud, data_Fraud, data_Fraud, 
                                 data_Fraud, data_Fraud, data_Fraud, data_Fraud, 
                                 data_Fraud, data_Fraud, data_Fraud, data_Fraud], axis = 0)
    data_balanced = pd.concat([data_NonFraud, df_fraud_example], axis = 0)
    data_balanced = data_balanced.sample(frac=1).reset_index(drop=True)
    return data_balanced

In [68]:
data = over_sampler(data)

## Data Cleaning

In [50]:
#Function to return wordcounts in strings
def clean_sequence(string):
    string = string.encode("ascii", "ignore").decode()
    string_words = word_tokenize(string)
    return len([word for word in string_words if word.lower() not in stop_words])

def clean_data(data):
    #Change all boolean variables to binary representations.
    data['fraudulent'] = np.where(data['fraudulent']=='t', 1, 0)
    data['telecommuting'] = np.where(data['telecommuting']=='t', 1, 0)
    data['has_company_logo'] = np.where(data['has_company_logo']=='t', 1, 0)
    data['has_questions'] = np.where(data['has_questions']=='t', 1, 0)
    
    #Get wordcounts
    data['description'] = data['description'].apply(lambda x: clean_sequence(str(x)))
    data['benefits'] = data['benefits'].apply(lambda x: clean_sequence(str(x)))
    data['requirements'] = data['requirements'].apply(lambda x: clean_sequence(str(x)))
    data['company_profile'] = data['company_profile'].apply(lambda x: clean_sequence(str(x)))
    return data


def change_to_ordinal(data):
    scale_mapper = {'Entry level':1, 'Not Applicable':0, 'Associate':3, 'Internship':2, 'Mid-Senior level':4, 'Director':5,'Executive':5}
    data['required_experience 1'] = data['required_experience'].replace(scale_mapper)
    data['required_experience 1'] = data['required_experience 1'].replace(np.nan, 0, regex=True)
    return data

list1 = ['..','!!','??']
spamwords_fraudulent = ['home', 'extra' 'easy money', 'week', 'online', 'no experience']

#Function to check whether a text has consecutive puntuation.
def consecutive_punct(list1, list2):
    if any( str in list1 for str in list2):
        return 1
    return 0

In [51]:
cleaned_data = clean_data(data)
cleaned_data

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,in_balanced_dataset
0,Health + Safety Professional,"US, CA, Bakersfield",HSE (Health Safety Environmental),#URL_MASKED#,111,102,271,85,1,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Oil & Energy,Other,1,f
1,ICU RN,"US, NV,",,,0,205,0,0,0,0,0,,,,,,1,t
2,Customer Service Team Lead,"US, UT, Salt Lake City",,,78,293,128,0,1,1,0,Full-time,Entry level,High School or equivalent,Financial Services,Customer Service,0,f
3,Marketing or Journalism Major turned Project M...,"US, IN, Fishers",App Delivery,#URL_MASKED#,81,430,88,0,1,1,1,Full-time,Entry level,Associate Degree,Marketing and Advertising,Project Management,0,f
4,Customer Assistant,"CA, ON, Toronto",,,36,95,22,0,1,1,0,,,,,,1,f
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34329,Receptionist/Office Clerk,"US, GA, ATLANTA",,,0,49,0,0,0,0,0,Full-time,Entry level,High School or equivalent,Accounting,Administrative,1,t
34330,Bill Review Manager,"US, FL, Fort Worth",,,191,164,88,3,1,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,f
34331,Payroll Accountant,"EE, 37, NAME_MASKED",Finance,,185,95,105,84,1,1,0,Full-time,Associate,,Marketing and Advertising,Finance,0,f
34332,Research and Development Part-Time Internship,"US, NY, Brooklyn",,,44,150,42,0,1,1,0,Part-time,Internship,Bachelor's Degree,Biotechnology,Research,0,f


## Feature Engineering

In [52]:
# feature Country and City
cleaned_data['Country'] = data['location'].str.split(',', expand = True)[0]
cleaned_data['City'] = data['location'].str.split(',', expand = True)[2]

In [53]:
# feature located_in_US 
cleaned_data['located_in_US'] = 0
cleaned_data.loc[cleaned_data['Country'] == "US", ['located_in_US']] = 1

In [40]:
# requiring low education
data['low_required_education'] = 0
data.loc[data['required_education'] == ( 'High School or equivalent' or 'Unspecified' or 'Some High School Coursework' or '' or 'Vocational - HS Diploma' or 'Vocational - Degree' or 'Some College Coursework Completed'), ['low_required_education']] = 1

cleaned_data['low_required_education'] = data['low_required_education']

In [44]:
cleaned_data['low_required_education'].value_counts()

0    29024
1     5310
Name: low_required_education, dtype: int64

In [63]:
#feature contains money_in_title, money_in_description
currencies= ['$','£','¥','€']

# create a function to check if a string contains any currency symbol
def contains_currency(s):
    return any(currency in s for currency in currencies)

# apply the function
cleaned_data['money_in_title'] = data['title'].apply(contains_currency).astype(int)
cleaned_data['money_in_desc'] = data['description'].apply(contains_currency).astype(int)

In [75]:
# check if text contains consecutive punctuations
punctuations = ['..','!!','??']

# create a function to check if a string contains consecutive punctuations
def contains_cons_punc(s):
    return any(punct in s for punct in punctuations)

# apply the function
cleaned_data['cons_punc_des'] = data['description'].apply(contains_cons_punc).astype(int)
cleaned_data['cons_punc_title'] = data['title'].apply(contains_cons_punc).astype(int)

In [104]:
# Create some boolean variables
cleaned_data["has_short_company_profile"]= np.where(cleaned_data['company_profile'] < 10, 1, 0)
cleaned_data["has_short_requirements"]= np.where(cleaned_data['requirements'] < 10, 1, 0)