In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
sb.set()

import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')

from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chenwenhong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chenwenhong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
filepath = "fake_job_postings.csv"
dirty_posting = pd.read_csv(filepath)
dirty_posting.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


# Number of non-null values for each feature
## High proportion of non-null values for department and salary_range features. Poor differentiators, and would be dropped as predictors

In [3]:
dirty_posting.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15185 non-null  object
 8   benefits             10670 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

# Categorical class switching:
### 1. Categorical classes for categorical features
### 2. Class switching and standardisation for multi-class categorical features
### 3. Class switching and standardisation for boolean categorical features

## 1. Categorical classes for categorical features
### Possible classes within each categorical features that nan values could be converted to

In [4]:
cat_cols = ['telecommuting', 'has_company_logo', 'has_questions', 'employment_type', 'required_education', 'required_experience', 'industry', 'function', 'fraudulent']

for cat in cat_cols:
    print(dirty_posting[cat].value_counts())
    print("\n")

0    17113
1      767
Name: telecommuting, dtype: int64


1    14220
0     3660
Name: has_company_logo, dtype: int64


0    9088
1    8792
Name: has_questions, dtype: int64


Full-time    11620
Contract      1524
Part-time      797
Temporary      241
Other          227
Name: employment_type, dtype: int64


Bachelor's Degree                    5145
High School or equivalent            2080
Unspecified                          1397
Master's Degree                       416
Associate Degree                      274
Certification                         170
Some College Coursework Completed     102
Professional                           74
Vocational                             49
Some High School Coursework            27
Doctorate                              26
Vocational - HS Diploma                 9
Vocational - Degree                     6
Name: required_education, dtype: int64


Mid-Senior level    3809
Entry level         2697
Associate           2297
Not Applicable      1116
Direc

## 2. Class switching and standardisation for multi-class categorical features
### a) Convert ambiguous classes to feature specific classes
### b) Convert "nan" values to feature specific classes
### c) Concatenate words separated by "-" and  " " with underscores

In [5]:
nan_cat_cols = ['employment_type', 'required_education', 'required_experience', 'industry', 'function']

posting = dirty_posting.copy()

for nan_cat in nan_cat_cols:
    if nan_cat == "employment_type":
        posting[nan_cat].replace({"Other": "Other Employment Types"}, regex = True, inplace = True)
        posting[nan_cat].fillna(value = "Other Employment Types", inplace = True)
        posting[nan_cat].replace({"-": "_"}, regex = True, inplace = True)
        posting[nan_cat].replace({" ": "_"}, regex = True, inplace = True)
    elif nan_cat == "required_education":
        posting[nan_cat].replace({"Unspecified": "Unspecified Education"}, regex = True, inplace = True)
        posting[nan_cat].fillna(value = "Unspecified Education", inplace = True)
        posting[nan_cat].replace({" - ": "_"}, regex = True, inplace = True)
        posting[nan_cat].replace({" ": "_"}, regex = True, inplace = True)
    elif nan_cat == "required_experience":
        posting[nan_cat].replace({"Not Applicable": "Experience Not Applicable"}, regex = True, inplace = True)
        posting[nan_cat].fillna(value = "Experience Not Applicable", inplace = True)
        posting[nan_cat].replace({" ": "_"},regex = True, inplace = True)
        posting[nan_cat].replace({"-": "_"}, regex = True, inplace = True)
    elif nan_cat == "function":
        posting[nan_cat].replace({"Other": "Other Functions"}, regex = True, inplace = True)
        posting[nan_cat].fillna(value = "Other Functions", inplace = True)
        posting[nan_cat].replace({" ": "_"},regex = True, inplace = True)
    else:
        posting[nan_cat].fillna(value = "Other Industries", inplace = True)
        posting[nan_cat].replace({" ": "_"},regex = True, inplace = True)

## 3. Class switching and standardisation for boolean categorical features
### a) Convert "nan" values to feature specific classes

In [6]:
int_cols = posting.columns[posting.dtypes == np.int64]

for int_ in int_cols:
    if int_ == "telecommuting":
        posting[int_].replace({1: "telecommuting", 0 : "not_telecommuting"}, inplace=True)
    elif int_ == "has_company_logo"  :
        posting[int_].replace({1: "has_company_logo", 0 : "no_company_logo"}, inplace=True)
    elif int_ == "has_questions"  :
        posting[int_].replace({1: "has_questions", 0 : "no_questions"}, inplace=True)
    else:
        pass

# Conversion of occurrences of "nan" in within prose

In [7]:
prose_cols = ['location', 'company_profile', 'description', 'requirements', 'benefits']

for prose in prose_cols:
    if prose == "company_profile":
        posting[prose].fillna(value = " nan_profile ", inplace = True)
    elif prose == "description":
        posting[prose].fillna(value = " nan_description ", inplace = True)
    elif prose == "requirements":
        posting[prose].fillna(value = " nan_requirements ", inplace = True)
    elif prose == "benefits":
        posting[prose].fillna(value = " nan_benefits ", inplace = True)
    else:
        posting[prose].fillna(value = " nan_location ", inplace = True)

# Feature generation for character count of prose features

In [8]:
posting["company_profile_clen"] = posting['company_profile'].str.len()
        
posting["description_clen"] = posting['description'].str.len()
        
posting["requirements_clen"] = posting['requirements'].str.len()

posting["benefits_clen"] = posting['benefits'].str.len()

# Confirming of non-null values

In [9]:
posting.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   job_id                17880 non-null  int64 
 1   title                 17880 non-null  object
 2   location              17880 non-null  object
 3   department            6333 non-null   object
 4   salary_range          2868 non-null   object
 5   company_profile       17880 non-null  object
 6   description           17880 non-null  object
 7   requirements          17880 non-null  object
 8   benefits              17880 non-null  object
 9   telecommuting         17880 non-null  object
 10  has_company_logo      17880 non-null  object
 11  has_questions         17880 non-null  object
 12  employment_type       17880 non-null  object
 13  required_experience   17880 non-null  object
 14  required_education    17880 non-null  object
 15  industry              17880 non-null

# Tokenisation of prose
### 1. Cleaning of prose
### 2. Tagging of words with their word classes
### 3. Lemmatisation

## 1. Cleaning of prose
### a) Conversion to lower case
### b) Strip words of leading and trailing symbols and numbers
### c) Capture and preserve URLs, Phone numbers and Emails
### d) Removal of non-breaking spaces "\xa0"

In [10]:
swords = stopwords.words("english")

def text_cleaner(prose):
    cleaned_text = ""
    for text in prose.split(" "):
        text = text.lower().strip()
        if text not in string.punctuation and text not in swords:
            if re.search("url_",text):
                text = re.findall("url_",text)
            elif re.search("phone_",text):
                text = re.findall("phone_",text)
            elif re.search("email_",text):
                text = re.findall("email_",text)
            elif re.search("\xa0",text):
                text = len(re.findall("\xa0",text)) * [" "] + re.split("\xa0",text)
            else:
                 text = re.split("[\\.:;\/,()\"\"\'\'#]+" ,text)
            for word in text:
                cleaned_text += " " + word
    else:
        return cleaned_text

## 2. Tagging of words with their word classes
### a) Tag words with their respective word classes to better preserve their meaning within the sentence

In [11]:
def tag_translate(tag): 
    if re.search("^J",tag):
        return(wordnet.ADJ)
    elif re.search("^R",tag):
        return(wordnet.ADV)
    elif re.search("^V",tag):
        return(wordnet.VERB)
    else:
        return(wordnet.NOUN)

## 3. Lemmatisation
### a) Convert words to their base forms through use of lemmatiser for better tokenisation

In [12]:
lemmer = WordNetLemmatizer()
def text_tag_lem(prose):
    l_prose_list=[]
    words = nltk.word_tokenize(prose)
    tagged_words = nltk.pos_tag(words)
    for i in tagged_words:
        l_prose_list.append(lemmer.lemmatize(i[0], tag_translate(i[1])))
    else:
        return(l_prose_list)

In [13]:
tokenizing_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']

for i in tokenizing_columns:
    posting[i] = posting[i].apply(lambda x: " ".join(text_tag_lem(text_cleaner(x))))

# Convert feature values to string

In [14]:
lower_columns = ['telecommuting', 'has_company_logo', 'has_questions', 'employment_type', 'required_education', 'required_experience', 'industry', 'function']

for j in lower_columns:
    posting[j] = posting[j].str.lower()

# Feature cleaning for location
## Separate location into 3 categories: Country, State, City

In [15]:
def locater(location):
    if re.search(", ", location) != None:
        return location
    else:
        return "nan_location"
    
def country_locater(location):
    place = locater(location)
    if place == "nan_location":
        return "nan_country"
    else:
        area = re.findall("\w+", place)
        return(area[0]+"_")
    
def state_locater(location):
    place = locater(location)
    area = re.findall("\w+", place)
    try:
        state = area[1] + "_"
    except:
        state = "nan_state"
    return(state)

def city_locater(location):
    place = locater(location)
    area = re.findall("\w+", place)
    city = ""
    try:
        for i in range(2, len(area)):
            city += area[i] + "_"
    except:
        city = "nan_city"
    return(city)


In [16]:
posting["location"].fillna(value = "nan_location", inplace = True)

posting["country"] = posting["location"].apply(lambda x : country_locater(x))

posting["state"] = posting["location"].apply(lambda x : state_locater(x))

posting["city"] = posting["location"].apply(lambda x : city_locater(x))

# Prepare data columns
## Combine columns into "combined_feature" that will undergo text vectorisation

In [17]:
posting.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,...,industry,function,fraudulent,company_profile_clen,description_clen,requirements_clen,benefits_clen,country,state,city
0,1,marketing intern,"US, NY, New York",Marketing,,we re food52 we ve create groundbreaking award...,food52 fast-growing james beard award-winning ...,experience content management system major plu...,nan_benefits,not_telecommuting,...,other_industries,marketing,0,885,905,852,14,US_,NY_,New_York_
1,2,customer service cloud video production,"NZ, , Auckland",Success,,90 second worlds cloud video production servic...,organise focused vibrant awesome ! do passion ...,expect you your key responsibility communicate...,get usthrough part 90 second team gain experie...,not_telecommuting,...,marketing_and_advertising,customer_service,0,1286,2077,1433,1292,NZ_,Auckland_,
2,3,commission machinery assistant cma,"US, IA, Wever",,,valor service provide workforce solution meet ...,client locate houston actively seek experience...,implement pre-commissioning commissioning proc...,nan_benefits,not_telecommuting,...,other_industries,other_functions,0,879,355,1363,14,US_,IA_,Wever_
3,4,account executive washington dc,"US, DC, Washington",Sales,,passion improve quality life geography heart e...,company esri – environmental system research i...,education : bachelor ’ s master ’ s gi busines...,culture anything corporate—we collaborative cr...,not_telecommuting,...,computer_software,sales,0,614,2600,1429,782,US_,DC_,Washington_
4,5,bill review manager,"US, FL, Fort Worth",,,spotsource solution llc global human capital m...,job title itemization review managerlocation f...,qualification rn license state texasdiploma ba...,full benefit offer,not_telecommuting,...,hospital_&_health_care,health_care_provider,0,1628,1520,757,21,US_,FL_,Fort_Worth_


In [18]:
combination_columns = ['title', 'country', 'state', 'city', 'telecommuting', 'has_company_logo', 'has_questions', 'company_profile', 'description', 'requirements', 'benefits', 'employment_type', 'required_education', 'required_experience', 'industry', 'function']

posting[combination_columns] = posting[combination_columns].applymap(str)

posting['combined_feature'] = posting["title"] 

for i in combination_columns:
    posting['combined_feature'] += " " + posting[i]

In [19]:
drop_columns = ['salary_range', 'department', 'title', 'location', 'country', 'state', 'city', 'telecommuting', 'has_company_logo', 'has_questions', 'company_profile', 'description', 'requirements', 'benefits', 'employment_type', 'required_education', 'required_experience', 'industry', 'function']

for i in drop_columns:
    del posting[i]


In [20]:
filepath = "cleaned_postings_final.csv"
posting.to_csv(filepath)

# Vectorisation process

In [21]:
filepath = "cleaned_postings_final.csv"
cleaned_posting = pd.read_csv(filepath)
del cleaned_posting["Unnamed: 0"]

In [22]:
cleaned_posting.head()

Unnamed: 0,job_id,fraudulent,company_profile_clen,description_clen,requirements_clen,benefits_clen,combined_feature
0,1,0,885,905,852,14,marketing intern marketing intern US_ NY_ New_...
1,2,0,1286,2077,1433,1292,customer service cloud video production custom...
2,3,0,879,355,1363,14,commission machinery assistant cma commission ...
3,4,0,614,2600,1429,782,account executive washington dc account execut...
4,5,0,1628,1520,757,21,bill review manager bill review manager US_ FL...


In [23]:
clen_data = pd.DataFrame(cleaned_posting[['company_profile_clen', 'description_clen', 'requirements_clen', 'benefits_clen']])
clen_data.head()

Unnamed: 0,company_profile_clen,description_clen,requirements_clen,benefits_clen
0,885,905,852,14
1,1286,2077,1433,1292
2,879,355,1363,14
3,614,2600,1429,782
4,1628,1520,757,21


# Vectorisation of "combined_feature"
## Account for frequency of word occurrence within prose through use of Count Vectoriser; use of Compressed Sparse Row Matrix

In [24]:
tv = CountVectorizer()
v_data = tv.fit_transform(cleaned_posting["combined_feature"])

In [25]:
type(v_data)

scipy.sparse.csr.csr_matrix

# Normalisation of character length features

In [26]:
clen_data.astype(int)
for i in clen_data.columns:
    max_value = clen_data[i].max()
    min_value = clen_data[i].min()
    value_range = max_value - min_value
    clen_data[i] = clen_data[i].apply(lambda x: (x-min_value)/value_range)

clen_matrix = csr_matrix(clen_data)

# Conversion of character length features into Compressed Sparse Row Matrix and perform column concatenation with matrix for "combined_feature"

In [27]:
cdata = hstack((clen_matrix, v_data))

In [28]:
fraud_data = pd.DataFrame(cleaned_posting["fraudulent"]) #y values

# Comparison of cleaned against uncleaned data

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix


def f1_scoring (con_mat):
    false_pos = con_mat[0][1]
    false_neg = con_mat[1][0]
    true_pos = con_mat[1][1]
    precision = true_pos/(true_pos+false_pos)
    recall = true_pos/(true_pos+false_neg)
    f1 = 2*precision*recall / (precision+recall)
    return f1

## Cleaned data performance 

In [32]:
v_train, v_test, fraud_train, fraud_test = train_test_split(v_data, fraud_data, test_size = 0.2, random_state = 0)

svc = SVC(kernel='linear', gamma= 1.0, random_state=0)
svc.fit(v_train, fraud_train)


pred_v_train = svc.predict(v_train)

print(svc.score(v_train, fraud_train))
con_mat = confusion_matrix(fraud_train, pred_v_train)
print(con_mat)
print(f1_scoring(con_mat))
print("\n")

pred_v_test = svc.predict(v_test)

print(svc.score(v_test, fraud_test))
con_mat = confusion_matrix(fraud_test, pred_v_test)
print(con_mat)
print(f1_scoring(con_mat))

  return f(**kwargs)


1.0
[[13591     0]
 [    0   713]]
1.0


0.9818232662192393
[[3387   36]
 [  29  124]]
0.792332268370607
