In [17]:
import numpy as np
import pandas as pd
import time
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier


import nltk
from nltk.stem import LancasterStemmer
from nltk.stem.porter import PorterStemmer

In [18]:
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, train_test_split

def evaluation(clf, clf_name, train, y):
    x_train, x_test, y_train, y_test = train_test_split(train, y, test_size=.33)
    clf.fit(x_train, y_train)
    
    starting_tm = time.time()
    y_pred = clf.predict(x_test)
    
    print("Classifier: ", clf_name)
    print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
    print("Precision: ", metrics.precision_score(y_test, y_pred))
    print("Recall: ", metrics.recall_score(y_test, y_pred))
    print("F1-Measure: ", metrics.f1_score(y_test, y_pred))
    print("Execution time: " + str(time.time() - starting_tm))

In [2]:
jobs_df = pd.read_csv("fake_job_postings.csv")
jobs_df = jobs_df.fillna('')

In [22]:
y = jobs_df["fraudulent"]
train = jobs_df["description"]

print("len", len(jobs_df))

len 17880


# K

## Part 1
**Approach:** Handling description using TF-IDF or CountVectorizer then train a **Random Forest** model

In [23]:
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()

def stemming_tokenizer(str_input):
    stemmer = porter_stemmer
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [stemmer.stem(word) for word in words]
    return words

In [24]:
starting_tm = time.time()
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=stemming_tokenizer)

vtrain = vectorizer.fit_transform(train)
print("Vectorization took: ", str(time.time()-starting_tm))

  'stop_words.' % sorted(inconsistent))


Vectorization took:  47.452059507369995


In [25]:
rf = RandomForestClassifier(n_estimators=660, n_jobs=4)
evaluation(rf, "Random Forest", vtrain, y)

Classifier:  Random Forest
Accuracy:  0.9801728520589731
Precision:  0.9939024390243902
Recall:  0.5842293906810035
F1-Measure:  0.7358916478555305
Execution time: 0.9834010601043701


---
## Part 2
# Features Engineering

**Approach:** Feature extraction. Getting some feature from the **description** and using it with the rest of the columns.

In [3]:
jobs_df = pd.read_csv("fake_job_postings.csv")
jobs_df = jobs_df.fillna('')

y = jobs_df["fraudulent"]

### Features Extraction for Description, Company Profile, Benefits, Requirements and Title
- Length
- Character length without spaces
- Number of words

In [4]:
# Description
jobs_df['len_desc'] = jobs_df["description"].apply(lambda x: len(str(x)))
jobs_df['len_char_desc'] = jobs_df["description"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
jobs_df['len_word_desc'] = jobs_df["description"].apply(lambda x: len(str(x).split()))

feats_desc = ['len_desc', 'len_char_desc', 'len_word_desc']

In [5]:
# Title
jobs_df['len_title'] = jobs_df["title"].apply(lambda x: len(str(x)))
jobs_df['len_char_title'] = jobs_df["title"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
jobs_df['len_word_title'] = jobs_df["title"].apply(lambda x: len(str(x).split()))

feats_title = ['len_title', 'len_char_title', 'len_word_title']

In [6]:
# Company Profile
jobs_df['len_cp'] = jobs_df["company_profile"].apply(lambda x: len(str(x)))
jobs_df['len_char_cp'] = jobs_df["company_profile"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
jobs_df['len_word_cp'] = jobs_df["company_profile"].apply(lambda x: len(str(x).split()))

feats_cp = ['len_cp', 'len_char_cp', 'len_word_cp']

In [7]:
# Benefits
jobs_df['len_ben'] = jobs_df["benefits"].apply(lambda x: len(str(x)))
jobs_df['len_char_ben'] = jobs_df["benefits"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
jobs_df['len_word_ben'] = jobs_df["benefits"].apply(lambda x: len(str(x).split()))

feats_ben = ['len_ben', 'len_char_ben', 'len_word_ben']

In [8]:
# Requirements
jobs_df['len_req'] = jobs_df["requirements"].apply(lambda x: len(str(x)))
jobs_df['len_char_req'] = jobs_df["requirements"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
jobs_df['len_word_req'] = jobs_df["requirements"].apply(lambda x: len(str(x).split()))

feats_req = ['len_req', 'len_char_req', 'len_word_req']

### Binary features will be as they are.
Those are: telecommuting, has_company_logo, has_questions

In [9]:
feats_bin = ['telecommuting', 'has_company_logo', 'has_questions']

### Feature extraction for Salary Range
- Minimum
- Maximum
- Difference

In [10]:
def get_max(r):
    if r!= "":
        if "-" in r:
            if r.split("-")[1].isnumeric():
                return int(r.split("-")[1])
            else:
                return -1
        else:
            return int(r)
    else:
        return -1
    
def get_diff(r):
    if r!= "":
        if "-" in r:
            if r.split("-")[0].isnumeric() and r.split("-")[1].isnumeric():
                return int(r.split("-")[1]) - int(r.split("-")[0])
            else:
                return -1
        else:
            return int(r)
    else:
        return -1
    
jobs_df['min_salary'] = jobs_df["salary_range"].apply(lambda r: int(r.split("-")[0]) if r.split("-")[0].isnumeric() else -1 if "-" in r else r if r != '' else -1)
jobs_df['max_salary'] = jobs_df["salary_range"].apply(get_max)
jobs_df['diff_salary'] = jobs_df["salary_range"].apply(get_diff)

feats_salary = ['min_salary', 'max_salary', 'diff_salary']

### Columns with simple text(few words) that are repeated get as feature an id. Same texts will get the same id
Simple class used: Dictionary

Those are: location, department, employment_type, required_experience, required_education, industry, function

In [12]:
class Dictionary:    
    def __init__(self):
        self.dic = {}
        self.id = 0
    
    def add_to_dict(self, text):
        text = re.sub(r"[^A-Za-z0-9\-]", " ", text).lower() 
        if text in self.dic:
            return self.dic[text]
        else:
            self.dic[text] = int(self.id)
            self.id += 1
            return self.dic[text]
    

In [13]:
location_dict = Dictionary()
jobs_df['loc_ids'] = jobs_df['location'].apply(location_dict.add_to_dict)

department_dict = Dictionary()
jobs_df['dep_ids'] = jobs_df['department'].apply(department_dict.add_to_dict)

employment_type_dict = Dictionary()
jobs_df['emptype_ids'] = jobs_df['employment_type'].apply(employment_type_dict.add_to_dict)

required_experience_dict = Dictionary()
jobs_df['reqexp_ids'] = jobs_df['required_experience'].apply(required_experience_dict.add_to_dict)

required_education_dict = Dictionary()
jobs_df['reqedu_ids'] = jobs_df['required_education'].apply(required_education_dict.add_to_dict)

industry_dict = Dictionary()
jobs_df['ind_ids'] = jobs_df['industry'].apply(industry_dict.add_to_dict)

function_dict = Dictionary()
jobs_df['func_ids'] = jobs_df['function'].apply(function_dict.add_to_dict)

reptext_feat = ['loc_ids', 'dep_ids', 'emptype_ids', 'reqexp_ids', 'reqedu_ids', 'ind_ids', 'func_ids']

In [19]:
feat_train = jobs_df[
    feats_desc + feats_title + feats_cp +
    feats_ben + feats_req + feats_bin +
    feats_salary + reptext_feat
]

train.head()

Unnamed: 0,len_desc,len_char_desc,len_word_desc,len_title,len_char_title,len_word_title,len_cp,len_char_cp,len_word_cp,len_ben,...,min_salary,max_salary,diff_salary,loc_ids,dep_ids,emptype_ids,reqexp_ids,reqedu_ids,ind_ids,func_ids
0,905,46,124,16,10,2,885,46,141,0,...,-1,-1,-1,0,0,0,0,0,0,0
1,2077,71,315,41,18,6,1286,58,153,1292,...,-1,-1,-1,1,1,1,1,0,1,1
2,355,31,50,39,18,4,879,44,141,0,...,-1,-1,-1,2,2,2,2,0,0,2
3,2600,58,346,33,19,5,614,43,85,782,...,-1,-1,-1,3,3,1,3,1,2,3
4,1520,59,168,19,12,3,1628,68,207,21,...,-1,-1,-1,4,2,1,3,1,3,4


In [21]:
rf = RandomForestClassifier(n_estimators=660, n_jobs=4)
evaluation(rf, "Random Forest", feat_train, y)

Classifier:  Random Forest
Accuracy:  0.9771225216065074
Precision:  0.9551282051282052
Recall:  0.5379061371841155
F1-Measure:  0.6882217090069285
Execution time: 0.8378407955169678
