In [18]:
import numpy as np
import pandas as pd
import time
import re

from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import LancasterStemmer
from nltk.stem.porter import PorterStemmer

In [19]:
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, train_test_split

def evaluation(clf, clf_name, train, y):
    x_train, x_test, y_train, y_test = train_test_split(train, y, test_size=.33)
    clf.fit(x_train, y_train)
    
    starting_tm = time.time()
    y_pred = clf.predict(x_test)
    
    print("Classifier: ", clf_name)
    print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
    print("Precision: ", metrics.precision_score(y_test, y_pred))
    print("Recall: ", metrics.recall_score(y_test, y_pred))
    print("F1-Measure: ", metrics.f1_score(y_test, y_pred))
    print("Execution time: " + str(time.time() - starting_tm))

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

def grid_evaluattion(clf, msg, X, y, tuned_parameters, scores):
    print(msg)
    
    # Split the dataset in two equal parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)    
    for score in scores:
        print("# Tuning hyper-parameters for %s\n" % score)
    
        clf = GridSearchCV(clf, tuned_parameters, scoring=score)
        clf.fit(X_train, y_train)
    
        print("Best parameters set found on development set: ")
        print(clf.best_params_)
        print("\nGrid scores on development set:")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()
    
        print("Detailed classification report:\n")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.\n")
        
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print() 

In [21]:
jobs_df = pd.read_csv("fake_job_postings.csv")
jobs_df = jobs_df.fillna('')

In [22]:
y = jobs_df["fraudulent"]
train = jobs_df["description"]

print("len", len(jobs_df))

len 17880


# K

## Part 1
**Approach:** Handling description using TF-IDF or CountVectorizer then train a **Random Forest** model

In [23]:
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()

def stemming_tokenizer(str_input):
    stemmer = porter_stemmer
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [stemmer.stem(word) for word in words]
    return words

In [24]:
starting_tm = time.time()
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=stemming_tokenizer)

vtrain = vectorizer.fit_transform(train)
print("Vectorization took: ", str(time.time()-starting_tm))

  'stop_words.' % sorted(inconsistent))


Vectorization took:  47.276076793670654


1. SGD

In [49]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(n_jobs=12, loss='hinge', max_iter=50000)
evaluation(sgd, "SGD", vtrain, y)

Classifier:  SGD
Accuracy:  0.9689883070665989
Precision:  0.9848484848484849
Recall:  0.4180064308681672
F1-Measure:  0.5869074492099323
Execution time: 0.008592367172241211


2. SVC

In [48]:
from sklearn.svm import SVC
svc = SVC(kernel='rbf', gamma='auto')
evaluation(svc, "SVC", vtrain, y)

# from sklearn.ensemble import RandomForestClassifier
# rf = RandomForestClassifier(n_estimators=660, n_jobs=4)
# evaluation(rf, "Random Forest", vtrain, y)

Classifier:  SVC
Accuracy:  0.9483138451109981
Precision:  0.0
Recall:  0.0
F1-Measure:  0.0
Execution time: 4.066096782684326


### Statistics of the dataset

In [26]:
print("All: ", len(jobs_df))

fraud = jobs_df[jobs_df.fraudulent == 1]
print("fraud", len(fraud), " percent: ", len(fraud)/len(jobs_df)*100)

non_fraud = jobs_df[jobs_df.fraudulent == 0]
print("non fraud", len(non_fraud), " percent: ", len(non_fraud)/len(jobs_df)*100)

All:  17880
fraud 866  percent:  4.8434004474272925
non fraud 17014  percent:  95.1565995525727


# Undersampling
Removing some of the non_fraudulent records in order to get a more realistic and balance dataset. Then evaluate again.

In [29]:
under_non_fraud = non_fraud.sample(len(fraud)*2)
new_dataset = pd.concat([under_non_fraud, fraud]).sample(frac=1)

# new statistics
print("fraud", len(fraud), " percent: ", len(fraud)/len(new_dataset)*100)
print("non fraud", len(new_dataset), " percent: ", len(under_non_fraud)/len(new_dataset)*100)

fraud 866  percent:  33.33333333333333
non fraud 2598  percent:  100.0


In [51]:
new_y = new_dataset["fraudulent"]
new_train = new_dataset["description"]

starting_tm = time.time()
new_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=stemming_tokenizer)

new_vtrain = new_vectorizer.fit_transform(new_train)
print("Vectorization took: ", str(time.time()-starting_tm))

  'stop_words.' % sorted(inconsistent))


Vectorization took:  6.971210479736328


In [52]:
# SGD again
sgd = SGDClassifier(n_jobs=12, loss='hinge', max_iter=50000)
evaluation(sgd, "SGD", new_vtrain, new_y)

Classifier:  SGD
Accuracy:  0.8974358974358975
Precision:  0.8814814814814815
Recall:  0.8095238095238095
F1-Measure:  0.8439716312056739
Execution time: 0.005142688751220703


In [53]:
# SVC again
svc = SVC(kernel='rbf', gamma='auto')
evaluation(svc, "SVC", new_vtrain, new_y)

Classifier:  SVC
Accuracy:  0.6841491841491841
Precision:  0.0
Recall:  0.0
F1-Measure:  0.0
Execution time: 0.6294717788696289


  _warn_prf(average, modifier, msg_start, len(result))


---
## Part 2
# Features Engineering

**Approach:** Feature extraction. Getting some feature from the **description** and using it with the rest of the columns.

In [31]:
jobs_df = pd.read_csv("fake_job_postings.csv")
jobs_df = jobs_df.fillna('')

y = jobs_df["fraudulent"]

### Features Extraction for Description, Company Profile, Benefits, Requirements and Title
- Length
- Character length without spaces
- Number of words

In [32]:
# Description
jobs_df['len_desc'] = jobs_df["description"].apply(lambda x: len(str(x)))
jobs_df['len_char_desc'] = jobs_df["description"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
jobs_df['len_word_desc'] = jobs_df["description"].apply(lambda x: len(str(x).split()))

feats_desc = ['len_desc', 'len_char_desc', 'len_word_desc']

In [33]:
# Title
jobs_df['len_title'] = jobs_df["title"].apply(lambda x: len(str(x)))
jobs_df['len_char_title'] = jobs_df["title"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
jobs_df['len_word_title'] = jobs_df["title"].apply(lambda x: len(str(x).split()))

feats_title = ['len_title', 'len_char_title', 'len_word_title']

In [34]:
# Company Profile
jobs_df['len_cp'] = jobs_df["company_profile"].apply(lambda x: len(str(x)))
jobs_df['len_char_cp'] = jobs_df["company_profile"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
jobs_df['len_word_cp'] = jobs_df["company_profile"].apply(lambda x: len(str(x).split()))

feats_cp = ['len_cp', 'len_char_cp', 'len_word_cp']

In [35]:
# Benefits
jobs_df['len_ben'] = jobs_df["benefits"].apply(lambda x: len(str(x)))
jobs_df['len_char_ben'] = jobs_df["benefits"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
jobs_df['len_word_ben'] = jobs_df["benefits"].apply(lambda x: len(str(x).split()))

feats_ben = ['len_ben', 'len_char_ben', 'len_word_ben']

In [36]:
# Requirements
jobs_df['len_req'] = jobs_df["requirements"].apply(lambda x: len(str(x)))
jobs_df['len_char_req'] = jobs_df["requirements"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
jobs_df['len_word_req'] = jobs_df["requirements"].apply(lambda x: len(str(x).split()))

feats_req = ['len_req', 'len_char_req', 'len_word_req']

### Binary features will be as they are.
Those are: telecommuting, has_company_logo, has_questions

In [37]:
feats_bin = ['telecommuting', 'has_company_logo', 'has_questions']

### Feature extraction for Salary Range
- Minimum
- Maximum
- Difference

In [38]:
def get_max(r):
    if r!= "":
        if "-" in r:
            if r.split("-")[1].isnumeric():
                return int(r.split("-")[1])
            else:
                return -1
        else:
            return int(r)
    else:
        return -1
    
def get_diff(r):
    if r!= "":
        if "-" in r:
            if r.split("-")[0].isnumeric() and r.split("-")[1].isnumeric():
                return int(r.split("-")[1]) - int(r.split("-")[0])
            else:
                return -1
        else:
            return int(r)
    else:
        return -1
    
jobs_df['min_salary'] = jobs_df["salary_range"].apply(lambda r: int(r.split("-")[0]) if r.split("-")[0].isnumeric() else -1 if "-" in r else r if r != '' else -1)
jobs_df['max_salary'] = jobs_df["salary_range"].apply(get_max)
jobs_df['diff_salary'] = jobs_df["salary_range"].apply(get_diff)

feats_salary = ['min_salary', 'max_salary', 'diff_salary']

### Columns with simple text(few words) that are repeated get as feature an id. Same texts will get the same id
Simple class used: Dictionary

Those are: location, department, employment_type, required_experience, required_education, industry, function

In [39]:
class Dictionary:    
    def __init__(self):
        self.dic = {}
        self.id = 0
    
    def add_to_dict(self, text):
        text = re.sub(r"[^A-Za-z0-9\-]", " ", text).lower() 
        if text in self.dic:
            return self.dic[text]
        else:
            self.dic[text] = int(self.id)
            self.id += 1
            return self.dic[text]
    

In [40]:
location_dict = Dictionary()
jobs_df['loc_ids'] = jobs_df['location'].apply(location_dict.add_to_dict)

department_dict = Dictionary()
jobs_df['dep_ids'] = jobs_df['department'].apply(department_dict.add_to_dict)

employment_type_dict = Dictionary()
jobs_df['emptype_ids'] = jobs_df['employment_type'].apply(employment_type_dict.add_to_dict)

required_experience_dict = Dictionary()
jobs_df['reqexp_ids'] = jobs_df['required_experience'].apply(required_experience_dict.add_to_dict)

required_education_dict = Dictionary()
jobs_df['reqedu_ids'] = jobs_df['required_education'].apply(required_education_dict.add_to_dict)

industry_dict = Dictionary()
jobs_df['ind_ids'] = jobs_df['industry'].apply(industry_dict.add_to_dict)

function_dict = Dictionary()
jobs_df['func_ids'] = jobs_df['function'].apply(function_dict.add_to_dict)

reptext_feat = ['loc_ids', 'dep_ids', 'emptype_ids', 'reqexp_ids', 'reqedu_ids', 'ind_ids', 'func_ids']

In [56]:
feat_train = jobs_df[
    feats_desc + feats_title + feats_cp +
    feats_ben + feats_req + feats_bin +
    feats_salary + reptext_feat
]

feat_train.head()

Unnamed: 0,len_desc,len_char_desc,len_word_desc,len_title,len_char_title,len_word_title,len_cp,len_char_cp,len_word_cp,len_ben,...,min_salary,max_salary,diff_salary,loc_ids,dep_ids,emptype_ids,reqexp_ids,reqedu_ids,ind_ids,func_ids
0,905,46,124,16,10,2,885,46,141,0,...,-1,-1,-1,0,0,0,0,0,0,0
1,2077,71,315,41,18,6,1286,58,153,1292,...,-1,-1,-1,1,1,1,1,0,1,1
2,355,31,50,39,18,4,879,44,141,0,...,-1,-1,-1,2,2,2,2,0,0,2
3,2600,58,346,33,19,5,614,43,85,782,...,-1,-1,-1,3,3,1,3,1,2,3
4,1520,59,168,19,12,3,1628,68,207,21,...,-1,-1,-1,4,2,1,3,1,3,4


1. SGD

In [62]:
sgd = SGDClassifier(n_jobs=12, loss='hinge', max_iter=50000)
evaluation(sgd, "SGD", feat_train, y)

Classifier:  SGD
Accuracy:  0.8622267412303
Precision:  0.09486780715396578
Recall:  0.2089041095890411
F1-Measure:  0.1304812834224599
Execution time: 0.025432586669921875


2. SVC

In [63]:
svc = SVC(kernel='rbf', gamma='auto')
evaluation(svc, "SVC", feat_train, y)

Classifier:  SVC
Accuracy:  0.9556007456363328
Precision:  1.0
Recall:  0.04727272727272727
F1-Measure:  0.09027777777777776
Execution time: 2.728785276412964


3. Decision Tree Classifier

In [67]:
from sklearn.tree import DecisionTreeClassifier

dc = DecisionTreeClassifier()
evaluation(dc, "Decision Tree Classifier", feat_train, y)

Classifier:  Decision Tree Classifier
Accuracy:  0.9671242162345365
Precision:  0.6199376947040498
Recall:  0.7343173431734318
F1-Measure:  0.6722972972972973
Execution time: 0.008805274963378906


4. Bagging Classifier 

using 20 Decission Tress

In [70]:
from sklearn.ensemble import BaggingClassifier, VotingClassifier

bagging = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=1.0, n_estimators=20)
evaluation(bagging, "Bagging Classifier", feat_train, y)

Classifier:  Bagging Classifier
Accuracy:  0.9779698356210812
Precision:  0.930635838150289
Recall:  0.5770609318996416
F1-Measure:  0.7123893805309734
Execution time: 0.024933338165283203


5. Voting Classifier

In [75]:
sgd = SGDClassifier(n_jobs=12, loss='hinge', max_iter=50000)
svc = SVC(kernel='rbf', gamma='auto')
dc = DecisionTreeClassifier()

voting = VotingClassifier(estimators=[('sgd',sgd),('svc',svc),('dc',dc)], voting='hard')
evaluation(voting, "Voting Classifier", feat_train, y)

Classifier:  Voting Classifier
Accuracy:  0.9559396712421624
Precision:  0.5982142857142857
Recall:  0.2375886524822695
F1-Measure:  0.34010152284263956
Execution time: 2.6901402473449707


---
### Reusing undersampling

In [57]:
new_dataset['len_desc'] = new_dataset["description"].apply(lambda x: len(str(x)))
new_dataset['len_char_desc'] = new_dataset["description"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
new_dataset['len_word_desc'] = new_dataset["description"].apply(lambda x: len(str(x).split()))
new_dataset['len_title'] = new_dataset["title"].apply(lambda x: len(str(x)))
new_dataset['len_char_title'] = new_dataset["title"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
new_dataset['len_word_title'] = new_dataset["title"].apply(lambda x: len(str(x).split()))
new_dataset['len_cp'] = new_dataset["company_profile"].apply(lambda x: len(str(x)))
new_dataset['len_char_cp'] = new_dataset["company_profile"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
new_dataset['len_word_cp'] = new_dataset["company_profile"].apply(lambda x: len(str(x).split()))
new_dataset['len_ben'] = new_dataset["benefits"].apply(lambda x: len(str(x)))
new_dataset['len_char_ben'] = new_dataset["benefits"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
new_dataset['len_word_ben'] = new_dataset["benefits"].apply(lambda x: len(str(x).split()))
new_dataset['len_req'] = new_dataset["requirements"].apply(lambda x: len(str(x)))
new_dataset['len_char_req'] = new_dataset["requirements"].apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
new_dataset['len_word_req'] = new_dataset["requirements"].apply(lambda x: len(str(x).split()))    
new_dataset['min_salary'] = new_dataset["salary_range"].apply(lambda r: int(r.split("-")[0]) if r.split("-")[0].isnumeric() else -1 if "-" in r else r if r != '' else -1)
new_dataset['max_salary'] = new_dataset["salary_range"].apply(get_max)
new_dataset['diff_salary'] = new_dataset["salary_range"].apply(get_diff)

new_location_dict = Dictionary()
new_dataset['loc_ids'] = new_dataset['location'].apply(new_location_dict.add_to_dict)
new_department_dict = Dictionary()
new_dataset['dep_ids'] = new_dataset['department'].apply(new_department_dict.add_to_dict)
new_employment_type_dict = Dictionary()
new_dataset['emptype_ids'] = new_dataset['employment_type'].apply(new_employment_type_dict.add_to_dict)
new_required_experience_dict = Dictionary()
new_dataset['reqexp_ids'] = new_dataset['required_experience'].apply(new_required_experience_dict.add_to_dict)
new_required_education_dict = Dictionary()
new_dataset['reqedu_ids'] = new_dataset['required_education'].apply(new_required_education_dict.add_to_dict)
new_industry_dict = Dictionary()
new_dataset['ind_ids'] = new_dataset['industry'].apply(new_industry_dict.add_to_dict)
new_function_dict = Dictionary()
new_dataset['func_ids'] = new_dataset['function'].apply(new_function_dict.add_to_dict)

new_feat_train = new_dataset[
    feats_desc + feats_title + feats_cp +
    feats_ben + feats_req + feats_bin +
    feats_salary + reptext_feat
]

new_feat_train.head()


Unnamed: 0,len_desc,len_char_desc,len_word_desc,len_title,len_char_title,len_word_title,len_cp,len_char_cp,len_word_cp,len_ben,...,min_salary,max_salary,diff_salary,loc_ids,dep_ids,emptype_ids,reqexp_ids,reqedu_ids,ind_ids,func_ids
7471,1968,47,256,11,10,2,0,0,0,894,...,-1,-1,-1,0,0,0,0,0,0,0
9182,3085,69,394,30,13,3,251,31,52,232,...,-1,-1,-1,1,1,0,1,0,1,0
17738,1084,45,135,31,16,3,0,0,0,129,...,-1,-1,-1,2,0,0,2,1,2,1
12112,2847,54,239,18,13,2,250,33,30,0,...,-1,-1,-1,3,0,1,2,2,3,2
17633,658,67,69,26,12,3,128,24,19,0,...,-1,-1,-1,4,2,2,0,0,4,3


1. SGD

In [71]:
sgd = SGDClassifier(n_jobs=12, loss='hinge', max_iter=50000)
evaluation(sgd, "SGD", new_feat_train, new_y)

Classifier:  SGD
Accuracy:  0.5629370629370629
Precision:  0.38235294117647056
Recall:  0.5591397849462365
F1-Measure:  0.4541484716157205
Execution time: 0.009584903717041016


2. SVC

In [72]:
svc = SVC(kernel='rbf', gamma='auto')
evaluation(svc, "SVC", new_feat_train, new_y)

Classifier:  SVC
Accuracy:  0.6934731934731935
Precision:  1.0
Recall:  0.0673758865248227
F1-Measure:  0.12624584717607976
Execution time: 0.054877281188964844


3. DecissionTreeClassifier

In [73]:
dc = DecisionTreeClassifier()
evaluation(dc, "Decision Tree Classifier", new_feat_train, new_y)

Classifier:  Decision Tree Classifier
Accuracy:  0.8403263403263403
Precision:  0.7722419928825622
Recall:  0.7482758620689656
F1-Measure:  0.7600700525394046
Execution time: 0.005932331085205078


4. Bagging Classifier 

using 20 DecissionTress

In [74]:
from sklearn.ensemble import BaggingClassifier, VotingClassifier

bagging = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=1.0, n_estimators=20)
evaluation(bagging, "Bagging Classifier", new_feat_train, new_y)

Classifier:  Bagging Classifier
Accuracy:  0.8822843822843823
Precision:  0.84
Recall:  0.7749077490774908
F1-Measure:  0.8061420345489444
Execution time: 0.00916600227355957


5. Voting Classifier

In [76]:
sgd = SGDClassifier(n_jobs=12, loss='hinge', max_iter=50000)
svc = SVC(kernel='rbf', gamma='auto')
dc = DecisionTreeClassifier()

voting = VotingClassifier(estimators=[('sgd',sgd),('svc',svc),('dc',dc)], voting='hard')
evaluation(voting, "Voting Classifier", new_feat_train, new_y)


Classifier:  Voting Classifier
Accuracy:  0.789044289044289
Precision:  0.7911392405063291
Recall:  0.45787545787545786
F1-Measure:  0.580046403712297
Execution time: 0.06924128532409668


6. Random Forest

In [79]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=660, n_jobs=4)
evaluation(rf, "Random Forest", new_feat_train, new_y)


Classifier:  Random Forest
Accuracy:  0.8927738927738927
Precision:  0.852233676975945
Recall:  0.835016835016835
F1-Measure:  0.8435374149659864
Execution time: 0.22014546394348145


## THE END

