In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, train_test_split

def evaluation(clf, clf_name, train, y):
    x_train, x_test, y_train, y_test = train_test_split(train, y, test_size=.33)
    clf.fit(x_train, y_train)
    
    starting_tm = time.time()
    y_pred = clf.predict(x_test)
    
    print("Classifier: ", clf_name)
    print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
    print("Precision: ", metrics.precision_score(y_test, y_pred))
    print("Recall: ", metrics.recall_score(y_test, y_pred))
    print("F1-Measure: ", metrics.f1_score(y_test, y_pred))
    print("Execution time: " + str(time.time() - starting_tm))

## Reading the Data

In [3]:
jobs_df = pd.read_csv("fake_job_postings.csv")
jobs_df = jobs_df.fillna('')

y = jobs_df["fraudulent"]
print("len", len(jobs_df))

len 17880


# J
### Part 1: Gaussian Naive Bayes

In [4]:
train = jobs_df["telecommuting"]
train = train.values.reshape(-1, 1)

In [5]:
from sklearn.naive_bayes import GaussianNB

gaussNB_cls = GaussianNB()
evaluation(gaussNB_cls, "Gaussian Naive Bayes", train, y)


Classifier:  Gaussian Naive Bayes
Accuracy:  0.9150991357397051
Precision:  0.056179775280898875
Recall:  0.056818181818181816
F1-Measure:  0.05649717514124293
Execution time: 0.018354177474975586


### Part 2: 

In [6]:
train2 = jobs_df[["telecommuting", "has_company_logo", "has_questions"]]
train2.head()
train2 = train2.values.reshape(-1, 3)

## RandomForest

In [7]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=660, n_jobs=4)
evaluation(rf, "Random Forest", train2, y)

Classifier:  Random Forest
Accuracy:  0.9542450432130147
Precision:  0.0
Recall:  0.0
F1-Measure:  0.0
Execution time: 0.7378592491149902


  _warn_prf(average, modifier, msg_start, len(result))


## Perceptron

In [8]:
from sklearn.linear_model import Perceptron

perceptron = Perceptron(tol=1e-3, random_state=0)
evaluation(perceptron, "Perceptron", train2, y)

Classifier:  Perceptron
Accuracy:  0.9525504151838672
Precision:  0.0
Recall:  0.0
F1-Measure:  0.0
Execution time: 0.07857632637023926


  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
## grid search rf+sgd

# starting_tm = time.time()
# sgd_clf = SGDClassifier(n_jobs=12)
# msg ="Using Stohastic Gradient Descent"
# tuned_parameters = [{'loss':['hinge', 'modified_huber', 'log', 'squared_hinge'], 'max_iter':[16000]}]
# scores = ['accuracy']
# grid_evaluattion(sgd_clf, msg, train2, y, tuned_parameters, scores)
# print("Execution time: " + str(time.time() - starting_tm))
# 
# starting_tm = time.time()
# rf = RandomForestClassifier(n_jobs=4)
# msg = "Using Random Forest Classifier"
# tuned_parameters = [{'n_estimators': [i*10 for i in range(30, 51)]}]
# scores = ['accuracy']
# grid_evaluattion(rf, msg, train2, y, tuned_parameters, scores)

Using Stohastic Gradient Descent
# Tuning hyper-parameters for accuracy

Best parameters set found on development set: 
{'loss': 'hinge', 'max_iter': 16000}

Grid scores on development set:
0.950 (+/-0.000) for {'loss': 'hinge', 'max_iter': 16000}
0.950 (+/-0.000) for {'loss': 'modified_huber', 'max_iter': 16000}
0.950 (+/-0.000) for {'loss': 'log', 'max_iter': 16000}
0.950 (+/-0.000) for {'loss': 'squared_hinge', 'max_iter': 16000}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.95      1.00      0.98      8523
           1       0.00      0.00      0.00       417

    accuracy                           0.95      8940
   macro avg       0.48      0.50      0.49      8940
weighted avg       0.91      0.95      0.93      8940


Execution time: 0.41741085052490234


  _warn_prf(average, modifier, msg_start, len(result))


---

The End