In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, train_test_split

def evaluation(clf, clf_name, train, y, test_size=.33, random_state=None):
    """Fit ``clf`` on a random hold-out split and print its test-set metrics.

    The data is split once with ``train_test_split``; the classifier is fitted
    on the training part and accuracy, precision, recall and F1 are printed for
    the held-out part, followed by the wall-clock time of the ``predict`` call.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    clf_name : label printed on the "Classifier:" line.
    train : feature matrix handed to ``train_test_split``.
    y : target labels aligned with ``train``.
    test_size : fraction of samples held out for scoring (default ``.33``).
    random_state : seed forwarded to ``train_test_split``; the default ``None``
        keeps the split random on every call. Pass an int for a repeatable split.

    Returns
    -------
    None. All results are printed.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        train, y, test_size=test_size, random_state=random_state
    )
    clf.fit(x_train, y_train)

    # Only prediction is timed (fitting happens before the clock starts). The
    # clock is stopped right away so that metric computation and console output
    # do not inflate the reported figure.
    starting_tm = time.perf_counter()
    y_pred = clf.predict(x_test)
    elapsed = time.perf_counter() - starting_tm

    # zero_division=0 keeps the reported value at 0.0 when the denominator is
    # empty (e.g. the classifier never predicts the positive class) without
    # raising an undefined-metric warning for each call.
    print("Classifier: ", clf_name)
    print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
    print("Precision: ", metrics.precision_score(y_test, y_pred, zero_division=0))
    print("Recall: ", metrics.recall_score(y_test, y_pred, zero_division=0))
    print("F1-Measure: ", metrics.f1_score(y_test, y_pred, zero_division=0))
    print("Execution time: " + str(elapsed))

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

def grid_evaluattion(clf, msg, X, y, tuned_parameters, scores):
    """Run one grid search per scoring metric and print a report for each.

    The data is split 50/50 with a fixed seed. For every entry in ``scores`` a
    ``GridSearchCV`` is fitted on the first half; the best parameters, the
    mean/std cross-validation score of every candidate, and a classification
    report computed on the second half are printed.

    Parameters
    ----------
    clf : base estimator to tune; it is passed to ``GridSearchCV`` unchanged
        for every scoring metric.
    msg : headline printed before any other output.
    X : feature matrix.
    y : target labels aligned with ``X``.
    tuned_parameters : parameter grid forwarded to ``GridSearchCV``.
    scores : iterable of scoring names, one search per entry.

    Returns
    -------
    None. All results are printed.
    """
    print(msg)

    # Split the dataset into two equal parts: one for the search, one held out.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
    for score in scores:
        print("# Tuning hyper-parameters for %s\n" % score)

        # Keep the search under its own name. Rebinding ``clf`` here would make
        # the next iteration wrap a GridSearchCV inside another GridSearchCV,
        # whose parameter names no longer match ``tuned_parameters``.
        search = GridSearchCV(clf, tuned_parameters, scoring=score)
        search.fit(X_train, y_train)

        print("Best parameters set found on development set: ")
        print(search.best_params_)
        print("\nGrid scores on development set:")
        results = search.cv_results_
        means = results['mean_test_score']
        stds = results['std_test_score']
        for mean, std, params in zip(means, stds, results['params']):
            # The spread is reported as two standard deviations.
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:\n")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.\n")

        y_true, y_pred = y_test, search.predict(X_test)
        # zero_division=0 reports 0.00 for a class that is never predicted
        # without emitting an undefined-metric warning.
        print(classification_report(y_true, y_pred, zero_division=0))
        print()

## Reading the Data

In [8]:
# Load the postings and replace every missing value with an empty string.
jobs_df = pd.read_csv("fake_job_postings.csv").fillna('')

# Target labels come from the "fraudulent" column.
y = jobs_df["fraudulent"]
print("len", jobs_df.shape[0])

len 17880


# J
### Part 1: Gaussian Naive Bayes

In [48]:
# Single-feature design matrix: the "telecommuting" column as an (n, 1) array.
train = jobs_df["telecommuting"].values.reshape(-1, 1)

In [49]:
from sklearn.naive_bayes import GaussianNB

# Default-configured Gaussian Naive Bayes, scored on the one-column feature set.
gaussNB_cls = GaussianNB()
evaluation(clf=gaussNB_cls, clf_name="Gaussian Naive Bayes", train=train, y=y)


Classifier:  Gaussian Naive Bayes
Accuracy:  0.9122182680901542
Precision:  0.08208955223880597
Recall:  0.07482993197278912
F1-Measure:  0.07829181494661921
Execution time: 0.04381823539733887


### Part 2: Classifiers on three features (telecommuting, has_company_logo, has_questions)

In [6]:
# Three-column feature matrix. Selecting three columns already yields an
# (n, 3) array, so no reshape is needed, and the name is bound once to the
# array instead of first to a DataFrame.
train2 = jobs_df[["telecommuting", "has_company_logo", "has_questions"]].values

## RandomForest

In [9]:
from sklearn.ensemble import RandomForestClassifier

# 660-tree forest fitted with 4 parallel jobs, scored on the three-column feature set.
rf = RandomForestClassifier(n_jobs=4, n_estimators=660)
evaluation(clf=rf, clf_name="Random Forest", train=train2, y=y)

Classifier:  Random Forest
Accuracy:  0.9525504151838672
Precision:  0.0
Recall:  0.0
F1-Measure:  0.0
Execution time: 0.7376866340637207


  _warn_prf(average, modifier, msg_start, len(result))


## Perceptron

In [10]:
from sklearn.linear_model import Perceptron

# Seeded perceptron with a 1e-3 stopping tolerance, scored on the three-column feature set.
perceptron = Perceptron(random_state=0, tol=1e-3)
evaluation(clf=perceptron, clf_name="Perceptron", train=train2, y=y)

Classifier:  Perceptron
Accuracy:  0.9544145060159295
Precision:  0.0
Recall:  0.0
F1-Measure:  0.0
Execution time: 0.02906060218811035


  _warn_prf(average, modifier, msg_start, len(result))


## SGDClassifier

In [11]:
from sklearn.linear_model import SGDClassifier

# Hinge-loss SGD classifier with a 50000-iteration cap, scored on the three-column feature set.
sgd = SGDClassifier(loss='hinge', max_iter=50000, n_jobs=12)
evaluation(clf=sgd, clf_name="SGD", train=train2, y=y)


Classifier:  SGD
Accuracy:  0.9535671920013558
Precision:  0.0
Recall:  0.0
F1-Measure:  0.0
Execution time: 0.010289669036865234


  _warn_prf(average, modifier, msg_start, len(result))


Grid Search parameters for **RandomForest** and **Stochastic Gradient Descent**

In [14]:
# Time the whole search, including the report printed by grid_evaluattion.
starting_tm = time.time()

# Compare four SGD loss functions at a fixed iteration cap, ranked by accuracy.
sgd_clf = SGDClassifier(n_jobs=12)
msg = "Using Stohastic Gradient Descent"
tuned_parameters = [
    {
        'loss': ['hinge', 'modified_huber', 'log', 'squared_hinge'],
        'max_iter': [16000],
    }
]
scores = ['accuracy']
grid_evaluattion(
    clf=sgd_clf,
    msg=msg,
    X=train2,
    y=y,
    tuned_parameters=tuned_parameters,
    scores=scores,
)
print("Execution time: " + str(time.time() - starting_tm))


Using Stohastic Gradient Descent
# Tuning hyper-parameters for accuracy

Best parameters set found on development set: 
{'loss': 'hinge', 'max_iter': 16000}

Grid scores on development set:
0.950 (+/-0.000) for {'loss': 'hinge', 'max_iter': 16000}
0.950 (+/-0.000) for {'loss': 'modified_huber', 'max_iter': 16000}
0.950 (+/-0.000) for {'loss': 'log', 'max_iter': 16000}
0.950 (+/-0.000) for {'loss': 'squared_hinge', 'max_iter': 16000}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.95      1.00      0.98      8523
           1       0.00      0.00      0.00       417

    accuracy                           0.95      8940
   macro avg       0.48      0.50      0.49      8940
weighted avg       0.91      0.95      0.93      8940


Execution time: 0.41741085052490234


  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Time the whole search, including the report printed by grid_evaluattion.
starting_tm = time.time()

# Sweep the forest size from 300 to 500 trees in steps of 10, ranked by accuracy.
rf = RandomForestClassifier(n_jobs=4)
msg = "Using Random Forest Classifier"
tuned_parameters = [{'n_estimators': list(range(300, 501, 10))}]
scores = ['accuracy']
grid_evaluattion(rf, msg, train2, y, tuned_parameters, scores)

# Report the elapsed time; the start timestamp was previously captured but never used.
print("Execution time: " + str(time.time() - starting_tm))

Using Random Forest Classifier
# Tuning hyper-parameters for accuracy

Best parameters set found on development set: 
{'n_estimators': 300}

Grid scores on development set:
0.950 (+/-0.000) for {'n_estimators': 300}
0.950 (+/-0.000) for {'n_estimators': 310}
0.950 (+/-0.000) for {'n_estimators': 320}
0.950 (+/-0.000) for {'n_estimators': 330}
0.950 (+/-0.000) for {'n_estimators': 340}
0.950 (+/-0.000) for {'n_estimators': 350}
0.950 (+/-0.000) for {'n_estimators': 360}
0.950 (+/-0.000) for {'n_estimators': 370}
0.950 (+/-0.000) for {'n_estimators': 380}
0.950 (+/-0.000) for {'n_estimators': 390}
0.950 (+/-0.000) for {'n_estimators': 400}
0.950 (+/-0.000) for {'n_estimators': 410}
0.950 (+/-0.000) for {'n_estimators': 420}
0.950 (+/-0.000) for {'n_estimators': 430}
0.950 (+/-0.000) for {'n_estimators': 440}
0.950 (+/-0.000) for {'n_estimators': 450}
0.950 (+/-0.000) for {'n_estimators': 460}
0.950 (+/-0.000) for {'n_estimators': 470}
0.950 (+/-0.000) for {'n_estimators': 480}
0.950 (+/-

  _warn_prf(average, modifier, msg_start, len(result))


---

The End