In [12]:
import numpy as np
import pandas as pd
import time
import re

import nltk
from nltk.stem import LancasterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer

In [23]:
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, train_test_split

def evaluation(clf, clf_name, train, y, test_size=.33, random_state=None, stratify=False):
    """Fit ``clf`` on a random train/test split and print hold-out metrics.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict``
        Fitted in place on the training part of the split.
    clf_name : str
        Label printed at the top of the report.
    train : feature matrix
        Rows are split together with ``y`` by ``train_test_split``.
    y : target labels
        Scored with scikit-learn's default (binary) precision/recall/F1.
    test_size : float or int, default ``.33``
        Forwarded to ``train_test_split``.
    random_state : int or None, default ``None``
        Seed for the split. ``None`` keeps the previous behaviour (a
        different split on every call); pass an int for reproducible numbers.
    stratify : bool, default ``False``
        When true, the split preserves the class proportions of ``y``.

    Returns
    -------
    None
        The report is printed; nothing is returned.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        train,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    clf.fit(x_train, y_train)

    # Time the prediction call only: stop the clock before computing metrics
    # and printing, so stdout I/O does not leak into the reported figure.
    starting_tm = time.perf_counter()
    y_pred = clf.predict(x_test)
    elapsed = time.perf_counter() - starting_tm

    print("Classifier: ", clf_name)
    print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
    print("Precision: ", metrics.precision_score(y_test, y_pred))
    print("Recall: ", metrics.recall_score(y_test, y_pred))
    print("F1-Measure: ", metrics.f1_score(y_test, y_pred))
    print("Execution time: " + str(elapsed))

In [3]:
# Read the postings and replace every missing value with an empty string.
jobs_df = pd.read_csv("fake_job_postings.csv").fillna('')

In [16]:
# Raw description text (model input) and the fraud label (target).
train, y = jobs_df["description"], jobs_df["fraudulent"]

# Row count of the loaded frame, as a quick sanity check.
print("len", jobs_df.shape[0])

len 17880


# K

## Part 1:
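Handling the job `description` text using TF-IDF (note: the vectorizer below, `CountVectorizer`, produces raw term counts; TF-IDF weighting is not applied in this step).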

In [13]:
# Stemmer instances are created once here; the tokenizer below uses the Porter one.
lancaster_stemmer = LancasterStemmer()
porter_stemmer = PorterStemmer()


def stemming_tokenizer(str_input):
    """Split ``str_input`` into lower-cased tokens and return their Porter stems.

    Every character other than an ASCII letter, a digit or ``-`` is replaced
    by a space; the text is then lower-cased and split on whitespace.
    """
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    tokens = cleaned.lower().split()
    return list(map(porter_stemmer.stem, tokens))

In [18]:
# CountVectorizer filters stop words *after* the tokenizer has run, so the stop
# list is compared against stemmed tokens. With the plain 'english' list, stop
# words whose stem differs from the word itself are never removed (this is what
# scikit-learn's "stop_words may be inconsistent with your preprocessing"
# warning reports). Run the stop list through the same tokenizer so both sides
# of the comparison are stemmed.
english_stop_words = CountVectorizer(stop_words='english').get_stop_words()
stemmed_stop_words = sorted({
    token
    for stop_word in english_stop_words
    for token in stemming_tokenizer(stop_word)
})

starting_tm = time.time()
vectorizer = CountVectorizer(stop_words=stemmed_stop_words, tokenizer=stemming_tokenizer)

vtrain = vectorizer.fit_transform(train)
print("Vectorization took: ", str(time.time()-starting_tm))

  'stop_words.' % sorted(inconsistent))


Vectorization took:  47.18437695503235


In [24]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline on the vectorized descriptions.
forest_params = {"n_estimators": 660, "n_jobs": 4}
rf = RandomForestClassifier(**forest_params)
evaluation(clf=rf, clf_name="Random Forest", train=vtrain, y=y)

Classifier:  Random Forest
Accuracy:  0.9803423148618878
Precision:  1.0
Recall:  0.575091575091575
F1-Measure:  0.7302325581395348
Execution time: 0.7371010780334473
