In [1]:
# library needed
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import re
from sklearn import preprocessing
import scipy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import sklearn
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import time
import pickle
import copy
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline

In [2]:
def parsing(dat,train_dat):
    
    ### SALARY PROCESSING 
    # see if character is in text
    def alpha_in_text(text):
        return(any(c.isalpha() for c in text))

    # see how many dashes are in text
    def number_of_dashes(text):
        return(sum([1 for i in text if '-' in i]))

    # extract smallest salary range value
    def salary_extract_first(text):

        if pd.isna(text) is True:
            return(-1)

        elif alpha_in_text(text) is True:
            return(-2)

        elif '-' in text:
            if number_of_dashes(text) == 1:
                if re.split('-',text)[0].isdigit() is True:
                    return(float(re.split('-',text)[0]))
                else:
                    return(-1)

            else:
                return(-1)
        else:
            return(-1)

    # largest salary range value
    def salary_extract_second(text):

        if pd.isna(text) is True:
            return(-1)

        elif alpha_in_text(text) is True:
            return(-2)

        elif '-' in text:
            if number_of_dashes(text) == 1:
                if re.split('-',text)[1].isdigit() is True:
                    return(float(re.split('-',text)[1]))
                else:
                    return(-1)

            else:
                return(-1)
        else:
            return(-1)

    # convert numeric salary to category
    def salary_category_first(number):
        percentile = [60.0, 14000.0, 20000.0, 30000.0, 35000.0, 44374.4, 55000.0, 70000.0, 90000.0]
        if number == -1:
            return(str(1))

        if number == -2:
            return(str(2))

        for i in range(len(percentile)):
            if i not in {0,8}:
                if (number > percentile[i-1]) & (number <= percentile[i]):
                    return(str(i+3))
                else:
                    continue

            if i == 0:
                if number < percentile[0]:
                    return(str(i+3))
            if i == 8:
                if number >= percentile[8]:
                    return(str(i+3))



    def salary_category_second(number):
        percentile = [120, 20000.0, 30000.0, 40000.0, 50000.0, 65000.0, 80000.0, 100000.0, 130000.0]
        if number == -1:
            return(str(1))

        if number == -2:
            return(str(2))

        for i in range(len(percentile)):
            if i not in {0,8}:
                if (number > percentile[i-1]) & (number <= percentile[i]):
                    return(str(i+3))
                else:
                    continue

            if i == 0:
                if number < percentile[0]:
                    return(str(i+3))
            if i == 8:
                if number >= percentile[8]:
                    return(str(i+3))
    
    
    ### ONE HOT ENCODING (training)
    employment_type_onehot = OneHotEncoder(handle_unknown='ignore').fit(train_dat[['employment_type']].fillna('NaN'))
    required_experience_onehot = OneHotEncoder(handle_unknown='ignore').fit(train_dat[['required_experience']].fillna('NaN'))
    required_education_onehot = OneHotEncoder(handle_unknown='ignore').fit(train_dat[['required_education']].fillna('NaN'))
    industry_onehot = OneHotEncoder(handle_unknown='ignore').fit(train_dat[['industry']].fillna('NaN'))
    function_onehot = OneHotEncoder(handle_unknown='ignore').fit(train_dat[['function.']].fillna('NaN'))
    category_1 = train_dat.salary_range.apply(salary_extract_first).apply(salary_category_first)
    category_2 = train_dat.salary_range.apply(salary_extract_second).apply(salary_category_second)
    salary_1_onehot = OneHotEncoder(handle_unknown='ignore').fit(pd.DataFrame(category_1))
    salary_2_onehot = OneHotEncoder(handle_unknown='ignore').fit(pd.DataFrame(category_2))
    profile_onehot = TfidfVectorizer(token_pattern = r"[A-Za-z]+").fit(train_dat['company_profile'].fillna('NaN'))
    description_onehot = TfidfVectorizer(token_pattern = r"[A-Za-z]+").fit(train_dat['description'].fillna('NaN'))
    requirements_onehot = TfidfVectorizer(token_pattern = r"[A-Za-z]+").fit(train_dat['requirements'].fillna('NaN'))
    benefits_onehot = TfidfVectorizer(token_pattern = r"[A-Za-z]+").fit(train_dat['benefits'].fillna('NaN'))
    
    # transforming to one hot encoding
    profile_transformed = pd.DataFrame.sparse.from_spmatrix(profile_onehot.transform(dat['company_profile'].fillna('NaN')))
    description_transformed = pd.DataFrame.sparse.from_spmatrix(description_onehot.transform(dat['description'].fillna('NaN')))
    requirements_transformed = pd.DataFrame.sparse.from_spmatrix(requirements_onehot.transform(dat['required_education'].fillna('NaN')))
    benefits_transformed = pd.DataFrame.sparse.from_spmatrix(benefits_onehot.transform(dat['benefits'].fillna('NaN')))
    
    names = [['profile.'+ words for words in profile_onehot.get_feature_names()],
            ['description.'+ words for words in description_onehot.get_feature_names()],
            ['requirements.'+ words for words in requirements_onehot.get_feature_names()],
            ['benefits.'+ words for words in benefits_onehot.get_feature_names()]]
    
    descriptive = pd.concat([profile_transformed,description_transformed,requirements_transformed,benefits_transformed],axis=1,ignore_index=True)
    descriptive.columns = [item for sublist in names for item in sublist]
    
    ### OTHER PARSING
    nacols = dat.isna()[['title', 'location', 'department', 'salary_range','description', 'requirements', 'benefits',
                      'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
                      'required_experience', 'required_education', 'industry', 'function.']].astype('int')
    
    numeric_cols = dat[['telecommuting', 'has_company_logo', 'has_questions']]
    # func to count words in document
    document_word_count = lambda document: len(document.split(' '))
    
    # count words in column
    columns = ["company_profile","description","requirements","benefits"]
    df = copy.deepcopy(dat[columns])
    for column in columns:
            df[(str(column) + "_length")] = dat[column].apply(lambda x: len(x) if x == x else 0)
    
    
    # salary column one hot
    category_1 = dat.salary_range.apply(salary_extract_first).apply(salary_category_first)
    category_2 = dat.salary_range.apply(salary_extract_second).apply(salary_category_second)
    
    salary_1_transform = pd.DataFrame.sparse.from_spmatrix(salary_1_onehot.transform(pd.DataFrame(category_1)))
    salary_2_transform = pd.DataFrame.sparse.from_spmatrix(salary_2_onehot.transform(pd.DataFrame(category_2)))
    
    # transform to one hot
    employment_type_transformed =  pd.DataFrame.sparse.from_spmatrix(employment_type_onehot.transform(dat[['employment_type']].fillna('NaN')))
    required_experience_transformed =  pd.DataFrame.sparse.from_spmatrix(required_experience_onehot.transform(dat[['required_experience']].fillna('NaN')))
    required_education_transformed =  pd.DataFrame.sparse.from_spmatrix(required_education_onehot.transform(dat[['required_education']].fillna('NaN')))
    industry_transformed =  pd.DataFrame.sparse.from_spmatrix(industry_onehot.transform(dat[['industry']].fillna('NaN')))
    function_transformed =  pd.DataFrame.sparse.from_spmatrix(function_onehot.transform(dat[['function.']].fillna('NaN')))
    
    
    return(pd.concat([nacols,salary_1_transform, salary_2_transform,df.iloc[:,4:],
                      employment_type_transformed, required_experience_transformed, required_education_transformed, industry_transformed,function_transformed,numeric_cols,
                     descriptive],axis = 1))
    

# 1. import model

In [3]:
with open('Final Random Forest Model.pkl', 'rb') as inp:
    pipe = pickle.load(inp)

# 2. import data

In [4]:
dat_train = pd.read_csv("job_training_data.csv")
dat_test = pd.read_csv("job_verification_data.csv")

# 3. parsing

In [5]:
data_test = parsing(dat_test,dat_train)

In [6]:
X_test = data_test
y_test = dat_test.fraudulent

# 4. classifier

In [16]:
sklearn.metrics.confusion_matrix(y_test,pipe.predict_proba(X_test)[:,1]>0.2)



array([[932,  18],
       [ 10,  40]], dtype=int64)

In [17]:
accuracy_score(y_test,pipe.predict_proba(X_test)[:,1]>0.2)



0.972

In [18]:
recall_score(y_test,pipe.predict_proba(X_test)[:,1]>0.2)



0.8