# Job Posting Data Acquisition and EDA

In [1]:
import numpy as np
import pandas as pd
# from gensim.models import Word2Vec
# from nltk import word_tokenize
# import matplotlib.pyplot as plt
# import seaborn as sns
import re
import dfunc
from dfunc import df_info
from dfunc import chi_sq

# Notebook-wide configuration: fixed seed for NumPy's global RNG and a wide
# column display limit so every column of the frame is shown.
SEED = 0
np.random.seed(SEED)
pd.options.display.max_columns = 300

In [2]:
# Load the postings; the first CSV column becomes the frame's index.
DATA_PATH = 'fake_job_postings.csv'
df = pd.read_csv(DATA_PATH, index_col=0)

## Initial Data Cleaning/Engineering and EDA
- Total Observations: 17880 rows
- Total Features: 16 columns
- Target Variable: 'fraudulent', 0 is real, 1 is fake (fraudulent)
    - 0: 17014
    - 1: 866
- Features to drop:
    - 'title': No standardization of naming job titles, >1000 different titles
- Categorical Features: 'location', 'salary_range', 'telecommuting', 'has_company_logo', 'has_questions', 'employment_type', 'required_experience', 'required_education', 'industry', 'function'
- NLP Features: 'company_profile', 'description', 'requirements', 'benefits'

In [3]:
# Print the dataset overview for the 'fraudulent' target (see the output below:
# observation count, class balance, and missing-value counts per column).
df_info(df, 'fraudulent')

Total Observations: 17880
Target Variable: fraudulent
Classes: 2
Imbalance: 0 - 17014, 1 - 866
Imbalance Ratio: 0 - 95.16%, 1 - 4.84%

No missing values: title, telecommuting, has_company_logo, has_questions, fraudulent

Values Missing:
---------------
location: 346 (1.94%)
department: 11547 (64.58%) ***
salary_range: 15012 (83.96%) ***
company_profile: 3308 (18.5%)
description: 1 (0.01%)
requirements: 2695 (15.07%)
benefits: 7210 (40.32%) ***
employment_type: 3471 (19.41%)
required_experience: 7050 (39.43%) ***
required_education: 8105 (45.33%) ***
industry: 4903 (27.42%) ***
function: 6455 (36.1%) ***


In [4]:
# Remove the free-text 'title' column; rebinding keeps the step free of in-place mutation
df = df.drop(columns='title')

### Narrow down to US job postings

In [6]:
# Only keeping US job postings (location strings that start with 'US').
# Rows with a missing location compare as False and are dropped as well.
# .copy() gives later cells an independent frame, so adding columns such as
# 'salary' or 'state' does not raise SettingWithCopyWarning.
df = df.loc[df['location'].str[:2] == 'US'].copy()

### Run Chi-squared tests on features with missing values
- Null hypothesis: the proportion of fraudulent job postings is the same for postings where the feature is missing (null) and postings where it is present (non-null)

In [7]:
# Test 'department' against the 'fraudulent' label.
# NOTE(review): the p-values printed for the tests in this section equal
# exp(-chi2 / 2), i.e. 2 degrees of freedom, while a null/non-null x fraudulent
# table would have 1 -- confirm how chi_sq builds its contingency table.
chi_sq(df, target='fraudulent', feature='department')

Reject Null Hypothesis
Chi-Squared: 66.3287
p-value: 0.0


In [8]:
# Test 'required_education' against the 'fraudulent' label
chi_sq(df, target='fraudulent', feature='required_education')

Reject Null Hypothesis
Chi-Squared: 12.3865
p-value: 0.002


In [9]:
# Test 'required_experience' against the 'fraudulent' label
chi_sq(df, target='fraudulent', feature='required_experience')

Reject Null Hypothesis
Chi-Squared: 16.2374
p-value: 0.0003


In [10]:
# Test 'industry' against the 'fraudulent' label
chi_sq(df, target='fraudulent', feature='industry')

Cannot Reject Null Hypothesis
Chi-Squared: 1.8199
p-value: 0.4025


In [11]:
# Test 'function' against the 'fraudulent' label
chi_sq(df, target='fraudulent', feature='function')

Cannot Reject Null Hypothesis
Chi-Squared: 2.914
p-value: 0.2329


In [12]:
# Test 'employment_type' against the 'fraudulent' label
chi_sq(df, target='fraudulent', feature='employment_type')

Reject Null Hypothesis
Chi-Squared: 48.2519
p-value: 0.0


In [13]:
# Test 'salary_range' against the 'fraudulent' label
chi_sq(df, target='fraudulent', feature='salary_range')

Reject Null Hypothesis
Chi-Squared: 65.1333
p-value: 0.0


### Salary range feature
- The share of fake postings is roughly twice as high among postings that include a salary range (about 11.6%) as among postings that do not (about 6.0%)
- Convert feature to whether or not salary is posted

In [14]:
# Class proportions of 'fraudulent' among postings with no salary range
salary_missing = df['salary_range'].isna()
df.loc[salary_missing, 'fraudulent'].value_counts(normalize=True)

0    0.939664
1    0.060336
Name: fraudulent, dtype: float64

In [15]:
# Class proportions of 'fraudulent' among postings that list a salary range
salary_present = df['salary_range'].notna()
df.loc[salary_present, 'fraudulent'].value_counts(normalize=True)

0    0.883751
1    0.116249
Name: fraudulent, dtype: float64

In [16]:
# Binary indicator: 1 when a salary range was posted, 0 when it is missing;
# the raw 'salary_range' text is then removed.
has_salary = df['salary_range'].notna()
df['salary'] = np.where(has_salary, 1, 0)
df = df.drop(columns='salary_range')

### Department and employment type features

In [17]:
# Class proportions of 'fraudulent' among postings with no department
department_missing = df['department'].isna()
df.loc[department_missing, 'fraudulent'].value_counts(normalize=True)

0    0.94408
1    0.05592
Name: fraudulent, dtype: float64

In [18]:
# Class proportions of 'fraudulent' among postings that list a department
department_present = df['department'].notna()
df.loc[department_present, 'fraudulent'].value_counts(normalize=True)

0    0.899934
1    0.100066
Name: fraudulent, dtype: float64

In [19]:
# Class proportions of 'fraudulent' among postings with no employment type
employment_missing = df['employment_type'].isna()
df.loc[employment_missing, 'fraudulent'].value_counts(normalize=True)

0    0.893303
1    0.106697
Name: fraudulent, dtype: float64

In [20]:
# Class proportions of 'fraudulent' among postings that list an employment type
employment_present = df['employment_type'].notna()
df.loc[employment_present, 'fraudulent'].value_counts(normalize=True)

0    0.93906
1    0.06094
Name: fraudulent, dtype: float64

### Clean location feature
- Replace with state dummies

In [None]:
# Create 'state' feature; postings with no extractable code get 'No State'.
# The pattern captures the first pair of consecutive capital letters that is
# not 'US'. expand=False returns a 1-D Series, so the extraction runs once and
# the column is assigned from a Series rather than an (n, 1) array.
# NOTE(review): the pattern is not anchored to a particular position in the
# location string, so any capitalised pair can match -- presumably why 'AU' and
# 'LO' are overridden below; confirm against the raw location values.
state_code = df['location'].str.extract(r'([A-Z]{2}(?<!US))', expand=False)
df['state'] = (
    state_code
    .fillna('No State')
    .replace({'AU': 'No State', 'LO': 'No State'})
)
df = df.drop(columns='location')

In [None]:
# Build one indicator column per 'state' value (via a categorical), append them
# to the frame, and leave the original 'state' column out of the result.
state_dummies = pd.get_dummies(df['state'].astype('category'))
df = pd.concat([df.drop(columns='state'), state_dummies], axis=1)