# Job Posting Data Acquisition and EDA

In [1]:
import numpy as np
import pandas as pd
# from gensim.models import Word2Vec
# from nltk import word_tokenize
# import matplotlib.pyplot as plt
# import seaborn as sns

import dfunc
from dfunc import df_info

# Display: let wide frames show up to 300 columns before pandas truncates them.
pd.set_option('display.max_columns', 300)

# Reproducibility: pin NumPy's global random state for the rest of the notebook.
np.random.seed(0)

In [2]:
# Load the job-postings CSV from the working directory; the file's first
# column is used as the row index rather than as a feature column.
df = pd.read_csv('fake_job_postings.csv', index_col=0)

In [None]:
# Preview the first three rows of the freshly loaded frame.
df.head(3)

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

## Initial Data Cleaning/Engineering and EDA
- Total Observations: 17880 rows
- Total Features: 16 columns
- Target Variable: 'fraudulent', 0 is real, 1 is fake
    - 0: 17014
    - 1: 866
- Features to drop:
    - 'title': No standardization of naming job titles
    - 'department': No standardization of departments, 'function' can replace
- Categorical Features: 'location', 'salary_range', 'telecommuting', 'has_company_logo', 'has_questions', 'employment_type', 'required_experience', 'required_education', 'industry', 'function'
- NLP Features: 'company_profile', 'description', 'requirements', 'benefits'

In [None]:
# Number of postings labelled 1 in the target column (explicit label lookup).
df['fraudulent'].value_counts().loc[1]

In [3]:
# Summarise the frame with the project helper, passing 'fraudulent' as the target column.
# NOTE(review): df_info lives in the local dfunc module; judging by the captured output
# it reports class balance and per-column missing-value counts — confirm against dfunc.
df_info(df, 'fraudulent')

Target variable: fraudulent
----------------
Classes: 2
Imbalance: 0 - 17014, 1 - 866

No missing values: title, telecommuting, has_company_logo, has_questions, fraudulent

Values Missing:
---------------
location: 346 (1.94%)
department: 11547 (64.58%) ***
salary_range: 15012 (83.96%) ***
company_profile: 3308 (18.5%)
description: 1 (0.01%)
requirements: 2695 (15.07%)
benefits: 7210 (40.32%) ***
employment_type: 3471 (19.41%)
required_experience: 7050 (39.43%) ***
required_education: 8105 (45.33%) ***
industry: 4903 (27.42%) ***
function: 6455 (36.1%) ***


In [None]:
# Class balance of the target expressed as proportions instead of raw counts.
df['fraudulent'].value_counts(normalize=True)

In [None]:
# Count of missing entries in the 'title' column only
# (select the column first so the rest of the frame is not scanned).
df['title'].isna().sum()

In [None]:
# Number of rows currently in the frame.
len(df)

In [None]:
# Drop the two columns listed under "Features to drop" above.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError for columns that are already gone.
df = df.drop(columns=['title', 'department'], errors='ignore')

### Narrow down to US job postings

In [None]:
# Only keeping US job postings: the location string must begin with 'US'.
# Rows whose location is missing compare as False and are dropped as well.
us_mask = df['location'].str[:2] == 'US'

# .copy() gives an independent frame; without it the later column assignment
# and column drop act on a boolean-mask slice, which raises
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df.loc[us_mask].copy()

# Tally the classes once per variant and reuse the results below.
class_counts = df['fraudulent'].value_counts()
class_shares = df['fraudulent'].value_counts(normalize=True)

print('Real:', class_counts[0])
print('Fake:', class_counts[1])
print('Ratio of Real to Fake: {} to {}'.format(round(class_shares[0], 2),
                                               round(class_shares[1], 2)))

### Salary range feature
- Ratio of real to fake job postings is much greater in postings that include salary range
- Convert feature to whether or not salary is posted

In [None]:
# Postings with no salary range: share of real vs. fake.
# Row mask and column are selected in a single .loc call.
df.loc[df['salary_range'].isna(), 'fraudulent'].value_counts(normalize=True)

In [None]:
# Postings that do list a salary range: share of real vs. fake.
# Row mask and column are selected in a single .loc call.
df.loc[df['salary_range'].notna(), 'fraudulent'].value_counts(normalize=True)

In [None]:
# Binary indicator: 1 when the posting states a salary range, 0 when it is missing.
df['salary'] = np.where(df['salary_range'].notna(), 1, 0)

In [None]:
# The 'salary' indicator now carries this information, so remove the raw column.
# Assigning the result back instead of using inplace=True, plus errors='ignore',
# lets this cell be re-run without a KeyError once the column is gone, and avoids
# an in-place mutation of a frame that was produced by row filtering.
df = df.drop(columns='salary_range', errors='ignore')

In [None]:
# Preview the first ten rows of the frame in its current state.
df.head(10)