In [1]:
# Step1 - importing libraries
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# step2 - read csv file into pandas data frame
fake_job_detector_df = pd.read_csv('data/fake_job_postings.csv')

In [6]:
# step3 - find number of rows and columns in data set
fake_job_detector_df.shape

(17880, 18)

## understand dataset - structure of data in dataset 
                     a. what each feature means
                     b. what are possible values of each feature means? 
                        
1. job_id - number given to each job posting 
   assumption - no two job can have same id.
2. title - label given to employee to identify employee's role, 
           responsibilities, hierachy level.
3. location - physical place where employee do his/her task.
4. department - distinct unit or division within company 
               that specialize in specific work.
5. salary_range - maximum and minimum amount given by employer 
                   for specific job
6. company profile - professional overview of employer that provides
                    information about
                    1. mission
                    2. values
                    3. products/services
                    4. history
                    5. culture
                    6. achievements

                    to job seekers and potential employees.
7. description - describe job duties, responsibilities, required skills,
                 qualifications that serve as a guide to both
                 employer and employee.
8. requirements - essential skills, qualifications, experience, 
                  education, personal attributes that 
                  an employee must have to do a specific job 
                  successfully.
7. benifits - attract right candiates,
              improve candidate matching
              sets clear expectations
              communicates company culture
8. telecommuting - working from home or working from location that 
                isn't official workplace
9. has_company_logo - does job posting contains company logo
            0 -> doesn't contains company logo
            1 -> contains company logo
10. has_questions  - does job posting contains questions for any 
                      job seeker?
                     0 -> no question
                     1 -> question
11. employment_type - type of employment
                     Full-time -> permanent role with regular hours,
                                  benifits, consistent employment
                     Contract -> temporary 
12. required_experience - work related history, skills, duration 
                          an employer expects candiates to have to
                           be successful in a role.
13. required_education - 
14. industry - classification of businesses that provides
                simiar goods or services.
15. function - specific role or task performed within company
16. fraudlent -  job posting is fake or not
                 0 -> not fake
                 1 -> fake

In [39]:
fake_job_detector_df.head(20)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
5,6,Accounting Clerk,"US, MD,",,,,Job OverviewApex is an environmental consultin...,,,0,0,0,,,,,,0
6,7,Head of Content (m/f),"DE, BE, Berlin",ANDROIDPIT,20000-28000,"Founded in 2009, the Fonpit AG rose with its i...",Your Responsibilities: Manage the English-spea...,Your Know-How: ...,Your Benefits: Being part of a fast-growing co...,0,1,1,Full-time,Mid-Senior level,Master's Degree,Online Media,Management,0
7,8,Lead Guest Service Specialist,"US, CA, San Francisco",,,Airenvy’s mission is to provide lucrative yet ...,Who is Airenvy?Hey there! We are seasoned entr...,"Experience with CRM software, live chat, and p...",Competitive Pay. You'll be able to eat steak e...,0,1,1,,,,,,0
8,9,HP BSM SME,"US, FL, Pensacola",,,Solutions3 is a woman-owned small business who...,Implementation/Configuration/Testing/Training ...,MUST BE A US CITIZEN.An active TS/SCI clearanc...,,0,1,1,Full-time,Associate,,Information Technology and Services,,0
9,10,Customer Service Associate - Part Time,"US, AZ, Phoenix",,,"Novitex Enterprise Solutions, formerly Pitney ...",The Customer Service Associate will be based i...,Minimum Requirements:Minimum of 6 months custo...,,0,1,0,Part-time,Entry level,High School or equivalent,Financial Services,Customer Service,0


In [None]:
# 
fake_job_detector_df['job_id'].min()
fake_job_detector_df['job_id'].max()



98       1
144      1
173      1
180      1
215      1
        ..
17827    1
17828    1
17829    1
17830    1
17831    1
Name: fraudulent, Length: 866, dtype: int64

In [None]:
# find type of data - numerical, categorical, mixed
# numerical - job_id
# categorical - telecommuting, has_company_logo, has_questions, employment_type, 
#               required_experience, required_education, industry, function, 
#               fraudlent, department, salary_range 
# mixed - title, location, company_profile, description, requirements, 
#            benefits, 

In [None]:
# numerical analysis 
# univariate analysis

# assumption - fake job posting with less experience and high salary 
#              range are considered as fake job. 

## univariate analysis on numerical feature

## salary range

**Conclusions**


In [43]:
fake_job_detector_df['salary_range'].isnull().sum() / len(fake_job_detector_df['salary_range'])

np.float64(0.8395973154362416)

## Univariate analysis on categorical feature 

In [85]:
# fake_job_detector_df[fake_job_detector_df['salary_range']].isnull().sum() / len(fake_job_detector_df['salary_range'])

len(fake_job_detector_df['salary_range'])
fake_job_detector_df['salary_range'].isnull().sum() / len(fake_job_detector_df['salary_range']) * 100



# fake_job_detector_df['salary_range'].count() / len(fake_job_detector_df['salary_range']) *100

np.float64(83.95973154362416)

In [72]:

fake_job_detector_df['salary_range'].isnull().count()



np.int64(17880)

In [90]:
# import necessary libraries for sckit-learn
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [102]:
# fill missing values 



X = fake_job_detector_df.drop(columns='title')
y = fake_job_detector_df['title']

# find missing values in all features
fake_job_detector_df.isnull().mean() * 100

job_id                  0.000000
title                   0.000000
location                1.935123
department             64.580537
salary_range           83.959732
company_profile        18.501119
description             0.005593
requirements           15.078300
benefits               40.335570
telecommuting           0.000000
has_company_logo        0.000000
has_questions           0.000000
employment_type        19.412752
required_experience    39.429530
required_education     45.329978
industry               27.421700
function               36.101790
fraudulent              0.000000
dtype: float64

In [99]:
fake_job_detector_df.shape

(17880, 18)

In [103]:
# split dataset into training dataset and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=2)
X_train.shape, X_test.shape

((14304, 17), (3576, 17))

In [106]:
X_train.isnull().mean()


job_id                 0.000000
location               0.018736
department             0.645833
salary_range           0.838926
company_profile        0.186242
description            0.000070
requirements           0.151356
benefits               0.405970
telecommuting          0.000000
has_company_logo       0.000000
has_questions          0.000000
employment_type        0.194491
required_experience    0.394155
required_education     0.450364
industry               0.274399
function               0.359899
fraudulent             0.000000
dtype: float64

## 
