
<h2>1. Reading the data set and doing some quick data exploration</h2>

In [17]:
import pandas as pd

In [18]:
raw_data = pd.read_csv('Data/fake_job_postings.csv')

In [19]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15184 non-null  object
 8   benefits             10668 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

In [22]:
raw_data.isnull().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

<p>Too many empty values for department and salart_range fields</p>

In [23]:
raw_data.nunique()

job_id                 17880
title                  11231
location                3105
department              1337
salary_range             874
company_profile         1709
description            14801
requirements           11967
benefits                6204
telecommuting              2
has_company_logo           2
has_questions              2
employment_type            5
required_experience        7
required_education        13
industry                 131
function                  37
fraudulent                 2
dtype: int64

In [24]:
raw_data.head(2)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0


In [25]:
raw_data.groupby('fraudulent')['job_id'].count()

fraudulent
0    17014
1      866
Name: job_id, dtype: int64

<h3>Class imbalane found for fraudulent class label</h3>

In [26]:
raw_data.drop(columns=['salary_range','department'],inplace=True)

In [27]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   company_profile      14572 non-null  object
 4   description          17879 non-null  object
 5   requirements         15184 non-null  object
 6   benefits             10668 non-null  object
 7   telecommuting        17880 non-null  int64 
 8   has_company_logo     17880 non-null  int64 
 9   has_questions        17880 non-null  int64 
 10  employment_type      14409 non-null  object
 11  required_experience  10830 non-null  object
 12  required_education   9775 non-null   object
 13  industry             12977 non-null  object
 14  function             11425 non-null  object
 15  fraudulent           17880 non-null  int64 
dtypes: i

In [28]:
continuous_cols = [
    'job_id',
    'telecommuting',
    'has_company_logo',
    'has_questions',
    'fraudulent'
]
categorical_cols = [
    'title',
    'location',
    'department',
    'company_profile',
    'requirements',
    'benefits',
    'employment_type',
    'required_experience',
    'required_education',
    'industry',
    'function'
]

In [29]:
sub_categorical_cols = [
    'location',
    'employment_type',
    'required_experience',
    'required_education',
    'industry',
    'function'
]
raw_data[sub_categorical_cols] = raw_data[sub_categorical_cols].fillna('Unknown')

<p>dropping null values of description because it contains only 1 null value for this field </p>

In [30]:
raw_data.dropna(subset=['description'], inplace=True)

In [31]:
text_cols = ['company_profile', 'requirements', 'benefits']
raw_data[text_cols] = raw_data[text_cols].fillna('Unknown')

In [53]:
raw_data.isnull().sum()

job_id                 0
title                  0
location               0
company_profile        0
description            0
requirements           0
benefits               0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
dtype: int64

<h2>2. Preprocessing the dataset</h2>

In [37]:
import regex as re
import nltk
from nltk.corpus import stopwords

In [38]:
nltk.download('stopwords')
stopwords_set = set(stopwords.words('english'))
stopwords_set

[nltk_data] Downloading package stopwords to /Users/nani/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [39]:
def preprocess(text):
  if isinstance(text,str):
    text = re.sub(r'<.*?>', '',text)
    text = re.sub(r'[^a-zA-Z\s]', '',text)
    text = text.lower()
    words = text.split()
    words = [word for word in words if word not in stopwords_set]
    return ' '.join(words)
  return text

In [40]:
text_cols.extend(['title','description'])

In [46]:
raw_data['description']

0        Food52, a fast-growing, James Beard Award-winn...
1        Organised - Focused - Vibrant - Awesome!Do you...
2        Our client, located in Houston, is actively se...
3        THE COMPANY: ESRI – Environmental Systems Rese...
4        JOB TITLE: Itemization Review ManagerLOCATION:...
                               ...                        
17875    Just in case this is the first time you’ve vis...
17876    The Payroll Accountant will focus primarily on...
17877    Experienced Project Cost Control Staff Enginee...
17878    Nemsia Studios is looking for an experienced v...
17879    Who are we?Vend is an award winning web based ...
Name: description, Length: 17879, dtype: object

In [47]:
raw_data[text_cols] = raw_data[text_cols].map(preprocess)

In [48]:
raw_data['description']

0        food fastgrowing james beard awardwinning onli...
1        organised focused vibrant awesomedo passion cu...
2        client located houston actively seeking experi...
3        company esri environmental systems research in...
4        job title itemization review managerlocation f...
                               ...                        
17875    case first time youve visited website vend awa...
17876    payroll accountant focus primarily payroll fun...
17877    experienced project cost control staff enginee...
17878    nemsia studios looking experienced visualgraph...
17879    wevend award winning web based point sale soft...
Name: description, Length: 17879, dtype: object

In [49]:
empty_strings = (raw_data[text_cols] == '').sum()
print("Empty strings in each text column:\n", empty_strings)

Empty strings in each text column:
 company_profile     0
requirements        8
benefits           29
title               0
description         1
dtype: int64


In [50]:
text_lengths = raw_data[text_cols].map(len)
print("Length of text in each column after preprocessing:\n", text_lengths)

Length of text in each column after preprocessing:
        company_profile  requirements  benefits  title  description
0                  642           671         7     16          706
1                  884          1076       786     39         1543
2                  614          1103         7     37          274
3                  478          1209       687     31         2094
4                 1331           648        21     19         1226
...                ...           ...       ...    ...          ...
17875             1229           990       509     29         1020
17876             1624           530       465     18         1004
17877              182           879         7     55         1083
17878                7           362       157     16          356
17879             1229           698         7     26         1279

[17879 rows x 5 columns]


In [51]:
raw_data = raw_data[raw_data['description'] != '']
raw_data['requirements'].replace('', 'Unknown', inplace=True)
raw_data['benefits'].replace('', 'Unknown', inplace=True)
empty_strings = (raw_data[text_cols] == '').sum()
print("Empty strings after fixing:\n", empty_strings)

Empty strings after fixing:
 company_profile    0
requirements       0
benefits           0
title              0
description        0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_data['requirements'].replace('', 'Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_data['benefits'].replace('', 'Unknown', inplace=True)


In [54]:
raw_data.to_csv('Data/cleaned_fake_job_postings.csv', index=False)