# Data Cleaning and Preprocessing

In [None]:
#importing modules
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 

In [3]:
# Read the raw job-postings CSV into a DataFrame and preview its first three rows.
data = pd.read_csv(filepath_or_buffer='fake_job_postings.csv')
data.head(n=3)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0


In [4]:
# Drop exact duplicate rows, keeping the first occurrence. drop_duplicates()
# returns a new DataFrame rather than modifying `data`, so the result must be
# assigned back for the de-duplication to take effect.
# NOTE(review): job_id is still a column at this point, so rows that differ only
# in job_id are not treated as duplicates -- confirm that this is intended.
data = data.drop_duplicates(keep='first')

# Percentage of missing values in each column.
percent_missing = data.isnull().sum() * 100 / len(data)
missing_values_data = pd.DataFrame({'percent_missing': percent_missing})
print(missing_values_data)

                     percent_missing
job_id                      0.000000
title                       0.000000
location                    1.935123
department                 64.580537
salary_range               83.959732
company_profile            18.501119
description                 0.005593
requirements               15.072707
benefits                   40.324385
telecommuting               0.000000
has_company_logo            0.000000
has_questions               0.000000
employment_type            19.412752
required_experience        39.429530
required_education         45.329978
industry                   27.421700
function                   36.101790
fraudulent                  0.000000


In [5]:
# 'job_id' is not expected to help the model and more than 80% of
# 'salary_range' is missing, so both columns are removed from the frame.
data.drop(columns=['job_id', 'salary_range'], inplace=True)

In [6]:
# Replace NaN with the placeholder string 'missing' in each listed column.
columns = [
    'location',
    'department',
    'company_profile',
    'description',
    'requirements',
    'benefits',
    'employment_type',
    'required_experience',
    'required_education',
    'industry',
    'function',
]
data[columns] = data[columns].fillna(value='missing')

In [7]:
# Transform the 0/1 flag columns into descriptive strings so they can be joined
# into the combined text field later on.
# The relabelled column is assigned back explicitly: calling
# replace(..., inplace=True) on a column selection (data[col]) acts on an
# intermediate object and does not reliably update `data` itself.
binary_labels = {
    'telecommuting': {1: 'telecommuting', 0: 'no telecommuting'},
    'has_company_logo': {1: 'has logo', 0: 'no logo'},
    'has_questions': {1: 'has questions', 0: 'no questions'},
}
for column, labels in binary_labels.items():
    data[column] = data[column].replace(labels)

In [9]:
# Build a single free-text field by joining the textual columns with spaces.
text_columns = [
    'title', 'company_profile', 'description', 'requirements', 'benefits',
    'employment_type', 'required_experience', 'required_education',
    'industry', 'function', 'telecommuting', 'has_company_logo', 'has_questions',
]
data['text'] = data[text_columns].agg(' '.join, axis=1)

# Take an explicit copy so that data_new is an independent DataFrame; writing to
# a bare column selection of `data` makes pandas emit SettingWithCopyWarning.
data_new = data[['text', 'fraudulent']].copy()
data_new.head()

Unnamed: 0,text,fraudulent
0,"Marketing Intern We're Food52, and we've creat...",0
1,Customer Service - Cloud Video Production 90 S...,0
2,Commissioning Machinery Assistant (CMA) Valor ...,0
3,Account Executive - Washington DC Our passion ...,0
4,Bill Review Manager SpotSource Solutions LLC i...,0


In [10]:
#preprocessing the dataset - normalising, removal of punctuation, urls and special characters

# Ordered (pattern, replacement) clean-up steps applied to the lower-cased text.
# Raw string literals keep regex escapes such as \S and \d from being parsed as
# invalid Python string escapes (a SyntaxWarning on recent Python versions).
_CLEANUP_STEPS = (
    (r'https?://\S+|www\.\S+', ' '),  # URLs
    (r'#\S+', ' '),                   # '#' followed by non-whitespace characters
    (r'&amp;+', ' '),                 # '&amp' followed by one or more ';'
    (r'[^A-Za-z0-9$,]+', ' '),        # anything except letters, digits, '$' and ','
    (r'\d', ' '),                     # digits
    ('\xa0+', ' '),                   # non-breaking spaces
    (r',', ''),                       # commas are removed without inserting a space
    (r'\s+', ' '),                    # collapse runs of whitespace
)


def preprocess(df):
    """Clean and lemmatize the 'text' entry of a single record.

    The text is lower-cased, run through the substitutions in
    ``_CLEANUP_STEPS`` in order, tokenized on ``\\w+`` and every token is
    lemmatized as a verb with WordNet.

    Parameters
    ----------
    df : mapping or pandas.Series
        One record exposing a string under the key ``'text'`` (this is what
        ``DataFrame.apply(..., axis=1)`` passes for each row).

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    text = df['text'].lower()
    for pattern, replacement in _CLEANUP_STEPS:
        text = re.sub(pattern, replacement, text)

    tokens = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(text)
    # NOTE(review): the r'\w+' tokenizer never yields a token containing '$',
    # so the '$' branch of this filter cannot match -- confirm whether '$' was
    # meant to survive tokenization.
    words = [word for word in tokens if (word.isalnum() or ('$' in word))]

    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word, pos='v') for word in words)
# Rebind data_new to a new DataFrame that carries the cleaned text instead of
# writing a column into the existing frame; this avoids SettingWithCopyWarning.
data_new = data_new.assign(text=data_new[['text']].apply(preprocess, axis=1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
# Write the cleaned text and labels to disk, leaving out the index column.
data_new.to_csv(path_or_buf='preprocessed.csv', index=False)