# EDA

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [13]:
import httpimport
url = 'https://raw.githubusercontent.com/zach-brown-18/class-toolkit/main/eda/'
with httpimport.remote_repo(['cleaning', 'columns'], url):
    import cleaning as c
    import columns as cols

---

# Load and Look at Data

In [14]:
dice = pd.read_csv('../data/dice.csv', encoding='latin-1')
print(dice.shape)

(21919, 12)


In [15]:
# Drop the fluff
use_cols = ['date_added', 'job_description', 'job_title', 'job_type', 'location', 'organization', 'sector']
dice = dice.loc[:, use_cols]

In [16]:
# 7 total columns used
print(dice.shape)
dice.head(2)

(21919, 7)


Unnamed: 0,date_added,job_description,job_title,job_type,location,organization,sector
0,11/11/2016,"Minimum Required Skills:EDI, TrustedLink, AS2,...",EDI Analyst,"Full Time, Full-time, Employee","Stamford, CT",CyberCoders,"EDI, TrustedLink, AS2, VAN - EDI, TrustedLink,..."
1,11/11/2016,"InformaticaåÊ/ ETL DeveloperSt, Petersburg, FL...",Informatica ETL Developer,"Full Time, Full Time","St Petersburg, FL",TrustMinds,ETL Informatica B2B Data Exchange Netezza Orac...


---

# Cleaning

## Functions

In [17]:
def remove_duplicate_skills(idx,df):
    skills = df.loc[idx, 'sector'].split(' -')[0]
    try:
        df.loc[idx, 'job_description'] = df.loc[idx, 'job_description'].split(skills)[1]
    except:
        print(f'{idx} failed.')

In [18]:
def remove_unwanted_characters(s):
    s = re.sub('\*', ' ', s)
    s = re.sub('/', ' ', s)
    s = re.sub('_', ' ', s)
    s = re.sub('\?', ' ', s)
    s = re.sub('%', ' ', s)
    s = re.sub('@', ' ', s)
    s = re.sub('#', ' ', s)
    s = re.sub('!', ' ', s)
    s = re.sub(',', ' ', s)
    s = re.sub('\+', ' ', s)
    s = re.sub('\d', ' ', s)
    return s

In [19]:
def rem_fluff(data):
    data = data.replace('please share resume', " ")
    data = data.replace('call me', " ")
    data = data.replace('if you are a', " ")
    data = data.replace('if you are an', " ")
    data = data.replace('Job Description', " ")
    data = data.replace('If you are', " ")
    data = data.replace('Reply', " ")
    return data

In [20]:
# function to get rid of repeated skills after a '-'
def split_skills(data):
        x = data
        nospace = x.replace(' ', '') # removing spaces
        if '-' in nospace:
            first_half = nospace.split('-')[0] #first half of string before '-'
            second_half = nospace.split('-')[1] # second half of string after '-'
            if first_half == second_half:
                x = list(x.split('-'))[0].strip() #getting only first half of string to return
                return x
            else:
                return x      
        else:
            return x

In [21]:
def drop_corrupt_rows(df):
    df['word_count'] = df['job_description'].map(lambda x: len(x.split()))
    
    mean_word_count = df['word_count'].mean()
    print('Mean word count per post:', round(mean_word_count))
    
    std_word_count = df['word_count'].std()
    print('std word count per post:', round(std_word_count))
    
    too_short = mean_word_count - 1.5*std_word_count
    too_short = 20
    
    # Drop posts with 20 word or less
    print(len(df), 'before dropping')
    mask = df['word_count'] > too_short
    df = df[mask]
    print(len(df), 'after dropping')

    df.drop(columns='word_count', inplace=True)

---

## Cleaning job_description

In [22]:
# Remove unwanted characters and numbers
dice['sector'].fillna('', inplace=True)
dice['job_description'] = dice['job_description'].map(remove_unwanted_characters)
dice['job_description'] = dice['job_description'].map(rem_fluff)
dice['sector'] = dice['sector'].map(remove_unwanted_characters)
dice['sector'] = dice['sector'].map(rem_fluff)

# Correct multiple spaces (including \t and \n)
dice['job_description'] = dice['job_description'].map(lambda x: re.sub('\s+',' ', x))
dice['sector'].fillna('', inplace=True)
dice['sector'] = dice['sector'].map(lambda x: re.sub('\s+',' ', x))

# Remove special characters
dice['job_description'] = dice['job_description'].map(c.remove_special_chars)
dice['sector'] = dice['sector'].map(c.remove_special_chars)

# Drop corrupted rows
drop_corrupt_rows(dice)

# Remove skills repeated in job description and sector
skills_repeated = dice['job_description'].map(lambda x: 'Minimum Required Skills' in x)
for row in dice.loc[skills_repeated, :].index:
    remove_duplicate_skills(row, dice)

Mean word count per post: 339
std word count per post: 186
21919 before dropping
21800 after dropping


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


11646 failed.
11655 failed.
11676 failed.
17688 failed.
17697 failed.
17733 failed.
17734 failed.
18800 failed.


## Search Duplicate Posts

In [23]:
# Find duplicate posts
description_counts = dice['job_description'].value_counts()
multiple_posts = description_counts > 1
description_counts[multiple_posts].sum()

3528

In [24]:
# CyberCoders are spamming the list
repeated_descriptions = description_counts[multiple_posts].index
repeated_descriptions = dice['job_description'].map(lambda x: x in repeated_descriptions)

dice.loc[repeated_descriptions, :]['organization'].value_counts().head()

CyberCoders               2898
TEKsystems                 105
Robert Half Technology      56
Citrix                      39
NORTHROP GRUMMAN            38
Name: organization, dtype: int64

In [25]:
# Majority of repeated posts are only repeated once
description_counts[description_counts == 2].sum()

1776

**Conclusion:** Repeated, identical posts will distort the data pool. If a post is repeated, drop all occurances but one.

## Drop Duplicate job posts

In [26]:
print(f'{dice.shape[0]} job listings before dropping duplicates')

21919 job listings before dropping duplicates


In [27]:
# Define duplicate posts as having identical job description, job type and organization.
non_duplicates = dice.drop(columns=['date_added', 'location', 'job_title', 'sector']).drop_duplicates().index
dice = dice.loc[non_duplicates, :].reset_index(drop=True)

print(f'{dice.shape[0]} job listings after dropping duplicates')

19800 job listings after dropping duplicates


In [28]:
# Find duplicate posts - still some left over
description_counts = dice['job_description'].value_counts()
multiple_posts = description_counts > 1
print(f'{description_counts[multiple_posts].sum()} duplicates left over')

93 duplicates left over


## Remove phone numbers and email addresses

In [30]:
# Remove them from the job description
dice['job_description'] = dice['job_description'].map(c.remove_phone_numbers)
dice['job_description'] = dice['job_description'].map(c.remove_emails)

In [31]:
# Replace useless text with empty string
mask = dice['job_description'] == 'Please send resume with rate expectations.'
idx = dice[mask]['job_description'].index
dice.loc[idx, 'job_description'] = ''

In [32]:
dice.reset_index(drop=True, inplace=True)

In [33]:
dice[dice['job_description'] == '']

Unnamed: 0,date_added,job_description,job_title,job_type,location,organization,sector,word_count
15768,5/9/2016,,Sr Java Application Developer (backend),"Full Time, 12+ months","Pasadena, CA",Acclaim Systems,Java backend developer Junit,6
17836,12/8/2016,,Sr Technical Project Manager (Human Services),"Contract W2, C2H Independent, 12+ months","Little Rock, AR",Acclaim Systems,IT Project Management Curam Cash Assistance Cu...,6
17887,12/8/2016,,Senior Curam Developer,"Full Time, Contract Corp-To-Corp, Contract Ind...","Little Rock, AR",Acclaim Systems,IT Architecture Curam Cash Assistance Curam Ou...,6


## Cleaning job_title

In [34]:
# Finding null values in job_title
dice[dice['job_title'].isnull()]

Unnamed: 0,date_added,job_description,job_title,job_type,location,organization,sector,word_count
13558,11/4/2016,Our end client is seeking technicians with exp...,,,,,,21


In [35]:
# Removing the one null job_title
dice = dice.dropna(subset = ['job_title'])

### catagorizing job_title

In [36]:
dice.loc[dice['job_title'].str.contains('Developer|developer|Dev|dev|Scrum|scrum'), 
                  'job_title'] = 'Developer'
dice.loc[dice['job_title'].str.contains('Analyst|analyst'), 
                  'job_title'] = 'Analyst'
dice.loc[dice['job_title'].str.contains('Programmer|programmer|Programming|programming|Full'),
                  'job_title'] = 'Programmer'
dice.loc[dice['job_title'].str.contains('Manager|manager|Project Coordinator|Technical Lead'), 
                  'job_title'] = 'Manager'
dice.loc[dice['job_title'].str.contains('Engineer|engineer'), 
                  'job_title'] = 'Engineer'
dice.loc[dice['job_title'].str.contains('Architect|architect'), 
                  'job_title'] = 'Architect'
dice.loc[dice['job_title'].str.contains('Designer|designer'), 
                  'job_title'] = 'Designer'
dice.loc[dice['job_title'].str.contains('Technician|technician'), 
                  'job_title'] = 'Technician'
dice.loc[dice['job_title'].str.contains('Administrator|administrator|Admin|admin'), 
                  'job_title'] = 'Administrator'
dice.loc[dice['job_title'].str.contains('Consulting|consulting|Consultation|consultation|Consultant|consultant'), 
                  'job_title'] = 'Consulting'
dice.loc[dice['job_title'].str.contains('Support|support|Helpdesk'), 
                  'job_title'] = 'Support'
dice.loc[dice['job_title'].str.contains('Director|director|CTO'),
                  'job_title'] = 'Director'
dice.loc[dice['job_title'].str.contains('Entry'), 
                  'job_title'] = 'Entry Position'
dice.loc[dice['job_title'].str.contains('Data Scientist|SQL DBA|SQL Server DBA|IT'), 
                  'job_title'] = 'Data Position'

In [37]:
dice['job_title'].value_counts()[0:13].sum()

16871

In [38]:
# Creating a list of the top 13 most popular job_titles
keep_titles = dice['job_title'].value_counts()[:13].index.tolist()
# Only keeping the top 13 job_titles from keep_titles
dice = dice[dice['job_title'].isin(keep_titles)]

In [39]:
dice['job_title'].value_counts()

Developer        5319
Engineer         4167
Analyst          1934
Manager          1416
Administrator     911
Architect         838
Consulting        598
Technician        352
Support           348
Programmer        325
Data Position     290
Designer          224
Director          149
Name: job_title, dtype: int64

## Cleaning location

### Catagorizing location

In [40]:
dice.loc[dice['location'].str.contains('ME|NH|VT|MA|MARYLAND|Laurel|CT|Ct|RI|NY|Ny|PA|NJ|Reading|OTHER|Portsmouth|West Chester'),
                  'location'] = 'Northeast United States'

dice.loc[dice['location'].str.contains('MI|OH|IN|IL|Il|WI|MN|IA|Ia|MO|Mo|KS|NE|ND|SD|Farmington|Lake County|Ashland'),
                  'location'] = 'Midwest United States'

dice.loc[dice['location'].str.contains('DE|MD|Md|DC|WV|VA|NC|SC|Anderson|KY|Ky|TN|GA|Ga|MS|AL|AR|OK|TX|Stafford|Austin|LA|La|FL|Boca Raton|South'),
                  'location'] = 'Southern United States'

dice.loc[dice['location'].str.contains('MT|WY|CO|Co|NM|AZ|UT|ID|Id|NV|CA|Ca|OR|WA|AK|HI|Hi|Pasadena|Redmond|Greenwood Village|San Francisco'),
                  'location'] = 'Western United States'

dice.loc[dice['location'].str.contains('Hyderabad|London|Bangalore|Dublin|Taguig City|ON|BC|Windsor'),
                  'location'] = 'International'

dice.loc[dice['location'].str.contains('STATE|City|Satellite Office; North America-us-il-chicago; Nor|United States Of America'),
                  'location'] = 'Unknown'
dice.head(3)

Unnamed: 0,date_added,job_description,job_title,job_type,location,organization,sector,word_count
0,11/11/2016,an EDI Analyst with experience please read on...,Analyst,"Full Time, Full-time, Employee",Northeast United States,CyberCoders,EDI TrustedLink AS VAN - EDI TrustedLink AS VAN,272
1,11/11/2016,Informatica ETL DeveloperSt Petersburg FL Only...,Developer,"Full Time, Full Time",Southern United States,TrustMinds,ETL Informatica B B Data Exchange Netezza Orac...,83
2,11/11/2016,pmayekar kanandcorp.com - - Sunnyvale CAANGULA...,Developer,"Full Time, Contract Corp-To-Corp, Contract Ind...",Western United States,K Anand Corporation,Angular,8


In [41]:
# Creating a list of top 4 locations
keep_locations = dice['location'].value_counts()[:4].index.tolist()

# Only keeping the top 4 locations from list keep_locations
dice = dice[dice['location'].isin(keep_locations)]

In [42]:
dice['location'].value_counts()

Southern United States     5278
Western United States      4485
Northeast United States    4214
Midwest United States      2763
Name: location, dtype: int64

## Cleaning job_type

In [43]:
# Droping 238 rows of data that are null for job_type.
dice.dropna(subset = ['job_type'], inplace=True)
dice.reset_index(drop=True, inplace=True)

### Catagorizing job_type

In [44]:
dice.loc[dice['job_type'].str.contains('Full Time|Full-time|per year|per'),
                  'job_type'] = 'Full Time'

dice.loc[dice['job_type'].str.contains('Contract|contract|C2H'),
                  'job_type'] = 'Contract'

dice.loc[dice['job_type'].str.contains('Part Time'),
                  'job_type'] = 'Part Time'

dice.loc[dice['job_type'].str.contains('Market related|Market|Negotiable'),
                  'job_type'] = 'Market Dependent'

dice.loc[dice['job_type'].str.contains('-|EXPERIENCE|RELOCATION|define'),
                  'job_type'] = 'Unknown'

dice.head(3)

Unnamed: 0,date_added,job_description,job_title,job_type,location,organization,sector,word_count
0,11/11/2016,an EDI Analyst with experience please read on...,Analyst,Full Time,Northeast United States,CyberCoders,EDI TrustedLink AS VAN - EDI TrustedLink AS VAN,272
1,11/11/2016,Informatica ETL DeveloperSt Petersburg FL Only...,Developer,Full Time,Southern United States,TrustMinds,ETL Informatica B B Data Exchange Netezza Orac...,83
2,11/11/2016,pmayekar kanandcorp.com - - Sunnyvale CAANGULA...,Developer,Full Time,Western United States,K Anand Corporation,Angular,8


In [45]:
dice['job_type'].value_counts()

Full Time           10279
Contract             6101
Market Dependent      126
Unknown                32
Part Time               9
Competitive             3
Name: job_type, dtype: int64

## Cleaning sector

In [46]:
# renaming sector to be more descriptive
dice=dice.rename(columns = {'sector':'skills'})

In [47]:
# lowercasing skills
dice['skills']= dice['skills'].str.lower()

### Removing commas, backslashes and extra spaces

In [48]:
# making sure no extra spaces. help from : https://stackoverflow.com/questions/43071415/remove-multiple-blanks-in-dataframe
dice['skills'] = dice['skills'].replace('\s+', ' ', regex=True)

### Looking at skills string length

In [49]:
# creating a new column for skills string length
dice['skills_len'] = dice['skills'].str.split().map(lambda x: len(x))

In [50]:
dice[dice['skills_len'] ==2].head(3)

Unnamed: 0,date_added,job_description,job_title,job_type,location,organization,skills,word_count,skills_len
18,11/12/2016,Senior Java DeveloperLocation: SunnyvaleStart ...,Developer,Contract,Western United States,TEKsystems,java developer,396,2
30,11/11/2016,"Client is looking for a ""AEM CQ Developer"" in ...",Developer,Contract,Northeast United States,Nutech Information Systems,aem cq,164,2
31,11/12/2016,As the Project Manager for and HR Solutions te...,Manager,Contract,Western United States,TEKsystems,project manager,497,2


In [51]:
dice[dice['skills_len'] ==3].head(3)

Unnamed: 0,date_added,job_description,job_title,job_type,location,organization,skills,word_count,skills_len
6,11/11/2016,Linux System Administrator opportunity- with i...,Administrator,Full Time,Northeast United States,Landover Assocates,linux system administrator,90,3
34,11/11/2016,The suitable candidate should be familier with...,Architect,Full Time,Southern United States,Sanrose Information Services Inc.,apex data solutions,231,3
39,11/12/2016,Must have at least year of professional Nativ...,Designer,Contract,Western United States,TEKsystems,mobile ux designer,367,3


We can see that many of the skills are actually just the title of the job or job type. These should be changed to an empty string.

### Cleaning skills of already described features

In [52]:
# if skill is just the job title replace with a empty string
dice.loc[(dice['skills'] ==  dice['job_title'].str.lower()), 'skills'] = ""

In [53]:
dice['skills'].value_counts().head(10)

see job description    145
contract w              71
full time               56
                        38
network engineer        30
.net developer          22
please refer to         22
project manager         22
business analyst        20
desktop support         18
Name: skills, dtype: int64

In [54]:
dice['skills'].fillna('', inplace=True)
mask = dice['skills'].map(lambda x: True if re.search('\saa\s', x) else False)
print(len(dice[mask]), 'in skills')

dice['job_description'].fillna('', inplace=True)
mask = dice['job_description'].map(lambda x: True if re.search('\saa\s', x) else False)
print(len(dice[mask]), 'in job_description')

2 in skills
4 in job_description


In [55]:
# changing additional non-skills to empty strings
dice.loc[(dice['skills'] == 'Telecommuting not available Travel not required'.lower()), 'skills'] = ''
dice.loc[(dice['skills'] == 'Full Time'.lower()), 'skills'] = ''
dice.loc[(dice['skills'] == 'Contract W2'.lower()), 'skills'] = ''
dice.loc[(dice['skills'] == 'tad pgs inc. specializes in delivering secure reliable and rapidly implemented workforce solutions to the u.s. federal marketplace including u.s. government agencies and their prime contractors. wi'), 'skills'] = ''
dice.loc[(dice['skills'] == 'see job description'), 'skills'] = ''
dice.loc[(dice['skills'] == 'please refer to job description'), 'skills'] = ''
dice.loc[(dice['skills'] == '(see job description)'), 'skills'] = ''
dice.loc[(dice['skills'] == 'refer to job description'), 'skills'] = ''

### Dealing with duplicate words

In [56]:
# removing duplicate words in a row
# help from:https://stackoverflow.com/questions/47316783/python-dataframe-remove-duplicate-words-in-the-same-cell-within-a-column-in-pyt
dice['skills'] = dice['skills'].str.replace(r'\b(\w+)(\s+\1)+\b', r'\1')

In [57]:
# some postings had repeat word after -
# mapping split_skills to whole skills row
dice['skills'] = dice['skills'].map(split_skills)

# Final DataFrame

In [58]:
dice.reset_index(drop=True, inplace=True)

In [59]:
dice

Unnamed: 0,date_added,job_description,job_title,job_type,location,organization,skills,word_count,skills_len
0,11/11/2016,an EDI Analyst with experience please read on...,Analyst,Full Time,Northeast United States,CyberCoders,edi trustedlink as van,272,9
1,11/11/2016,Informatica ETL DeveloperSt Petersburg FL Only...,Developer,Full Time,Southern United States,TrustMinds,etl informatica b data exchange netezza oracle...,83,9
2,11/11/2016,pmayekar kanandcorp.com - - Sunnyvale CAANGULA...,Developer,Full Time,Western United States,K Anand Corporation,angular,8,1
3,11/12/2016,This nationally recognized Microsoft Gold Part...,Manager,Full Time,Western United States,Nigel Frank International,microsoft dynamics ax project manager - toront...,585,11
4,11/11/2016,a .NET Developer with experience please read ...,Developer,Full Time,Northeast United States,CyberCoders,c asp.net sql javascript mvc,356,11
...,...,...,...,...,...,...,...,...,...
16545,12/9/2016,JPMorgan Chase & Co. (NYSE: JPM) is a leadin...,Developer,Full Time,Northeast United States,JPMorgan Chase,.net architecture developer development git ht...,852,26
16546,12/9/2016,Seeking Jr. Systems Administrators with experi...,Administrator,Contract,Midwest United States,TEKsystems,jr. linux administrator,353,3
16547,12/9/2016,a Senior Lead Devops Engineer with a desired ...,Developer,Full Time,Midwest United States,CyberCoders,amazon web services linux bash ruby python agile,323,17
16548,12/9/2016,Headquartered in downtown San Francisco CA we ...,Developer,Full Time,Western United States,CyberCoders,javascript react.js golang startup ror iot ana...,322,23


In [60]:
# exporting final dataframe to csv
dice.to_csv('../data/job_postings.csv', index=False)