In [2]:
import re

import pandas as pd

In [3]:
# Load the raw job postings; the cells below add derived columns to this frame.
raw_csv_path = 'glassdoor_jobs.csv'
df = pd.read_csv(raw_csv_path)

In [4]:
# Derive min / max / average salary from the 'Salary Estimate' text.
# Step 1: keep only the text in front of the first '(' (the whole string if there is none).
salary = df['Salary Estimate'].map(lambda text: text.partition('(')[0])
# Step 2: drop every 'K' and '$' character so only "<low>-<high>" is left.
strip_k_and_dollar = str.maketrans('', '', 'K$')
minus_kd = salary.map(lambda text: text.translate(strip_k_and_dollar))
# Step 3: split once on '-' and convert each side to an integer.
salary_bounds = minus_kd.map(lambda text: text.split('-'))
df['min_salary'] = salary_bounds.map(lambda parts: int(parts[0]))
df['max_salary'] = salary_bounds.map(lambda parts: int(parts[1]))
# Midpoint of the two bounds.
df['avg_salary'] = df['min_salary'].add(df['max_salary']).div(2)

In [5]:
# Company name without its trailing rating text: rows whose Rating is -1 keep
# the name unchanged, every other row has its last four characters dropped
# (presumably the rating appended to the name -- confirm against the raw data).
df['company_txt'] = [
    company if rating == -1 else company[:-4]
    for company, rating in zip(df['Company Name'], df['Rating'])
]

In [6]:
# Parse the job's state from Location.
# The state is taken as the text after the LAST comma, with surrounding
# whitespace stripped. This keeps "City, ST" -> "ST" and also handles
# multi-part values ("City, County, ST" -> "ST") and a comma that is not
# followed by a space, which previously raised IndexError because the guard
# looked for ',' while the split used ', '.
# Locations without any comma are labelled 'na'.
df['job_state'] = df['Location'].apply(
    lambda x: x.rsplit(',', 1)[1].strip() if ',' in x else 'na'
)
df.job_state.value_counts()

CA    352
VA    196
NY    165
IL    147
na    140
WA    129
NJ    124
MA     93
CT     78
DC     73
MD     71
OH     55
CO     51
FL     50
GA     46
NC     45
TX     43
MN     42
TN     12
MO     10
AL     10
WI      9
MI      8
UT      7
PA      7
AZ      7
OR      6
RI      4
SC      4
NH      4
IN      3
HI      3
AR      2
IA      1
KS      1
NM      1
WV      1
Name: job_state, dtype: int64

In [7]:
# same_state: 1 when the Location text equals the Headquarters text, else 0.
# NOTE(review): this compares the two full strings, not just their state
# portion -- confirm that is the intended meaning of "same_state".
df['same_state'] = df['Location'].eq(df['Headquarters']).astype('int64')

In [8]:
# Company age relative to the year 2020.
# A Founded value of -1 is passed through unchanged as -1.
founded = df['Founded']
df['age'] = founded.where(founded == -1, 2020 - founded)

In [24]:
# Parsing skills and perks from the job description (1 = mentioned, 0 = not).
# Each entry maps an output column to a case-insensitive regular expression.
# Plain keywords match anywhere in the text. The short tokens 'r', 'aws',
# 'sas' and 'excel' are matched as whole words only, because:
#   - a bare 'excel' substring also fires on "excellent",
#   - a bare 'sas' substring also fires on "disaster",
#   - ' r ' / ' aws ' with literal spaces miss mentions next to punctuation
#     or at the start/end of the text ("R, Python", "(AWS)").
skill_patterns = {
    'python': r'python',
    'r': r'(?<![\w&])r(?![\w&])',  # standalone "R"; the '&' guards skip "R&D"
    'sql': r'sql',
    'spark': r'spark',
    'aws': r'\baws\b',
    'hadoop': r'hadoop',
    'apache': r'apache',
    'sas': r'\bsas\b',
    'excel': r'\bexcel\b',
    'paid_vac': r'paid vacation|paid holiday',
    'ab': r'a/b testing',
    'agile': r'agile',
    'phd': r'phd',
}
for column, pattern in skill_patterns.items():
    # na=False: a missing description counts as "not mentioned" instead of raising.
    df[column] = (
        df['Job Description']
        .str.contains(pattern, case=False, regex=True, na=False)
        .astype('int64')
    )

In [25]:
def title_simplifier(title):
    """Collapse a raw job title into one coarse role category.

    Matching is case-insensitive. Categories are checked in priority order
    and the first one that matches wins:
    'manager', 'analyst', 'data engineer', 'mle/ai', 'data scientist'.

    Args:
        title: Raw job title string.

    Returns:
        One of 'manager', 'analyst', 'data engineer', 'mle/ai',
        'data scientist', or 'na' when nothing matches.
    """
    lowered = title.lower()

    def mentions(*keywords):
        # True when any keyword occurs as a substring of the lowered title.
        return any(keyword in lowered for keyword in keywords)

    if mentions('manager', 'chief', 'director', 'vp', 'principal'):
        return 'manager'
    if mentions('analyst'):
        return 'analyst'
    if mentions('data engineer'):
        return 'data engineer'
    # 'ai' has to be a standalone word: as a bare substring it also fires on
    # words such as "retail" or "maintenance", which mislabels titles like
    # "Data Scientist, Retail" as 'mle/ai'.
    if (mentions('machine learning', 'artificial intelligence', 'deep')
            or re.search(r'\bai\b', lowered)):
        return 'mle/ai'
    if mentions('data scientist'):
        return 'data scientist'
    return 'na'

def seniority(title):
    """Classify a job title as 'senior', 'jr' or 'na'.

    The title is lowercased and searched for substrings. Senior markers are
    checked first, so a title carrying both kinds of marker is 'senior'.

    Args:
        title: Raw job title string.

    Returns:
        'senior', 'jr', or 'na' when no marker is found.
    """
    lowered = title.lower()
    senior_markers = ('sr', 'senior', 'manager', 'lead', 'chief', 'director', 'vp', 'principal')
    junior_markers = ('jr', 'junior', 'early')

    if any(marker in lowered for marker in senior_markers):
        return 'senior'
    if any(marker in lowered for marker in junior_markers):
        return 'jr'
    return 'na'

In [26]:
# Coarse role category for every posting, followed by the category frequencies.
df['job_simp'] = df['Job Title'].map(title_simplifier)
df['job_simp'].value_counts()

data scientist    1897
mle/ai              59
manager             38
analyst              5
data engineer        1
Name: job_simp, dtype: int64

In [27]:
# Seniority label for every posting, followed by the label frequencies.
df['seniority'] = df['Job Title'].map(seniority)
df['seniority'].value_counts()

na        1308
senior     690
jr           2
Name: seniority, dtype: int64

In [28]:
# Number of missing values in each column.
df.isna().sum()

Job Title            0
Salary Estimate      0
Job Description      0
Rating               0
Company Name         0
Location             0
Headquarters         0
Size                 0
Founded              0
Type of ownership    0
Industry             0
Sector               0
Revenue              0
Competitors          0
min_salary           0
max_salary           0
avg_salary           0
company_txt          0
job_state            0
same_state           0
age                  0
python               0
r                    0
spark                0
aws                  0
hadoop               0
apache               0
sas                  0
excel                0
job_simp             0
seniority            0
paid_vac             0
ab                   0
agile                0
phd                  0
sql                  0
dtype: int64

In [29]:
# Replace any missing value, in every column, with 0.
# NOTE(review): this would also put the integer 0 into text columns, which the
# string operations in later cells (len / split) cannot handle -- confirm this
# is only meant as a safety net for numeric columns.
df = df.fillna(value=0)

In [30]:
# Length of each job description, as given by len().
df['desc_leng'] = df['Job Description'].map(len)

In [31]:
# Number of comma-separated entries in Competitors; the value '-1' counts as 0.
df['num_comp'] = df['Competitors'].map(
    lambda names: 0 if names == '-1' else names.count(',') + 1
)

In [32]:
# Save the cleaned dataset as CSV, leaving the row index out of the file.
cleaned_csv_path = 'glassdoor_jobs_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)