In [47]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt

# For dealing with textual data
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [2]:
# Load the Glassdoor job postings CSV; the relative path is resolved against the notebook's working directory.
df = pd.read_csv('../Dataset/glassdoor_jobs.csv')

### Exploratory data analysis

In [3]:
# Show three randomly chosen rows to get a first impression of the columns and their formats
df.sample(3)

Unnamed: 0.1,Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
836,836,Data Engineer,$65K-$124K (Glassdoor est.),Title: Data Engineer\n\nLocation: Orange Count...,3.5,Alignment Healthcare\n3.5,"Orange, CA","Orange, CA",501 to 1000 employees,2013,Company - Private,Health Care Services & Hospitals,Health Care,Unknown / Non-Applicable,-1
949,949,"Principal, Data Science - Advanced Analytics",$86K-$137K (Glassdoor est.),IQVIA is the leading human data science compan...,3.6,IQVIA\n3.6,"Plymouth Meeting, PA","Durham, NC",10000+ employees,2017,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$2 to $5 billion (USD),"PPD, INC Research, PRA Health Sciences"
53,53,Data Engineer,-1,"Data Engineer\n£50,000 – £70,000 See Advert\n\...",4.5,Anson McCade\n4.5,"Kingdom, IL","London, United Kingdom",51 to 200 employees,2000,Company - Private,Staffing & Outsourcing,Business Services,$1 to $5 million (USD),-1


Let's list the things that I think need to be done, based on a first glance at the dataset:
- Remove the `Unnamed: 0` feature
- Extract the numerical salary values and create a new feature with the average salary value
- Extract the company name from the given format of {company/rating}

In [4]:
# Discard the two columns considered unnecessary: 'Unnamed: 0' and 'Competitors'
df.drop(columns=['Unnamed: 0', 'Competitors'], inplace=True)

In [5]:
# Collect the row labels whose salary estimate is quoted per hour
indices = [
    label
    for label, estimate in df['Salary Estimate'].items()
    if 'Per Hour' in estimate
]

# Remove those rows and renumber the remaining ones from 0
df = df.drop(index=indices).reset_index(drop=True)

In [24]:
# Row labels of salary estimates that contain 'Provided'
# (e.g. "Employer Provided Salary:$200K-$250K"), kept as a list and as a set.
diff_format_indices = [
    label
    for label, estimate in df['Salary Estimate'].items()
    if 'Provided' in estimate
]

set_diff_format_indices = set(diff_format_indices)

In [38]:
# Build the 'Avg_salary' feature: the midpoint of the salary range given in
# 'Salary Estimate' (figures are written as "$<n>K").
#
# Every priced row carries a "$<low>K-$<high>K" range, but the text around it varies:
#   "$65K-$124K (Glassdoor est.)"
#   "$15K-$16K(Employer est.)"               (no space before the parenthesis)
#   "Employer Provided Salary:$200K-$250K"
# The range itself is matched with a regular expression so that all layouts are
# handled. Slicing fixed character positions is not safe here: without the space
# before '(' the upper bound "$16K" would lose a digit and be read as 1.
salary_text = df['Salary Estimate']
salary_bounds = salary_text.str.extract(r'\$(\d+)K\s*-\s*\$(\d+)K').astype(float)

# Rows without an estimate hold the string '-1'. Any other value that does not
# contain a range is unexpected, so stop with a clear message instead of guessing.
no_estimate = salary_text == str(-1)
unparsed = salary_bounds.isna().any(axis=1) & ~no_estimate
if unparsed.any():
    raise ValueError(
        f"Unrecognised 'Salary Estimate' values: {salary_text[unparsed].unique().tolist()}"
    )

# Midpoint of lower and upper bound; rows without an estimate keep the -1 sentinel.
avg_salary = salary_bounds.mean(axis=1).where(~no_estimate, -1)

# Adding a new feature
df['Avg_salary'] = avg_salary
df.drop(['Salary Estimate'],axis=1,inplace=True)

In [41]:
# 'Company Name' values look like "<name>\n<rating>"; keep only the part before the first line break.
company_name = [raw_name.split('\n')[0] for raw_name in df['Company Name']]

# Store the cleaned name as a new column and discard the combined one
df['company_name'] = company_name
df.drop(columns=['Company Name'], inplace=True)

In [49]:
# Translation table used by process_jd.
# - ASCII punctuation and typographic quotes are deleted, so "Bachelor’s" is
#   treated exactly like "Bachelor's".
# - Typographic dashes, the ellipsis and the bullet are replaced by a space so
#   they neither survive as tokens nor glue two neighbouring words together.
# string.punctuation alone only covers ASCII characters.
_JD_TRANSLATION = str.maketrans('', '', string.punctuation + '‘’“”')
_JD_TRANSLATION.update(str.maketrans({char: ' ' for char in '–—…•'}))

# Filled on first use so stopwords.words('english') is evaluated once,
# not once per job description.
_JD_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, building it on the first call only."""
    if 'english' not in _JD_STOP_WORDS_CACHE:
        _JD_STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _JD_STOP_WORDS_CACHE['english']


def process_jd(text):
    """
    Clean and preprocess a job description text.

    Parameters:
        text (str): The job description text to be processed.

    Returns:
        str: The cleaned tokens joined by single spaces.

    Steps:
        1. Replaces newline characters with spaces.
        2. Converts the text to lowercase.
        3. Removes punctuation (ASCII punctuation and typographic quotes are
           deleted; typographic dashes, ellipsis and bullets become spaces).
        4. Tokenizes the text and removes English stop words.
        5. Lemmatizes the remaining words.

    Notes:
        Relies on the notebook-level ``lemmatizer`` (WordNetLemmatizer) and on
        the NLTK ``stopwords`` / ``word_tokenize`` imports.
    """
    # Newlines become spaces first so words on adjacent lines stay separate
    text_lower = text.replace('\n', ' ').lower()

    # Removing punctuation
    text_no_punctuation = text_lower.translate(_JD_TRANSLATION)

    # Removing stop words
    stop_words = _english_stop_words()
    word_tokens = word_tokenize(text_no_punctuation)
    filtered_text = [word for word in word_tokens if word not in stop_words]

    # Lemmatization
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_text]

    return ' '.join(lemmatized_text)

In [51]:
# Run process_jd over every job description, keep the result as 'jd',
# then discard the raw text column.
df['jd'] = df['Job Description'].apply(process_jd)
df.drop(columns=['Job Description'], inplace=True)

In [55]:
# Inspect the first three rows of the frame after the preprocessing cells above
df.head(3)

Unnamed: 0,Job Title,Rating,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Avg_salary,company_name,jd
0,Data Scientist,3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),-1,72.0,Tecolote Research,data scientist location albuquerque nm educati...
1,Healthcare Data Scientist,3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1,87.5,University of Maryland Medical System,general summary healthcare data scientist posi...
2,Data Scientist,4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,Security Services,Business Services,$100 to $500 million (USD),-1,85.0,KnowBe4,knowbe4 inc high growth information security c...


In [44]:
# Sample job description used to try out the newline-to-space replacement
text = '''Data Scientist\nLocation: Albuquerque, NM\nEducation Required: Bachelor’s degree required, preferably in math, engineering, business, or the sciences.\nSkills Required:\nBachelor’s Degree in relevant field, e.g., math, data analysis, database, computer science, Artificial Intelligence (AI); three years’ experience credit for Master’s degree; five years’ experience credit for a Ph.D\nApplicant should be proficient in the use of Power BI, Tableau, Python, MATLAB, Microsoft Word, PowerPoint, Excel, and working knowledge of MS Access, LMS, SAS, data visualization tools, and have a strong algorithmic aptitude\nExcellent verbal and written communication skills, and quantitative analytical skills are required\nApplicant must be able to work in a team environment\nU.S. citizenship and ability to obtain a DoD Secret Clearance required\nResponsibilities: The applicant will be responsible for formulating analytical solutions to complex data problems; creating data analytic models to improve data metrics; analyzing customer behavior and trends; delivering insights to stakeholders, as well as designing and crafting reports, dashboards, models, and algorithms to make data insights actionable; selecting features, building and optimizing classifiers using machine learning techniques; data mining using state-of-the-art methods, extending organization’s data with third party sources of information when needed; enhancing data collection procedures to include information that is relevant for building analytic systems; processing, cleansing, and verifying the integrity of data used for analysis; doing ad-hoc analysis and presenting results in a clear manner; and creating automated anomaly detection systems and constant tracking of its performance.\nBenefits:\nWe offer competitive salaries commensurate with education and experience. 
We have an excellent benefits package that includes:\nComprehensive health, dental, life, long and short term disability insurance\n100% Company funded Retirement Plans\nGenerous vacation, holiday and sick pay plans\nTuition assistance\n\nBenefits are provided to employees regularly working a minimum of 30 hours per week.\n\nTecolote Research is a private, employee-owned corporation where people are our primary resource. Our investments in technology and training give our employees the tools to ensure our clients are provided the solutions they need, and our very high employee retention rate and stable workforce is an added value to our customers. Apply now to connect with a company that invests in you.'''

# Swap every line break for a single space so the description becomes one line
text = ' '.join(text.split('\n'))

print(text)

Data Scientist Location: Albuquerque, NM Education Required: Bachelor’s degree required, preferably in math, engineering, business, or the sciences. Skills Required: Bachelor’s Degree in relevant field, e.g., math, data analysis, database, computer science, Artificial Intelligence (AI); three years’ experience credit for Master’s degree; five years’ experience credit for a Ph.D Applicant should be proficient in the use of Power BI, Tableau, Python, MATLAB, Microsoft Word, PowerPoint, Excel, and working knowledge of MS Access, LMS, SAS, data visualization tools, and have a strong algorithmic aptitude Excellent verbal and written communication skills, and quantitative analytical skills are required Applicant must be able to work in a team environment U.S. citizenship and ability to obtain a DoD Secret Clearance required Responsibilities: The applicant will be responsible for formulating analytical solutions to complex data problems; creating data analytic models to improve data metrics; 

'Data Scientist\nLocation: Albuquerque, NM\nEducation Required: Bachelor’s degree required, preferably in math, engineering, business, or the sciences.\nSkills Required:\nBachelor’s Degree in relevant field, e.g., math, data analysis, database, computer science, Artificial Intelligence (AI); three years’ experience credit for Master’s degree; five years’ experience credit for a Ph.D\nApplicant should be proficient in the use of Power BI, Tableau, Python, MATLAB, Microsoft Word, PowerPoint, Excel, and working knowledge of MS Access, LMS, SAS, data visualization tools, and have a strong algorithmic aptitude\nExcellent verbal and written communication skills, and quantitative analytical skills are required\nApplicant must be able to work in a team environment\nU.S. citizenship and ability to obtain a DoD Secret Clearance required\nResponsibilities: The applicant will be responsible for formulating analytical solutions to complex data problems; creating data analytic models to improve data

In [42]:
# NOTE(review): leftover inspection cell. Its saved output still shows 'Job Description'
# and no 'jd' column, so it was run before the job descriptions were cleaned; on a
# top-to-bottom run it displays the processed frame instead.
df.head(3)

Unnamed: 0,Job Title,Job Description,Rating,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Avg_salary,company_name
0,Data Scientist,"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),-1,72.0,Tecolote Research
1,Healthcare Data Scientist,What You Will Do:\n\nI. General Summary\n\nThe...,3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1,87.5,University of Maryland Medical System
2,Data Scientist,"KnowBe4, Inc. is a high growth information sec...",4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,Security Services,Business Services,$100 to $500 million (USD),-1,85.0,KnowBe4


In [10]:
# NOTE(review): 'Salary Estimate' is dropped once 'Avg_salary' has been created, so in
# this position the cell raises KeyError on a top-to-bottom run. Move it above the
# salary-parsing cell or delete it.
df['Salary Estimate'].value_counts()

Salary Estimate
-1                                  214
$86K-$143K (Glassdoor est.)           6
$54K-$115K (Glassdoor est.)           6
$49K-$113K (Glassdoor est.)           6
$21-$34 Per Hour(Glassdoor est.)      6
                                   ... 
$74K-$119K (Glassdoor est.)           1
$55K-$97K (Glassdoor est.)            1
$15K-$16K(Employer est.)              1
$61K-$106K (Glassdoor est.)           1
$62K-$113K (Glassdoor est.)           1
Name: count, Length: 417, dtype: int64

In [9]:
# NOTE(review): leftover inspection cell. Its saved output predates the feature extraction
# above ('Salary Estimate' and 'Company Name' are still present in it).
df.sample(3)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
563,Software Engineer - Data Visualization,$60K-$127K (Glassdoor est.),Join ClearEdge and be a part of the team of me...,4.0,ClearEdge\n4.0,"Annapolis Junction, MD","Annapolis Junction, MD",51 to 200 employees,2002,Company - Private,Computer Hardware & Software,Information Technology,$5 to $10 million (USD),-1
331,Principal Data Scientist with over 10 years ex...,Employer Provided Salary:$200K-$250K,Position Title: Principal Data Scientist\nLoca...,-1.0,CA-One Tech Cloud,"San Francisco, CA","Fremont, CA",51 to 200 employees,2017,Company - Private,IT Services,Information Technology,$5 to $10 million (USD),-1
615,Pharmacovigilance Scientist (Senior Pharmacovi...,-1,We are looking for enthusiastic and talented i...,4.6,Greenwich Biosciences\n4.6,"Carlsbad, CA","Carlsbad, CA",201 to 500 employees,2013,Subsidiary or Business Segment,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,Unknown / Non-Applicable,-1
