# Continuing our original efforts... and starting something new
Since we ran into some issues with the selenium web scraping (AKA they figured out that I was using a bot), let's just use this dataset (https://www.kaggle.com/andrewmvd/data-scientist-jobs) to continue our analysis. It may seem unfortunate that our web scraping efforts have come to a halt, but I am partially satisfied knowing that I now have a working understanding of selenium and will still be able to conclude my data science job description analysis.

In [11]:
import os
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.corpus import stopwords

In [12]:
# desired columns for the time being, paired as (name in the csv, name we use)
column_pairs = [
    ('Job Title', 'title'),
    ('Company Name', 'company'),
    ('Location', 'location'),
    ('Salary Estimate', 'salary_estimate'),
    ('Job Description', 'description'),
]

# names to read from the csv, in the order we want them
usecols = [csv_name for csv_name, short_name in column_pairs]

# replacement names, position-for-position with usecols
rename = [short_name for csv_name, short_name in column_pairs]

# nltk's English stop-word list, consulted by clean_text when filtering words
stop_words = stopwords.words("english")

In [13]:
# lets grab this csv from my data folder
path = "C://Users//voyno//Desktop\\indeed-jobs\\data\\data_scientist_jobs.csv"

# usecols limits which columns are parsed; the selection afterwards puts them
# in the same order as the usecols list
df = pd.read_csv(path, usecols=usecols).loc[:, usecols]

# show current columns, one numbered line per column
print("Remaining Columns:")
print(*(f'{i:4}. {col}' for i, col in enumerate(df.columns)), sep="\n")

Remaining Columns:
   0. Job Title
   1. Company Name
   2. Location
   3. Salary Estimate
   4. Job Description


In [14]:
# update column names (rename lines up position-for-position with the columns)
df.columns = rename

# list the new names, one numbered line per column
print("New Column names:")
print(*(f'{i:4}. {col}' for i, col in enumerate(df.columns)), sep="\n")

New Column names:
   0. title
   1. company
   2. location
   3. salary_estimate
   4. description


In [15]:
# preview the first three rows of the renamed frame
df.head(n=3)

Unnamed: 0,title,company,location,salary_estimate,description
0,Senior Data Scientist,Hopper\n3.5,"New York, NY",$111K-$181K (Glassdoor est.),"ABOUT HOPPER\n\nAt Hopper, we’re on a mission ..."
1,"Data Scientist, Product Analytics",Noom US\n4.5,"New York, NY",$111K-$181K (Glassdoor est.),"At Noom, we use scientifically proven methods ..."
2,Data Science Manager,Decode_M,"New York, NY",$111K-$181K (Glassdoor est.),Decode_M\n\nhttps://www.decode-m.com/\n\nData ...


In [16]:
# first we will fix company field
# some values carry extra text after a newline (e.g. "Hopper\n3.5" in the
# preview above); keep only what comes before the first newline
df["company"] = df["company"].map(lambda raw_name: raw_name.partition("\n")[0])

In [17]:
# then we will fix salary estimate values

# column encoding hourly salary (1 = estimate quoted "Per Hour", 0 = otherwise)
df["hourly_salary"] = df["salary_estimate"].apply(lambda x: 1 if "Per Hour" in x else 0)

# remove strings so that only the "low-high" range is left
remove_strings = ["K", "$", " (Glassdoor est.)", "(Employer est.)", "Per Hour(Glassdoor est.)"]
for item in remove_strings:
    df["salary_estimate"] = df["salary_estimate"].apply(lambda x: x.replace(item, ""))

# get mean of salary estimate (midpoint of the low and high bounds)
df["salary_estimate"] = df["salary_estimate"].apply(lambda x: np.mean(list(map(int, x.split("-")))))

# update hourly salaries to be consistent with yearly data:
# hourly rate * 40 hours/week * 52 weeks/year, divided by 1000 because the
# yearly figures had their "K" suffix stripped above.
# The conversion must read the parsed rate in "salary_estimate"; reading the
# 0/1 "hourly_salary" flag would turn every hourly job into 1 * 40 * 52 / 1000.
condition = df["hourly_salary"] == 1
df.loc[condition, "salary_estimate"] = df.loc[condition, "salary_estimate"] * 40 * 52 / 1000
df.drop(["hourly_salary"], axis=1, inplace=True)

# set salary type as int (truncates any fractional part)
df['salary_estimate'] = df['salary_estimate'].astype('int')

In [18]:
def clean_text(text):
    """Reduce free text to a space-separated string of plain ASCII words.

    Given a sequence of text:
        - convert all characters to lower case
        - split text on whitespace
        - remove words found in the module-level ``stop_words`` list
        - remove words containing non alphanumeric values
        - strip non-ASCII characters from the remaining words
        - drop words left empty by that stripping
        - join with " " character
        - return remaining data

    Args:
        text: the string to clean.

    Returns:
        The cleaned string; "" when no word survives the filters.
    """
    # set membership instead of scanning the stop-word list once per word
    blocked = set(stop_words)

    kept = []
    for word in text.lower().split():
        if word in blocked or not word.isalnum():
            continue
        # str.isalnum() also accepts non-ASCII letters/digits, so those
        # characters are stripped only after the filter has run
        ascii_word = word.encode("ascii", errors="ignore").decode()
        # a word made solely of non-ASCII characters is now "" and would leave
        # a doubled (or leading/trailing) space in the joined result
        if ascii_word:
            kept.append(ascii_word)

    return " ".join(kept)

In [19]:
# clean titles and descriptions, one text column at a time
for text_column in ("title", "description"):
    df[text_column] = [clean_text(value) for value in df[text_column]]

In [26]:
# report the frame's shape before and after discarding rows with missing values
shape_before = df.shape
print(shape_before)
df.dropna(inplace=True)
shape_after = df.shape
print(shape_after)

# write the cleaned frame to disk without the index column
df.to_csv('data/cleaned_jobs.csv', index=False)

(3909, 5)
(3909, 5)


## Alright, now our data is clean! (or at least clean enough for now)

In [33]:
# positions of rows whose title is missing
np.where(df["title"].isna())

(array([], dtype=int64),)