# Continuing our original efforts... and starting something new
Since we ran into some issues with the Selenium web scraping (AKA they figured out that I was using a bot), let's just use this dataset (https://www.kaggle.com/andrewmvd/data-scientist-jobs) to continue our analysis. It may seem unfortunate that our web scraping efforts have come to a halt, but I am partially satisfied knowing that I now have a working understanding of Selenium and will still be able to conclude my data science job description analysis.

In [336]:
import os
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.corpus import stopwords

In [337]:
# Columns kept for the time being: source CSV header -> short snake_case name.
# Both lists are derived from this one mapping so they always stay in step.
column_names = {
    'Job Title': 'title',
    'Company Name': 'company',
    'Location': 'location',
    'Salary Estimate': 'salary_estimate',
    'Job Description': 'description',
}

# headers to read from the CSV, in the order the notebook expects them
usecols = list(column_names.keys())

# replacement names, position-for-position with `usecols`
rename = list(column_names.values())

In [338]:
# lets grab this csv from my data folder
# The default below is the original machine-specific location; set the
# JOBS_CSV_PATH environment variable to run the notebook on another machine
# without editing this cell.
path = os.environ.get(
    "JOBS_CSV_PATH",
    "C://Users//voyno//Desktop\\indeed-jobs\\data\\data_scientist_jobs.csv",
)

# read_csv ignores the element order of `usecols` and returns columns in file
# order, so re-select with [usecols] to pin the order that the positional
# `df.columns = rename` assignment relies on.
df = pd.read_csv(path, usecols=usecols)[usecols]

# show current columns
print("Remaining Columns:")
for i, col in enumerate(df.columns):
    print(f'{i:4}. {col}')

Remaining Columns:
   0. Job Title
   1. Company Name
   2. Location
   3. Salary Estimate
   4. Job Description


In [339]:
# Swap the CSV headers for the short names. The assignment is positional, so
# `rename` must list its entries in the same order as the frame's columns.
df.columns = rename

# Echo the resulting header, one numbered line per column.
print("New Column names:")
print("\n".join(f'{i:4}. {col}' for i, col in enumerate(df.columns)))

New Column names:
   0. title
   1. company
   2. location
   3. salary_estimate
   4. description


In [340]:
# peek at the first rows to see the raw values under the new column names
df.head()

Unnamed: 0,title,company,location,salary_estimate,description
0,Senior Data Scientist,Hopper\n3.5,"New York, NY",$111K-$181K (Glassdoor est.),"ABOUT HOPPER\n\nAt Hopper, we’re on a mission ..."
1,"Data Scientist, Product Analytics",Noom US\n4.5,"New York, NY",$111K-$181K (Glassdoor est.),"At Noom, we use scientifically proven methods ..."
2,Data Science Manager,Decode_M,"New York, NY",$111K-$181K (Glassdoor est.),Decode_M\n\nhttps://www.decode-m.com/\n\nData ...
3,Data Analyst,Sapphire Digital\n3.4,"Lyndhurst, NJ",$111K-$181K (Glassdoor est.),Sapphire Digital seeks a dynamic and driven mi...
4,"Director, Data Science",United Entertainment Group\n3.4,"New York, NY",$111K-$181K (Glassdoor est.),"Director, Data Science - (200537)\nDescription..."


In [341]:
# first we will fix company field

# The raw value can carry a trailing number on a second line (e.g.
# "Hopper\n3.5", presumably the company rating); keep only the first line.
# The .str accessor is vectorised and passes a missing company through as NaN
# instead of raising AttributeError on a non-string value.
df["company"] = df["company"].str.split("\n").str[0]

In [342]:
# then we will fix salary estimate values

# take note of which salaries are hourly
# (must happen before the "Per Hour" marker is stripped from the text below)
df["hourly_salary"] = df["salary_estimate"].apply(lambda x: 1 if "Per Hour" in x else 0)

# strip currency symbols, the "K" suffix and the estimate-source labels so only
# the "low-high" range remains, then reduce the range to its midpoint
remove_strings = ["K", "$", " (Glassdoor est.)", "(Employer est.)", "Per Hour(Glassdoor est.)"]
for item in remove_strings:
    df["salary_estimate"] = df["salary_estimate"].apply(lambda x: x.replace(item, ""))
df["salary_estimate"] = df["salary_estimate"].apply(lambda x: np.mean(list(map(int, x.split("-")))))

# update hourly salaries to be consistent with yearly data:
# dollars/hour * 40 hours/week * 52 weeks/year, expressed in thousands like the
# yearly rows. The rate to scale is the parsed salary_estimate itself; the
# hourly_salary column is only a 0/1 flag, so scaling it would set every hourly
# posting to 2.08.
condition = df["hourly_salary"] == 1
df.loc[condition, "salary_estimate"] = df.loc[condition, "salary_estimate"] * 40 * 52 / 1000
df.drop(["hourly_salary"], axis=1, inplace=True)

# truncate to whole thousands of dollars per year
df['salary_estimate'] = df['salary_estimate'].astype('int')

In [343]:
# check the company and salary_estimate columns after the fixes above
df.head()

Unnamed: 0,title,company,location,salary_estimate,description
0,Senior Data Scientist,Hopper,"New York, NY",146,"ABOUT HOPPER\n\nAt Hopper, we’re on a mission ..."
1,"Data Scientist, Product Analytics",Noom US,"New York, NY",146,"At Noom, we use scientifically proven methods ..."
2,Data Science Manager,Decode_M,"New York, NY",146,Decode_M\n\nhttps://www.decode-m.com/\n\nData ...
3,Data Analyst,Sapphire Digital,"Lyndhurst, NJ",146,Sapphire Digital seeks a dynamic and driven mi...
4,"Director, Data Science",United Entertainment Group,"New York, NY",146,"Director, Data Science - (200537)\nDescription..."


In [344]:
def clean_text(text, stop_words=None):
    """Normalise a piece of free text for word-frequency analysis.

    Steps, in order:
        - convert all characters to lower case
        - split the text on whitespace
        - drop stop words
        - drop words containing any non-alphanumeric character (so a word
          with attached punctuation, e.g. "scientist,", is removed entirely)
        - join the remaining words with a single " " character

    Parameters
    ----------
    text : str
        The text to clean.
    stop_words : iterable of str, optional
        Lower-case words to discard. Defaults to the NLTK English stop-word
        list when omitted.

    Returns
    -------
    str
        The surviving words separated by single spaces ("" if none survive).
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests; scanning the stop-word list
    # once per word is needlessly slow on long descriptions.
    stop_words = set(stop_words)

    words = text.lower().split()
    kept = [word for word in words if word not in stop_words and word.isalnum()]

    return " ".join(kept)

In [345]:
# Normalise both free-text columns. Series.apply calls clean_text once per
# value and keeps the result aligned to the index, avoiding a df.loc scalar
# lookup per row (slow, and it yields a Series rather than a string when the
# index has duplicate labels).
df["title"] = df["title"].apply(clean_text)
df["description"] = df["description"].apply(clean_text)

In [346]:
# Pool every cleaned description into one flat list of tokens.
text = " ".join(df["description"].values).split()

# Tally occurrences per token; keys are inserted in first-seen order.
word_frequencies = {}
for word in text:
    word_frequencies[word] = word_frequencies.get(word, 0) + 1

In [347]:
# Turn the tally into [word, count] rows, load them into a two-column frame,
# and display the ten most frequent words.
word_freq_set = [[token, count] for token, count in word_frequencies.items()]
word_freq_df = pd.DataFrame(word_freq_set, columns=["word", "frequency"])
word_freq_df.sort_values(by="frequency", ascending=False).head(10)

Unnamed: 0,word,frequency
11,data,33037
101,experience,18306
168,work,9297
79,business,8883
73,team,5905
43,years,5372
166,ability,5161
164,skills,4784
77,development,4493
276,knowledge,4377


In [358]:
# preview the frame after the title and description columns went through clean_text
df.head()

Unnamed: 0,title,company,location,salary_estimate,description
0,senior data scientist,Hopper,"New York, NY",146,hopper mission make booking travel leveraging ...
1,data product analytics,Noom US,"New York, NY",146,use scientifically proven methods help users c...
2,data science manager,Decode_M,"New York, NY",146,data science manager job description hiring da...
3,data analyst,Sapphire Digital,"Lyndhurst, NJ",146,sapphire digital seeks dynamic driven data joi...
4,data science,United Entertainment Group,"New York, NY",146,data science description edelman intelligence ...


## Alright, now our data is clean (enough for now)!