# Data science job market data

In [1]:
import re

import numpy as np
import pandas as pd

In [24]:
# Input CSV paths (relative, so they resolve against the working directory).
ds = 'data-scientist-job-market.csv'  # read into ds_df
ln = 'linkedin_data.csv'  # read into ln_df

In [None]:
# Load both raw CSVs, decoding them as ISO-8859-16.
# NOTE(review): the previews below contain sequences such as "Bachelorâs",
# which looks like UTF-8 text decoded with a single-byte codec — confirm the
# real encoding of the source files before trusting the text columns.
ds_df = pd.read_csv(ds,encoding = "ISO-8859-16")
ln_df = pd.read_csv(ln,encoding = "ISO-8859-16")

In [4]:
# Preview the first five rows of the job postings
ds_df.head()

Unnamed: 0,position,company,description,reviews,location
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,,"Atlanta, GA 30301"
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",,"Atlanta, GA"
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",,"Atlanta, GA"
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,44.0,"Atlanta, GA 30303"
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,550.0,"Atlanta, GA"


In [5]:
# Summarise each column's dtype and non-null count to spot missing values
ds_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6964 entries, 0 to 6963
Data columns (total 5 columns):
position       6953 non-null object
company        6953 non-null object
description    6953 non-null object
reviews        5326 non-null float64
location       6953 non-null object
dtypes: float64(1), object(4)
memory usage: 272.1+ KB


In [6]:
# Number of locations that contain at least one digit, i.e. that carry a zip code.
# Missing locations count as "no digit" (na=False).
has_digit = ds_df['location'].str.contains('0|1|2|3|4|5|6|7|8|9', na = False)
ds_df.loc[has_digit, 'location'].size

2143

In [7]:
# Drop rows without a location. dropna() returns a new frame rather than
# modifying ds_df, so the result has to be assigned back; otherwise the NaN
# rows stay in ds_df for every later cell. copy() detaches the result so that
# columns can be added later without a SettingWithCopyWarning.
ds_df = ds_df.dropna(subset=["location"]).copy()

# Show a short preview instead of dumping the whole frame
ds_df.head(10)

Unnamed: 0,position,company,description,reviews,location
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,,"Atlanta, GA 30301"
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",,"Atlanta, GA"
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",,"Atlanta, GA"
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,44.0,"Atlanta, GA 30303"
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,550.0,"Atlanta, GA"
5,Manager of Data Engineering,McKinsey & Company,Qualifications\nBachelorâs degree in Compute...,385.0,"Atlanta, GA 30318"
6,"Product Specialist - Periscope, New Ventures",McKinsey & Company,Qualifications\nBachelorâs degree\n5-7 years...,385.0,"Atlanta, GA 30318"
7,"Junior to Mid-level Engineer, Geologist or Env...",Wood,Overview / Responsibilities\nWood Environment ...,899.0,"Atlanta, GA"
8,Analyst - CIB Credit Research,SunTrust,Works closely with senior CIB professionals. P...,3343.0,"Atlanta, GA"
9,Senior Associate - Cognitive Data Scientist Na...,KPMG,Known for being a great place to work and buil...,4494.0,"Atlanta, GA 30338"


In [8]:
# Get rid of zip code, leaving only city and state
def _split_city_state(location):
    """Return ``(city, state)`` parsed from a single location value.

    The value is split on spaces and on ", ". Purely numeric tokens (zip codes)
    are discarded, the last all-uppercase token is taken as the state, and the
    remaining tokens are title-cased and joined with spaces to form the city.

    A missing location yields ``(None, None)``; a location without an
    all-uppercase token yields ``None`` for the state. Each row is parsed on
    its own, so no row can pick up the state of the row before it.
    """
    if pd.isna(location):
        return None, None
    tokens = [t for t in re.split(' |, ', str(location)) if t and not t.isdigit()]
    city = " ".join(t.title() for t in tokens if not t.isupper())
    upper_tokens = [t for t in tokens if t.isupper()]
    state = upper_tokens[-1] if upper_tokens else None
    return city, state


# Parse every location once, then assign whole columns in a single step
# (much cheaper than writing cell by cell inside an iterrows() loop).
parsed_locations = [_split_city_state(loc) for loc in ds_df['location']]
ds_df['city'] = [city for city, _ in parsed_locations]
ds_df['state'] = [state for _, state in parsed_locations]

ds_df.tail()

Unnamed: 0,position,company,description,reviews,location,city,state
6959,Data Developer / Machine Learning Analyst,NetApp,Are you data-driven? We at NetApp believe in t...,574.0,"Sunnyvale, CA",Sunnyvale,CA
6960,Scientist I,"Pharmacyclics, an Abbvie Company",Pharmacyclics is committed to the development ...,26.0,"Sunnyvale, CA",Sunnyvale,CA
6961,Intern Scientist,Oath Inc,"Oath, a subsidiary of Verizon, is a values-led...",5.0,"Sunnyvale, CA",Sunnyvale,CA
6962,Senior Data & Applied Scientist,Microsoft,We are the Bing Core Relevance team responsibl...,4618.0,"Sunnyvale, CA",Sunnyvale,CA
6963,"Principal Data Scientist, Deep Learning",Comcast,Comcastâs Technology &amp; Product organizat...,11610.0,"Sunnyvale, CA 94089",Sunnyvale,CA


In [9]:
# Display the postings ordered by state. The sorted copy is not assigned,
# so ds_df itself keeps its original row order.
ds_df.sort_values(by="state")

Unnamed: 0,position,company,description,reviews,location,city,state
6963,"Principal Data Scientist, Deep Learning",Comcast,Comcastâs Technology &amp; Product organizat...,11610.0,"Sunnyvale, CA 94089",Sunnyvale,CA
4825,"Senior Staff Scientist, R&D - San Diego",Instrumentation Laboratory,Overview\nInstrumentation Laboratory\nOur Pass...,47.0,"San Diego, CA",San Diego,CA
4824,Software Engineer,ASML,Introduction\nASML brings together the most cr...,176.0,"San Diego, CA",San Diego,CA
4823,Lead Analytic Scientist (AI/Machine Learning/J...,FICO,Job Description\nThe need for analytics is eve...,121.0,"San Diego, CA",San Diego,CA
4822,Project Administrator/Project Accountant,Kleinfelder,Details:\n\nKleinfelder has been connecting gr...,92.0,"San Diego, CA",San Diego,CA
4821,"Manager, Systems Design",Cymer,ASML is one of the worldâs leading manufactu...,52.0,"San Diego, CA",San Diego,CA
4820,Staff System Performance Engineer,Cymer,ASML is one of the worldâs leading manufactu...,52.0,"San Diego, CA",San Diego,CA
4826,"Architect (Deep / Machine Learning, Big Data)",Workbridge Associates,An international software company is hiring a ...,40.0,"San Diego, CA",San Diego,CA
4819,Software Engineer - Customer Solutions,Illumina,Position Summary:\n\nThe Customer Solutions or...,133.0,"San Diego, CA",San Diego,CA
4817,Senior Operations Research Analyst,Engility,About Engility:\n\nEngility delivers innovativ...,436.0,"San Diego, CA 92108",San Diego,CA


In [10]:
# Turn every newline into a single space. The regex substitution runs over the
# whole frame, so it reaches the descriptions and any other text column as well.
ds_df = ds_df.replace(to_replace='\n', value=' ', regex=True)

In [11]:
# Parse descriptions into words: lower-case all descriptions, split them into
# tokens and keep the 1000 most frequent tokens with their counts.
# The pattern splits on any whitespace character and swallows a '.' or ','
# that sits directly in front of it. It is a raw string, so the backslashes
# reach the regex engine without relying on invalid string escapes.
token_pattern = r'[.,]?\s'
all_descriptions = " ".join(ds_df['description'].astype(str)).lower()
token_counts = pd.Series(re.split(token_pattern, all_descriptions)).value_counts()
words = dict(token_counts.head(1000))

# delete non-words; pop() with a default does not raise when one of these
# tokens did not make it into the top 1000
non_words = ['&amp;','(e.g','(e.g.','(including','-','/','1','2','2+','3','3+','5','5+',':','']
for token in non_words:
    words.pop(token, None)

# Drop tokens that contain stray bytes from mis-decoded punctuation. Each
# character is tested on its own: an expression of the form
# `(a or b) not in k` only ever tests `a`.
# NOTE(review): the two character literals of the original filter are not
# visible in this copy of the notebook. They are presumably U+0080 and U+0099,
# the control characters left behind when a UTF-8 right quote is decoded as
# ISO-8859-16 (compare "Bachelorâs" in the previews) — confirm.
mojibake_chars = ('\x80', '\x99')
words = {k: v for k, v in words.items() if not any(ch in k for ch in mojibake_chars)}

In [12]:
# Read the list of common English words: one entry per line of
# common_words.txt, with surrounding whitespace removed.
common_words = []
with open('common_words.txt', 'r') as words_file:
    for raw_line in words_file:
        common_words.append(raw_line.strip())

In [13]:
# Discard every token that appears in the common-English list ...
uncommon_counts = {
    token: count
    for token, count in words.items()
    if token not in common_words
}
# ... and keep the 500 most frequent of the rest, highest count first
words_filtered = pd.Series(uncommon_counts).sort_values(ascending=False)[:500]

In [14]:
# Sanity check: how many words survived the filtering
len(words_filtered)

500

In [None]:
# Create a new column that contains key words from description: for every
# posting, the top-ranked filtered words that occur in its description,
# comma-separated in rank order.
N_KEY_WORDS = 100  # only the most frequent filtered words count as key words
top_key_words = list(words_filtered.index[:N_KEY_WORDS])
# Raw-string form of the tokenising regex: split on whitespace, swallowing a
# '.' or ',' directly in front of it.
description_token_pattern = r'[.,]?\s'


def _extract_key_words(description):
    """Return the key words found in one description as a ", "-joined string.

    The description is converted to ``str`` (a missing value becomes the token
    'nan'), lower-cased and split into a set of tokens. Key words are reported
    in the order of ``top_key_words``, each at most once.
    """
    tokens = set(re.split(description_token_pattern, str(description).lower()))
    # Set membership gives the same result as comparing every key word against
    # every token, without the nested loop.
    return ", ".join(word for word in top_key_words if word in tokens)


# Build the whole column in one pass instead of writing it cell by cell
ds_df['key_words'] = ds_df['description'].map(_extract_key_words)

In [16]:
# Keep only the columns that go into the database; every other column is dropped
export_columns = ["position","company","key_words","city","state"]
ds_df = ds_df[export_columns]
ds_df.head()

Unnamed: 0,position,company,key_words,city,state
0,Development Director,ALS TDI,"data, research, development, skills, business,...",Atlanta,GA
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"data, research, development, skills, ability, ...",Atlanta,GA
2,Data Scientist,Xpert Staffing,"data, research, development, skills, business,...",Atlanta,GA
3,Data Analyst,Operation HOPE,"data, development, skills, years, management, ...",Atlanta,GA
4,Assistant Professor -TT - Signal Processing & ...,Emory University,"data, research, development, ability, learning...",Atlanta,GA


In [17]:
# Turn the word -> count Series into a two-column frame ('word', 'frequency')
column_names = {'index':'word',0:'frequency'}
words_filtered_df = pd.DataFrame(words_filtered).reset_index()
words_filtered_df = words_filtered_df.rename(columns = column_names)

In [18]:
# Set up connection to SQLite
from sqlalchemy import create_engine

In [19]:
# SQLAlchemy engine for the SQLite file database.db (a relative path)
engine = create_engine('sqlite:///database.db')

In [22]:
# Write the cleaned postings to the data_science_companies table, without the
# DataFrame index. if_exists='replace' keeps the cell re-runnable: with the
# default ('fail') to_sql raises ValueError as soon as the table already exists.
ds_df.to_sql('data_science_companies', con=engine, index=False, if_exists='replace')

In [23]:
# Write the word frequencies to the job_common_words table, without the
# DataFrame index. if_exists='replace' keeps the cell re-runnable: with the
# default ('fail') to_sql raises ValueError as soon as the table already exists.
words_filtered_df.to_sql('job_common_words', con=engine, index=False, if_exists='replace')

# Occupation stats data

In [30]:
# Load the occupation statistics CSV into oc_df.
# No encoding is passed here, so pandas' default is used.
occu = 'occupation-filtered.csv'
oc_df = pd.read_csv(occu)

In [31]:
# Preview the first five rows of the occupation statistics
oc_df.head()

Unnamed: 0,ST,STATE,OCC_TITLE,TOT_EMP,JOBS_1000,LOC_Q,H_MEAN,A_MEAN,H_PCT10,H_PCT25,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,AL,Alabama,Computer and Information Systems Managers,3120,1.624,0.63,59.51,123790,32.07,43.41,56.24,71.68,88.92,66710,90290,116990,149100,184950,,
1,AL,Alabama,Computer and Mathematical Occupations,39760,20.68,0.69,39.36,81870,20.26,27.23,37.3,49.05,61.78,42150,56630,77590,102020,128500,,
2,AK,Alaska,Computer and Information Systems Managers,360,1.132,0.44,52.91,110050,37.65,43.66,51.71,60.99,72.78,78320,90810,107550,126870,151390,,
3,AK,Alaska,Computer and Mathematical Occupations,5450,17.124,0.57,39.34,81820,20.62,27.33,36.92,47.13,59.75,42890,56840,76800,98040,124290,,
4,AZ,Arizona,Computer and Information Systems Managers,7000,2.589,1.01,64.87,134930,34.48,46.89,62.12,77.06,96.85,71720,97540,129220,160280,201450,,
