# Main questions
***
- Who gets hired? 
- What kind of talent do employers want when they are hiring a data scientist?
- Which location has the most opportunities?
- What skills, tools, degrees or majors do employers want the most for data scientists?
***

In [26]:
# import libraries
import numpy as np
import math as mt
import pandas as pd

In [27]:
#import dataframe
data = pd.read_csv("alldata.csv") 


In [28]:
#lets get an overview
data.head(50)

Unnamed: 0,position,company,description,reviews,location
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,,"Atlanta, GA 30301"
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",,"Atlanta, GA"
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",,"Atlanta, GA"
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,44.0,"Atlanta, GA 30303"
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,550.0,"Atlanta, GA"
5,Manager of Data Engineering,McKinsey & Company,Qualifications\nBachelor’s degree in Computer ...,385.0,"Atlanta, GA 30318"
6,"Product Specialist - Periscope, New Ventures",McKinsey & Company,Qualifications\nBachelor’s degree\n5-7 years o...,385.0,"Atlanta, GA 30318"
7,"Junior to Mid-level Engineer, Geologist or Env...",Wood,Overview / Responsibilities\nWood Environment ...,899.0,"Atlanta, GA"
8,Analyst - CIB Credit Research,SunTrust,Works closely with senior CIB professionals. P...,3343.0,"Atlanta, GA"
9,Senior Associate - Cognitive Data Scientist Na...,KPMG,Known for being a great place to work and buil...,4494.0,"Atlanta, GA 30338"


In [29]:
data.count()

position       6953
company        6953
description    6953
reviews        5326
location       6953
dtype: int64

In [30]:
#since we don't need the 'reviews' column to answer our question, and the number of rows is oly 25% of the other 
#rows, i decided to drop the whole column
df=data.drop(labels='reviews', axis=1)
df.head(0)

Unnamed: 0,position,company,description,location


In [31]:
df.describe()

Unnamed: 0,position,company,description,location
count,6953,6953,6953,6953
unique,5242,2213,6708,382
top,Data Scientist,Amazon.com,Note: By applying to this position your applic...,"Seattle, WA"
freq,351,358,9,563


Thamo expected count and unique values to be the same but those are most likely NaNs.

Most jobs are apparently being offered in Seattle, New York, Cambridge and Boston along with San Francisco.


In [32]:
# change 'position' to string
df['position'].astype(str)
# standardize 'position' to lowercase
df['position'] = df['position'].str.lower()
df.head(2)

Unnamed: 0,position,company,description,location
0,development director,ALS TDI,Development Director\nALS Therapy Development ...,"Atlanta, GA 30301"
1,an ostentatiously-excitable principal research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...","Atlanta, GA"


In [33]:
# change 'company' to string
df['company'].astype(str)
# standardize 'company' to lowercase
df['company'] = df['company'].str.lower()
df.head(2)

Unnamed: 0,position,company,description,location
0,development director,als tdi,Development Director\nALS Therapy Development ...,"Atlanta, GA 30301"
1,an ostentatiously-excitable principal research...,the hexagon lavish,"Job Description\n\n""The road that leads to acc...","Atlanta, GA"


In [34]:
# change 'description' to string
df['description'].astype(str)
# standardize 'description' to lowercase
df['description'] = df['description'].str.lower()
df.head(2)

Unnamed: 0,position,company,description,location
0,development director,als tdi,development director\nals therapy development ...,"Atlanta, GA 30301"
1,an ostentatiously-excitable principal research...,the hexagon lavish,"job description\n\n""the road that leads to acc...","Atlanta, GA"


In [35]:
# change 'location' to string
df['location'].astype(str)
# standardize location' to lowercase
df['location'] = df['location'].str.lower()
df.head()

Unnamed: 0,position,company,description,location
0,development director,als tdi,development director\nals therapy development ...,"atlanta, ga 30301"
1,an ostentatiously-excitable principal research...,the hexagon lavish,"job description\n\n""the road that leads to acc...","atlanta, ga"
2,data scientist,xpert staffing,"growing company located in the atlanta, ga are...","atlanta, ga"
3,data analyst,operation hope,department: program operationsposition locatio...,"atlanta, ga 30303"
4,assistant professor -tt - signal processing & ...,emory university,description\nthe emory university department o...,"atlanta, ga"


In [36]:
# lets see how many nan values there are
print(df['position'].isnull().sum())
print(df['company'].isnull().sum())
print(df['description'].isnull().sum())
print(df['location'].isnull().sum())

11
11
11
11


In [37]:
# decided to drop the nan rows since its so few of them anyway
df1 = df.dropna()

In [38]:
# check if there are no more nan values
df1.isnull().any().any()

False

In [39]:
# see if there is really really no more nan value


In [40]:
# lets see how many nan values there are
print(df1['position'].isnull().sum())
print(df1['company'].isnull().sum())
print(df1['description'].isnull().sum())
print(df1['location'].isnull().sum())

0
0
0
0


In [41]:
df1.dtypes

position       object
company        object
description    object
location       object
dtype: object

In [42]:
# now lets rassign this again to a new array so that we get our objects back
cleaned=pd.DataFrame(data=df1)

In [43]:
cleaned.dtypes

position       object
company        object
description    object
location       object
dtype: object

In [44]:
cleaned.head(10)

Unnamed: 0,position,company,description,location
0,development director,als tdi,development director\nals therapy development ...,"atlanta, ga 30301"
1,an ostentatiously-excitable principal research...,the hexagon lavish,"job description\n\n""the road that leads to acc...","atlanta, ga"
2,data scientist,xpert staffing,"growing company located in the atlanta, ga are...","atlanta, ga"
3,data analyst,operation hope,department: program operationsposition locatio...,"atlanta, ga 30303"
4,assistant professor -tt - signal processing & ...,emory university,description\nthe emory university department o...,"atlanta, ga"
5,manager of data engineering,mckinsey & company,qualifications\nbachelor’s degree in computer ...,"atlanta, ga 30318"
6,"product specialist - periscope, new ventures",mckinsey & company,qualifications\nbachelor’s degree\n5-7 years o...,"atlanta, ga 30318"
7,"junior to mid-level engineer, geologist or env...",wood,overview / responsibilities\nwood environment ...,"atlanta, ga"
8,analyst - cib credit research,suntrust,works closely with senior cib professionals. p...,"atlanta, ga"
9,senior associate - cognitive data scientist na...,kpmg,known for being a great place to work and buil...,"atlanta, ga 30338"


In [45]:
#convert to an excel we can read and work with normally
cleaned.to_excel(r'cleaned.xlsx', index = False)

In [46]:
#convert to an csv we can read and work with normally

cleaned.to_csv(r'cleanedcsv.csv', index = False, header = True)


In [47]:
# Tony: import regular expressions

import re



In [48]:
# Tony: let's filter the "description" column from the cleaned df

cleaned.loc[:, ["description"]]

Unnamed: 0,description
0,development director\nals therapy development ...
1,"job description\n\n""the road that leads to acc..."
2,"growing company located in the atlanta, ga are..."
3,department: program operationsposition locatio...
4,description\nthe emory university department o...
...,...
6959,are you data-driven? we at netapp believe in t...
6960,pharmacyclics is committed to the development ...
6961,"oath, a subsidiary of verizon, is a values-led..."
6962,we are the bing core relevance team responsibl...


In [49]:
# Tony: assign the filterd description column to variable named test
test = cleaned.loc[:, ["description"]]
test.shape

(6953, 1)

In [50]:
# Kosta helps Tony with testing so he understands what is dataframe and what is series
# and functions related to them

type(test_sql)

NameError: name 'test_sql' is not defined

In [51]:
# Tony: 
# test is cleaned.loc[:,["description"]]
# now we want to test the extraction of keywords
# count them
# store the results into a variable and convert them into an int

test_sql = test[test["description"].str.contains("sql")]
sql_count = test_sql.count()
IntSQLCount = int(sql_count)
IntSQLCount # count the columns # 1924 mentions

1924

In [52]:
test_economics = test[test["description"].str.contains("economics")]
economics_count = test_economics.count()
IntEconomicsCount = int(economics_count)
IntEconomicsCount # count the columns # 630 mentions

630

In [53]:
# Tony: we will continue this way with other criterias that we defined

In [54]:
test_python = test[test["description"].str.contains("python")]
python_count = test_python.count() # count them
IntPythonCount = int(python_count) #
IntPythonCount

# 2852 mentions for python

2852

In [55]:
test_R = test[test["description"].str.contains("R")]
R_count = test_R.count() 
IntRCount = int(R_count)
IntRCount

# Tony:  0 mentions for R? I am probably doing sth wrong :)

0

In [56]:
test_excel = test[test["description"].str.contains("excel")]
excel_count = test_excel.count()
IntExcelCount = int(excel_count)
IntExcelCount

3566

In [57]:
test_tableau = test[test["description"].str.contains("tableau")]
tableau_count = test_tableau.count()
IntTableauCount = int(tableau_count)
IntTableauCount

492

In [58]:
test_bachelor = test[test["description"].str.contains("bachelor degree")]
bachelor_count = test_bachelor.count()
IntBachelorCount = int(bachelor_count)
IntBachelorCount

# Tony: 50 mentions out of 6953 rows?

50

In [59]:
test_master = test[test["description"].str.contains("master degree")]
master_count = test_master.count()
IntMasterCount = int(master_count)
IntMasterCount

# Tony: 54 mentions out of 6953 rows?

54

In [60]:
test_PHD = test[test["description"].str.contains("phd degree")]
PHD_count = test_PHD.count()
IntPHDCount = int(PHD_count)
IntPHDCount

# Tony: 206 mentions from 6953 rows

206

In [61]:
test_data_analyst = test[test["description"].str.contains("data analyst")]
DataAnalyst_count = test_data_analyst.count()
IntDataAnalystCount = int(DataAnalyst_count)
IntDataAnalystCount

# Tony: 233 out of 6953 rows

233

In [62]:
# test
# data[data['position'].str.contains("data scientist" or "scientist")]
df[df['description'].str.contains("bac")]

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [None]:
# test
description.str.contains("data")

In [None]:
#cleaned.head()
cleaned[(cleaned['description'])]