# Main questions
***
- Who gets hired? 
- What kind of talent do employers want when they are hiring a data scientist?
- Which location has the most opportunities?
- What skills, tools, degrees or majors do employers want the most for data scientists?
***

In [1]:
# import libraries
import numpy as np
import math as mt
import pandas as pd

In [2]:
#import dataframe
data = pd.read_csv("alldata.csv") 


In [3]:
#lets get an overview
data.head(50)

Unnamed: 0,position,company,description,reviews,location
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,,"Atlanta, GA 30301"
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",,"Atlanta, GA"
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",,"Atlanta, GA"
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,44.0,"Atlanta, GA 30303"
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,550.0,"Atlanta, GA"
5,Manager of Data Engineering,McKinsey & Company,Qualifications\nBachelor’s degree in Computer ...,385.0,"Atlanta, GA 30318"
6,"Product Specialist - Periscope, New Ventures",McKinsey & Company,Qualifications\nBachelor’s degree\n5-7 years o...,385.0,"Atlanta, GA 30318"
7,"Junior to Mid-level Engineer, Geologist or Env...",Wood,Overview / Responsibilities\nWood Environment ...,899.0,"Atlanta, GA"
8,Analyst - CIB Credit Research,SunTrust,Works closely with senior CIB professionals. P...,3343.0,"Atlanta, GA"
9,Senior Associate - Cognitive Data Scientist Na...,KPMG,Known for being a great place to work and buil...,4494.0,"Atlanta, GA 30338"


In [4]:
data.count()

position       6953
company        6953
description    6953
reviews        5326
location       6953
dtype: int64

In [5]:
#since we don't need the 'reviews' column to answer our question, and the number of rows is oly 25% of the other 
#rows, i decided to drop the whole column
df=data.drop(labels='reviews', axis=1)
df.head(0)

Unnamed: 0,position,company,description,location


In [6]:
df.describe()

Unnamed: 0,position,company,description,location
count,6953,6953,6953,6953
unique,5242,2213,6708,382
top,Data Scientist,Amazon.com,Note: By applying to this position your applic...,"Seattle, WA"
freq,351,358,9,563


Already a few interesting thing to note here!
I would have expected count and unique values in the descritption to be the same but those are most likely NaNs.
Most Jobs are offered in seattle. Either there lots of companies missing or theres companies. lets look at nan values

In [7]:
# lets see how many nan values there are
print(df['position'].isnull().sum())
print(df['company'].isnull().sum())
print(df['description'].isnull().sum())
print(df['location'].isnull().sum())

11
11
11
11


In [22]:
# decided to drop the nan rows since its so few of them anyway
df1 = df[df.notna()]
df1

Unnamed: 0,position,company,description,location
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,"Atlanta, GA 30301"
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...","Atlanta, GA"
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...","Atlanta, GA"
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,"Atlanta, GA 30303"
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,"Atlanta, GA"
...,...,...,...,...
6959,Data Developer / Machine Learning Analyst,NetApp,Are you data-driven? We at NetApp believe in t...,"Sunnyvale, CA"
6960,Scientist I,"Pharmacyclics, an Abbvie Company",Pharmacyclics is committed to the development ...,"Sunnyvale, CA"
6961,Intern Scientist,Oath Inc,"Oath, a subsidiary of Verizon, is a values-led...","Sunnyvale, CA"
6962,Senior Data & Applied Scientist,Microsoft,We are the Bing Core Relevance team responsibl...,"Sunnyvale, CA"


In [24]:
# check if there are no more nan values
df1.isnull().any().any()

True

In [10]:
df1.dtypes

position       object
company        object
description    object
location       object
dtype: object

In [11]:
# change 'position' to string
df1['position'].astype(str)
# standardize 'position' to lowercase
df1['position'] = df1['position'].str.lower()
df1.head(2)

Unnamed: 0,position,company,description,location
0,development director,ALS TDI,Development Director\nALS Therapy Development ...,"Atlanta, GA 30301"
1,an ostentatiously-excitable principal research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...","Atlanta, GA"


In [12]:
# change 'company' to string
df1['company'].astype(str)
# standardize 'company' to lowercase
df1['company'] = df1['company'].str.lower()
df1.head(2)

Unnamed: 0,position,company,description,location
0,development director,als tdi,Development Director\nALS Therapy Development ...,"Atlanta, GA 30301"
1,an ostentatiously-excitable principal research...,the hexagon lavish,"Job Description\n\n""The road that leads to acc...","Atlanta, GA"


In [13]:
# change 'description' to string
df1['description'].astype(str)
# standardize 'description' to lowercase
df1['description'] = df1['description'].str.lower()
df1.head(2)

Unnamed: 0,position,company,description,location
0,development director,als tdi,development director\nals therapy development ...,"Atlanta, GA 30301"
1,an ostentatiously-excitable principal research...,the hexagon lavish,"job description\n\n""the road that leads to acc...","Atlanta, GA"


In [14]:
# change 'location' to string
df1['location'].astype(str)
# standardize location' to lowercase
df1['location'] = df1['location'].str.lower()
df1.head()

Unnamed: 0,position,company,description,location
0,development director,als tdi,development director\nals therapy development ...,"atlanta, ga 30301"
1,an ostentatiously-excitable principal research...,the hexagon lavish,"job description\n\n""the road that leads to acc...","atlanta, ga"
2,data scientist,xpert staffing,"growing company located in the atlanta, ga are...","atlanta, ga"
3,data analyst,operation hope,department: program operationsposition locatio...,"atlanta, ga 30303"
4,assistant professor -tt - signal processing & ...,emory university,description\nthe emory university department o...,"atlanta, ga"


In [15]:
# now lets rassign this aggain to a new array so that we get our objects back
cleaned=pd.DataFrame(data=df1)

In [16]:
cleaned.dtypes

position       object
company        object
description    object
location       object
dtype: object

In [17]:
cleaned.head(10)

Unnamed: 0,position,company,description,location
0,development director,als tdi,development director\nals therapy development ...,"atlanta, ga 30301"
1,an ostentatiously-excitable principal research...,the hexagon lavish,"job description\n\n""the road that leads to acc...","atlanta, ga"
2,data scientist,xpert staffing,"growing company located in the atlanta, ga are...","atlanta, ga"
3,data analyst,operation hope,department: program operationsposition locatio...,"atlanta, ga 30303"
4,assistant professor -tt - signal processing & ...,emory university,description\nthe emory university department o...,"atlanta, ga"
5,manager of data engineering,mckinsey & company,qualifications\nbachelor’s degree in computer ...,"atlanta, ga 30318"
6,"product specialist - periscope, new ventures",mckinsey & company,qualifications\nbachelor’s degree\n5-7 years o...,"atlanta, ga 30318"
7,"junior to mid-level engineer, geologist or env...",wood,overview / responsibilities\nwood environment ...,"atlanta, ga"
8,analyst - cib credit research,suntrust,works closely with senior cib professionals. p...,"atlanta, ga"
9,senior associate - cognitive data scientist na...,kpmg,known for being a great place to work and buil...,"atlanta, ga 30338"


In [18]:
#convert to an excel we can read and work with normally
cleaned.to_excel(r'cleaned.xlsx', index = False)

In [19]:
#convert to an csv we can read and work with normally

cleaned.to_csv(r'cleanedcsv.csv', index = False, header = True)
