# EDA to draw insights from the dataset

In [1]:
# import data and libraries
import pandas as pd

# load the combined job-postings CSV and preview the first rows
csv_path = '../2023-04-14-job-search/Clean_Data/combined_data_final.csv'
df = pd.read_csv(csv_path)

df.head()

Unnamed: 0,title,company_name,location,via,description,detected_extensions.schedule_type,detected_extensions.work_from_home,detected_extensions.posted_at,detected_extensions.salary,search_parameters.q,Qualifications,Responsibilities,Benefits
0,Ethereum Blockchain Developer (Remote),Ex Populus,Anywhere,via Built In,Company Overview:\nEx Populus is a cutting-edg...,Full-time,True,,,block chain,"2-3 years of Software Development experience,1...","Design, maintain and deploy smart contracts fo...",
1,Blockchain Engineer,21.co,"New York, NY",via Greenhouse,We are seeking a highly motivated and skilled ...,Full-time,,,,block chain,Bachelor's or Master's degree in Computer Scie...,"As a Blockchain Engineer, you will be responsi...",(NYC only) Pursuant to Section 8-102 of title ...
2,Blockchain Course Instructor,Blockchain Institute of Technology,Anywhere,via LinkedIn,"Are you a blockchain, cryptocurrency, NFT, Met...",Contractor,True,24 hours ago,,block chain,"3+ years of experience in blockchain, cryptocu...",Our expert technical team will provide the sup...,
3,Python based - Blockchain developer to join ex...,Upwork,Anywhere,via Upwork,Need someone to join our existing team to spee...,Contractor,True,2 days ago,10–30 an hour,block chain,"Candidates must be willing to sign, non-disclo...",Will discuss details with the selected candidates,
4,Blockchain DevOps Engineer (Remote),Telnyx,United States,via Startup Jobs,"About Telnyx\n\nAt Telnyx, we’re architecting ...",Full-time,,4 days ago,,block chain,You are a highly motivated and experienced Blo...,To build a best-in-class Filecoin (FIL) Mining...,


## Plot bar chart for search_parameters.q

In [33]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# set the default renderer used when figures are shown; the "+" joins two
# renderer names ("plotly_mimetype" and "notebook_connected") so both are used
pio.renderers.default = "plotly_mimetype+notebook_connected"

In [30]:
# bar chart for search_parameters.q: number of postings per search query.
# value_counts() labels its result differently across pandas versions
# (pandas >= 2.0 names the values "count" and moves the column name to the
# index), so build an explicit two-column frame instead of relying on those
# implicit labels.
df_temp = (
    df['search_parameters.q']
    .value_counts()
    .rename_axis('search_parameters.q')
    .reset_index(name='Count')
)
fig = px.bar(df_temp, x='search_parameters.q', y='Count', title='Search Parameters',
             color='search_parameters.q',
             labels={'search_parameters.q': 'Search Parameter'},
             text='Count',  # print the count above each bar
             color_discrete_sequence=px.colors.qualitative.Plotly
            )
fig.update_traces(texttemplate='%{text:}', textposition='outside')
fig.show()

## Map for Job Locations

In [33]:
# frequency of each distinct location string, most common first
df.loc[:, "location"].value_counts()

 Anywhere                 130
  Washington, DC           36
  United States            30
  New York, NY             28
  San Francisco, CA        18
                         ... 
  Cincinnati, OH            1
  Culver City, CA           1
  Pittsburgh, PA            1
  Madrid, AL                1
  Minneapolis, MN           1
Name: location, Length: 170, dtype: int64

In [150]:
# work on a copy of the loaded data so the original `df` is left unchanged
df_temp = df.copy()

In [9]:
# add longitude and latitude
from geopy.exc import GeopyError
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
import numpy as np

geolocator = Nominatim(user_agent="my_user_agent")
# The public Nominatim service expects at most one request per second;
# RateLimiter spaces the calls out and retries transient service errors.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)


def _lookup_coordinates(place):
    """Return (latitude, longitude) for a location string.

    Returns (nan, nan) when the geocoder finds no match or raises a
    geopy error, so one bad lookup does not abort the whole run.
    """
    try:
        match = geocode(place)
    except GeopyError:
        return (np.nan, np.nan)
    if match is None:
        return (np.nan, np.nan)
    return (match.latitude, match.longitude)


# Geocode every distinct location once instead of once per row; many rows
# share the same location string.
# NOTE(review): non-place strings such as "Anywhere" still resolve to some
# coordinate (see the lat/lon shown for those rows); left as-is because later
# cells keep and count those rows.
coords_by_location = {
    place: _lookup_coordinates(place)
    for place in df_temp['location'].dropna().unique()
}

# expand the per-location results back to one entry per row
row_coords = [
    coords_by_location.get(place, (np.nan, np.nan))
    for place in df_temp['location']
]
lat = [pair[0] for pair in row_coords]
lon = [pair[1] for pair in row_coords]

# add lat and lon to df_temp
df_temp['lat'] = lat
df_temp['lon'] = lon

df_temp.head()


Unnamed: 0,title,company_name,location,via,description,detected_extensions.schedule_type,detected_extensions.work_from_home,detected_extensions.posted_at,detected_extensions.salary,search_parameters.q,Qualifications,Responsibilities,Benefits,lat,lon
0,Ethereum Blockchain Developer (Remote),Ex Populus,Anywhere,via Built In,Company Overview:\nEx Populus is a cutting-edg...,Full-time,True,,,block chain,"2-3 years of Software Development experience,1...","Design, maintain and deploy smart contracts fo...",,-32.513957,19.954959
1,Blockchain Engineer,21.co,"New York, NY",via Greenhouse,We are seeking a highly motivated and skilled ...,Full-time,,,,block chain,Bachelor's or Master's degree in Computer Scie...,"As a Blockchain Engineer, you will be responsi...",(NYC only) Pursuant to Section 8-102 of title ...,42.912569,-76.734675
2,Blockchain Course Instructor,Blockchain Institute of Technology,Anywhere,via LinkedIn,"Are you a blockchain, cryptocurrency, NFT, Met...",Contractor,True,24 hours ago,,block chain,"3+ years of experience in blockchain, cryptocu...",Our expert technical team will provide the sup...,,-32.513957,19.954959
3,Python based - Blockchain developer to join ex...,Upwork,Anywhere,via Upwork,Need someone to join our existing team to spee...,Contractor,True,2 days ago,10–30 an hour,block chain,"Candidates must be willing to sign, non-disclo...",Will discuss details with the selected candidates,,-32.513957,19.954959
4,Blockchain DevOps Engineer (Remote),Telnyx,United States,via Startup Jobs,"About Telnyx\n\nAt Telnyx, we’re architecting ...",Full-time,,4 days ago,,block chain,You are a highly motivated and experienced Blo...,To build a best-in-class Filecoin (FIL) Mining...,,39.78373,-100.445882


In [151]:
# attach the geocoded coordinates, then count missing values per column
df_temp = df_temp.assign(lat=lat, lon=lon)
df_temp.isna().sum()

title                                   0
company_name                            0
location                                0
via                                     0
description                             0
detected_extensions.schedule_type       1
detected_extensions.work_from_home    496
detected_extensions.posted_at         212
detected_extensions.salary            525
search_parameters.q                     0
Qualifications                          0
Responsibilities                       85
Benefits                              310
lat                                    12
lon                                    12
dtype: int64

In [152]:
# keep only the rows where both coordinates are present
df_temp = df_temp.dropna(axis='index', how='any', subset=['lat', 'lon'])

In [153]:
# constant pin marker column, the same value on every row
df_temp = df_temp.assign(emoji='📍')

In [154]:
# substitute the literal string 'None' for every remaining missing value,
# then confirm that no nulls are left
df_temp = df_temp.fillna('None')
df_temp.isna().sum()

title                                 0
company_name                          0
location                              0
via                                   0
description                           0
detected_extensions.schedule_type     0
detected_extensions.work_from_home    0
detected_extensions.posted_at         0
detected_extensions.salary            0
search_parameters.q                   0
Qualifications                        0
Responsibilities                      0
Benefits                              0
lat                                   0
lon                                   0
emoji                                 0
dtype: int64

In [155]:
# rename columns to presentation-friendly labels.
# An explicit old -> new mapping is used instead of assigning a positional
# list, so a change in column order or count cannot silently mislabel data.
# Columns not listed here (Qualifications, Responsibilities, Benefits, lat,
# lon, emoji) keep their names.
column_labels = {
    'title': 'Job Title',
    'company_name': 'Company Name',
    'location': 'Location',
    'via': 'Via',
    'description': 'Job Description',
    'detected_extensions.schedule_type': 'Schedule Type',
    'detected_extensions.work_from_home': 'Work from Home',
    'detected_extensions.posted_at': 'Posted at',
    'detected_extensions.salary': 'Salary',
    'search_parameters.q': 'Search Parameters',
}
df_temp = df_temp.rename(columns=column_labels)
df_temp.head()

Unnamed: 0,Job Title,Company Name,Location,Via,Job Description,Schedule Type,Work from Home,Posted at,Salary,Search Parameters,Qualifications,Responsibilities,Benefits,lat,lon,emoji
0,Ethereum Blockchain Developer (Remote),Ex Populus,Anywhere,via Built In,Company Overview:\nEx Populus is a cutting-edg...,Full-time,True,,,block chain,"2-3 years of Software Development experience,1...","Design, maintain and deploy smart contracts fo...",,-32.513957,19.954959,📍
1,Blockchain Engineer,21.co,"New York, NY",via Greenhouse,We are seeking a highly motivated and skilled ...,Full-time,,,,block chain,Bachelor's or Master's degree in Computer Scie...,"As a Blockchain Engineer, you will be responsi...",(NYC only) Pursuant to Section 8-102 of title ...,42.912569,-76.734675,📍
2,Blockchain Course Instructor,Blockchain Institute of Technology,Anywhere,via LinkedIn,"Are you a blockchain, cryptocurrency, NFT, Met...",Contractor,True,24 hours ago,,block chain,"3+ years of experience in blockchain, cryptocu...",Our expert technical team will provide the sup...,,-32.513957,19.954959,📍
3,Python based - Blockchain developer to join ex...,Upwork,Anywhere,via Upwork,Need someone to join our existing team to spee...,Contractor,True,2 days ago,10–30 an hour,block chain,"Candidates must be willing to sign, non-disclo...",Will discuss details with the selected candidates,,-32.513957,19.954959,📍
4,Blockchain DevOps Engineer (Remote),Telnyx,United States,via Startup Jobs,"About Telnyx\n\nAt Telnyx, we’re architecting ...",Full-time,,4 days ago,,block chain,You are a highly motivated and experienced Blo...,To build a best-in-class Filecoin (FIL) Mining...,,39.78373,-100.445882,📍


In [156]:
# add state column: the text after the first comma of "City, ST".
# Splitting on ',' leaves a leading space (" ST"), so strip it here to keep
# the state codes clean everywhere they are used. Locations without a comma
# (e.g. "Anywhere") yield NaN.
df_temp['State'] = df_temp['Location'].str.split(',').str[1].str.strip()

In [157]:
# number of postings per state, broadcast back onto every row of that state
# (rows with a missing state get NaN)
state_sizes = df_temp['State'].value_counts()
df_temp['Job Count(by State)'] = df_temp['State'].map(state_sizes)

In [158]:
# rows with no count are recorded as 0, and the column is stored as integers
job_counts = df_temp['Job Count(by State)']
df_temp['Job Count(by State)'] = job_counts.where(job_counts.notna(), 0).astype(int)

In [160]:
# dataframe for map based on state: one row per (state, job count) pair
df_map = df_temp[['State', 'Job Count(by State)']].drop_duplicates()

# remove rows without a state. Missing states are real NaN values, and a
# comparison against the string 'nan' never matches them, so dropna() does
# the removal; the string filter is kept in case literal 'nan' text is present.
df_map = df_map.dropna(subset=['State'])
df_map = df_map[df_map['State'] != 'nan']

In [161]:
# trim surrounding whitespace from the values in the State column
df_map = df_map.assign(State=df_map['State'].str.strip())

In [178]:
# create map by job count in each state (base layer: one shaded area per state)
fig = px.choropleth(df_map, locations='State', locationmode="USA-states", color='Job Count(by State)',
                    scope='usa', hover_data=['State', 'Job Count(by State)'],
                    color_continuous_scale="Sunset",
                    )

# overlay: one pin per posting, labelled with the emoji column.
# hover_data maps column -> True/False to show or hide it in the tooltip;
# the long free-text columns and the raw coordinates are hidden.
job_pins = px.scatter_geo(
    df_temp,
    lat='lat', lon='lon', hover_name='Company Name',
    hover_data = {
        'lon': False, 'lat': False,
        'Job Title': True,
        'Company Name': True,
        'Location': True,
        'Via': True,
        'Job Description': False,
        'Schedule Type': True,
        'Work from Home': True,
        'Posted at': True,
        'Salary': True,
        'Search Parameters': True,
        'Qualifications': False,
        'Responsibilities': False,
        'Benefits': False,
        'emoji': False
    },
    text='emoji',)
job_pins.update_traces(textfont_size=20)
# only the first trace of the scatter figure is copied onto the choropleth
fig.add_trace(job_pins.data[0])

# Both traces are geo traces, so the view is configured through `geo`.
# The previous `mapbox=dict(zoom=6)` entry was removed: this figure has no
# mapbox traces, so that setting had no effect.
fig.update_layout(
    title=dict(
        text='Job Opportunities in the United States',
        font = dict(size=24)
    ),
    autosize=True,
    hovermode='closest',
    geo = dict(
        scope='usa',
        projection=dict(type='albers usa'),
        showland=True,
        landcolor= 'rgb(217, 217, 217)',
        subunitwidth=1,
        countrywidth=1,
    ),
)

fig.show()

In [179]:
# download map to html
#fig.write_html("map.html")

In [209]:
# ten most common states: share of postings (fraction of 1) and raw counts
(
    df_temp['State'].value_counts(normalize=True).iloc[:10],
    df_temp['State'].value_counts().iloc[:10],
)

( CA       0.210407
  MD       0.151584
  VA       0.144796
  DC       0.081448
  NY       0.067873
  IL       0.047511
  MA       0.042986
  TX       0.038462
  WA       0.033937
  CO       0.015837
 Name: State, dtype: float64,
  CA       93
  MD       67
  VA       64
  DC       36
  NY       30
  IL       21
  MA       19
  TX       17
  WA       15
  CO        7
 Name: State, dtype: int64)

In [210]:
# Schedule Type: share of postings (fraction of 1) followed by raw counts
schedule_type = df_temp['Schedule Type']
(schedule_type.value_counts(normalize=True), schedule_type.value_counts())

(Full-time     0.874593
 Contractor    0.091205
 Internship    0.027687
 Part-time     0.004886
 None          0.001629
 Name: Schedule Type, dtype: float64,
 Full-time     537
 Contractor     56
 Internship     17
 Part-time       3
 None            1
 Name: Schedule Type, dtype: int64)

In [211]:
# Work from Home: share of postings (fraction of 1) followed by raw counts
work_from_home = df_temp['Work from Home']
(work_from_home.value_counts(normalize=True), work_from_home.value_counts())

(None    0.788274
 True    0.211726
 Name: Work from Home, dtype: float64,
 None    484
 True    130
 Name: Work from Home, dtype: int64)