In [2]:
import pandas as pd
import numpy as np
import yake
import plotly.express as px
import plotly.graph_objects as go
import re

In [3]:
us = pd.read_csv("./data/cleaned/us.csv")
dc = pd.read_csv("./data/cleaned/dc.csv")

In [4]:
us  = us.drop(us.columns[[0,1]], axis = 1)
dc  = dc.drop(dc.columns[[0,1]], axis = 1)

In [5]:
full = pd.concat([dc,us], axis = 0, ignore_index= True)
full.city = [city.strip() for city in full.city]

In [6]:
grouped = full.groupby(['location','domain'], as_index=False).mean(['min_salary','max_salary']).dropna()

In [7]:
fig = px.scatter(grouped, 
           x= grouped.index.to_list(), 
           y = 'min_salary', 
           color  = 'location', 
           hover_data  = 'domain',
           labels = dict(x = "", min_salary = 'Minimum Salary (in Million USD)'),
           title = "Distribution of Minimum Salary Ranges Overall (DC vs. US)")

fig.update_traces(marker = dict(size =20,
                                line = dict(width =2)))

fig.update_layout({
'plot_bgcolor': 'rgba(0, 0, 0, 0)',
'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

In [8]:
fig = px.scatter(grouped, 
           x= grouped.index.to_list(), 
           y = 'max_salary', 
           color  = 'location', 
           hover_data  = 'domain',
           labels = dict(x = "", max_salary = 'Maximum Salary (in Million USD)'),
           title = "Distribution of Maximum Salary Ranges Overall (DC vs. US)")

fig.update_traces(marker = dict(size =20,
                                line = dict(width =2)))

fig.update_layout({
'plot_bgcolor': 'rgba(0, 0, 0, 0)',
'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

In [9]:
fig = px.bar(grouped, x = "location", 
             y = "min_salary", 
             color = "location", 
             barmode = "group", 
             facet_col="domain", 
             facet_col_wrap=3, 
             width = 600, height = 800,
             labels = dict(x = "", min_salary = 'Maximum Salary (USD)'),
             title = "Average Minimum Salary by Domain (DC vs. US)")
             

for axis in fig.layout:
    if type(fig.layout[axis]) == go.layout.XAxis:
        fig.layout[axis].title.text = ''

fig['layout']['xaxis2']['title']['text']= ""    

fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

for axis in fig.layout:
    if type(fig.layout[axis]) == go.layout.YAxis:
        fig.layout[axis].title.text = ''

fig.add_annotation(x=-0.1,y=0.5,
                   text="Maximum Salary (USD)", textangle=-90,
                    xref="paper", yref="paper")

fig.update_layout({
'plot_bgcolor': 'rgba(0, 0, 0, 0)',
'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

fig.show()

In [10]:
fig = px.violin(full, 
                y="min_salary", 
                x="location", 
                color="location", 
                box=True,
                log_y=False, 
                points="all",
                facet_col='domain', 
                facet_col_wrap = 5, 
                width = 1000, height = 700,
                title = "Distribution of Salary by Domain (DC vs. US)")

fig.update_yaxes(matches=None)

for axis in fig.layout:
    if type(fig.layout[axis]) == go.layout.XAxis:
        fig.layout[axis].title.text = ''

fig['layout']['xaxis2']['title']['text']= ""    

fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

for axis in fig.layout:
    if type(fig.layout[axis]) == go.layout.YAxis:
        fig.layout[axis].title.text = ''

fig.add_annotation(x=-0.1,y=0.5,
                   text="Minimum Salary (USD)", textangle=-90,
                    xref="paper", yref="paper")

fig.update_layout({
'plot_bgcolor': 'rgba(0, 0, 0, 0)',
'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

fig.show()





In [11]:
grouped = full.groupby(['city','domain'], as_index=False).count().dropna()

In [12]:
x = []
for i in range(grouped.shape[0]):
    if grouped.loc[i,"city"].strip() in ['New York', "Washington", "Anywhere", "San Francisco", "Annapolis Junction"]:
        x.append(True)
    else:
        x.append(False)

grouped = grouped[x].reset_index()

In [13]:
fig = px.bar(grouped, x = "city", 
             y = "location", 
             color = "city", 
             barmode = "group", 
             facet_col="domain", 
             facet_col_wrap=3, 
             width = 1000, height = 1000,
             labels = dict(x = "", min_salary = 'Maximum Salary (USD)'),
             title = "Number of Jobs in Top 5 Job Count Cities by Domain")
             
fig.update_traces(width=1)
fig.update_yaxes(matches=None)


for axis in fig.layout:
    if type(fig.layout[axis]) == go.layout.XAxis:
        fig.layout[axis].title.text = ''

fig['layout']['xaxis2']['title']['text']= ""    

fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

for axis in fig.layout:
    if type(fig.layout[axis]) == go.layout.YAxis:
        fig.layout[axis].title.text = ''

fig.add_annotation(x=-.01,y=0.5,
                   text="Maximum Salary (USD)", textangle=-90,
                    xref="paper", yref="paper")

fig.update_layout({
'plot_bgcolor': 'rgba(0, 0, 0, 0)',
'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

fig.show()

In [14]:
grouped1 = full.groupby(['state'], as_index=False).count().dropna()
grouped2 = full.groupby(['state'], as_index=False).mean(['min_salary', 'max_salary'])

In [15]:
grouped = grouped1.merge(grouped2, how = 'inner', on = 'state')
grouped = grouped[['state','min_salary_y', 'max_salary_y', 'location']].dropna()

In [17]:
fig = px.scatter(grouped, 
           x= 'max_salary_y', 
           y = 'min_salary_y', 
           color  = 'location', 
           hover_data  = 'state',
           labels = dict(max_salary_y = "Average Maximum Salary (USD)", min_salary_y = "Average Minimum Salary (USD)", location = "Number of Jobs"),
           title = "Number of Jobs according to Min/Max Salary by State")

fig.update_traces(marker = dict(size = grouped['location'],
                                line = dict(width =2)))

fig.update_layout({
'plot_bgcolor': 'rgba(0, 0, 0, 0)',
'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

In [18]:
options = grouped.state.unique().tolist()

In [20]:
cities = full[full['state'] == options[0]].city.unique().tolist()

In [21]:
x = []
for i in range(full.shape[0]):
    if full.loc[i,"city"].strip() in cities:
        x.append(True)
    else:
        x.append(False)

grouped = full[x].reset_index()

In [22]:
groupedx = grouped.groupby(['city','domain'], as_index=False).mean(['min_salary','max_salary']).dropna()

In [23]:
fig = px.bar(groupedx, x = "city", 
             y = "min_salary", 
             color = "city", 
             barmode = "group", 
             facet_col="domain", 
             facet_col_wrap=3, 
             width = 800, height = 800,
             labels = dict(x = "", min_salary = 'Maximum Salary (USD)'),
             title = f"Average Minimum Salary by Domain {options[0]}")
fig.update_traces(width=1)
fig.update_yaxes(matches=None)             

for axis in fig.layout:
    if type(fig.layout[axis]) == go.layout.XAxis:
        fig.layout[axis].title.text = ''

fig['layout']['xaxis2']['title']['text']= ""    

fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

for axis in fig.layout:
    if type(fig.layout[axis]) == go.layout.YAxis:
        fig.layout[axis].title.text = ''

fig.add_annotation(x=-0.1,y=0.5,
                   text="Maximum Salary (USD)", textangle=-90,
                    xref="paper", yref="paper")

fig.update_layout({
'plot_bgcolor': 'rgba(0, 0, 0, 0)',
'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

fig.show()

In [24]:
available_domains = groupedx.domain.unique().tolist()

In [25]:
groupedy = grouped[grouped['domain'] == available_domains[0]]
top_jobs = groupedy.sort_values(by = "min_salary")[:3].reset_index()

In [26]:
extractor = yake.KeywordExtractor()
language = "en"
max_ngram_size = 3
deduplication_threshold = 0.9
keywords = 5
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=keywords, features=None)


In [28]:
print("The Best Job Fit for you would be in",options[0], "specifically",cities[0], "working in domain of", available_domains[0])
print("Here are the Top 3 Job Recommendations for you and the requirements!")
print('\n')
for i in range(top_jobs.shape[0]):
    x = custom_kw_extractor.extract_keywords(top_jobs.responsibility[i])
    y = custom_kw_extractor.extract_keywords(top_jobs.qualification[i])
    z = custom_kw_extractor.extract_keywords(top_jobs.description[i])
    print(f"Position {i+1}")
    print("---------------------")
    print("Postion:",top_jobs.position[i], "| Company:", top_jobs.company_name[i])
    print("Minimum Salary:",top_jobs.min_salary[i], "| Maximum Salary:", top_jobs.max_salary[i])
    print("Responsibility Top5 Keywords")
    fig1 = go.Figure([go.Bar(
    x=[t[0] for t in x],
    y=[t[1] for t in x],
    text= [t[0] for t in x])])
    fig1.update_xaxes(visible=False)
    fig1.show()
    print("Qualification Top5 Keywords")
    fig2 = go.Figure([go.Bar(
    x=[t[0] for t in x],
    y=[t[1] for t in x],
    text= [t[0] for t in x])])
    fig2.update_xaxes(visible=False)
    fig2.show()
    print("Description Top5 Keywords")
    fig3 = go.Figure([go.Bar(
    x=[t[0] for t in x],
    y=[t[1] for t in x],
    text= [t[0] for t in x])])
    fig3.update_xaxes(visible=False)
    fig3.show()
    print('\n')

The Best Job Fit for you would be in CA specifically South San Francisco working in domain of machine learning
Here are the Top 3 Job Recommendations for you and the requirements!


Position 1
---------------------
Postion: Research scientist | Company: Warner Bros. Discovery
Minimum Salary: 93800.0 | Maximum Salary: 174000.0
Responsibility Top5 Keywords


Qualification Top5 Keywords


Description Top5 Keywords




Position 2
---------------------
Postion: Researcher | Company: Warner Bros. Discovery
Minimum Salary: 93800.0 | Maximum Salary: 174000.0
Responsibility Top5 Keywords


Qualification Top5 Keywords


Description Top5 Keywords




Position 3
---------------------
Postion: Infrastructure engineer | Company: Technical Link
Minimum Salary: 170560.0 | Maximum Salary: nan
Responsibility Top5 Keywords


Qualification Top5 Keywords


Description Top5 Keywords






In [29]:
groupedx = full.groupby(['domain'], as_index=False).mean(['min_salary','max_salary']).dropna()
groupedy = full.groupby(['domain'], as_index=False).count()
groupedy = groupedy[['domain','location']]

In [30]:
grouped = groupedx.merge(groupedy, how = 'inner', on = 'domain')

In [32]:
fig = px.scatter(grouped, 
           x= 'max_salary', 
           y = 'min_salary', 
           color  = 'domain', 
           hover_data  = 'domain',
           labels = dict(max_salary = "Average Maximum Salary (USD)", min_salary = "Average Minimum Salary (USD)", location = "Number of Jobs"),
           title = "Number of Jobs according to Min/Max Salary by Domain")

fig.update_traces(marker = dict(size = grouped['location'],
                                line = dict(width =1)))

fig.update_layout({
'plot_bgcolor': 'rgba(0, 0, 0, 0)',
'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

In [33]:
possilbe_domains = grouped.domain.unique().tolist()

In [34]:
groupedx = full[full['domain'] == possilbe_domains[0]].reset_index().groupby(['state'], as_index = False).mean(['min_salary','max_salary']).dropna()
groupedy = full[full['domain'] == possilbe_domains[0]].reset_index().groupby(['state'], as_index = False).count()
groupedy = groupedy[['state','location']]
grouped = groupedx.merge(groupedy, how = 'inner', on = 'state')

In [37]:
fig5 = go.Figure([go.Bar(
x=grouped['state'],
y=grouped['min_salary'],
text= grouped['state'],
)])

fig5.update_layout(
    title=f"Minimum Salary Average for Top States of Domain: {possilbe_domains[0]}",
    xaxis_title="State",
    yaxis_title="Average Minimum Salary (USD)",
)
fig5.update_xaxes(visible=False)

fig5.show()

In [38]:
possible_states = grouped.state.unique().tolist()

In [39]:
grouped = full[full['domain'] == possilbe_domains[0]].reset_index()
x = []
for i in range(grouped.shape[0]):
    if full.loc[i,"state"] == possible_states[3]:
        x.append(True)
    else:
        x.append(False)

grouped = grouped[x].reset_index()

In [40]:
groupedx = grouped.groupby(['city'], as_index=False).mean(['min_salary', 'max_salary']).dropna()
groupedy = grouped.groupby(['city'], as_index=False).count()
groupedy = groupedy[['city','location']]
grouped = groupedx.merge(groupedy, how = 'inner', on = 'city')

In [42]:
if grouped.shape[0] ==0:
    print("No jobs found! Try other cities :) ")
else:
    fig = px.scatter(grouped, 
            x= 'max_salary', 
            y = 'min_salary', 
            color  = grouped['location'], 
            hover_data  = 'city',
            labels = dict(max_salary = "Average Maximum Salary (USD)", min_salary = "Average Minimum Salary (USD)", location = "Number of Jobs"),
            title = f"Number of Jobs according to Min/Max Salary in {possible_states[0]} Cities")

    fig.update_traces(marker = dict(size = grouped['location']*100,
                                    line = dict(width =1)))

    fig.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)',
    })
    fig.show()

In [43]:
possible_cities = grouped.city.unique().tolist()
grouped = full[full['domain'] == possilbe_domains[0]].reset_index()
grouped = grouped[grouped['city'] == possible_cities[1]]
top_jobs = grouped.sort_values(by = "min_salary")[:3].reset_index()

In [46]:
top_jobs.responsibility[2]

'Work in a team with other smart AIS employees and use cutting-edge technologies to solve challenging enterprise problems Design data platform solutions using Azure data services such as Data Factory, Azure Event Hub, Azure Synapse Analytics, and Azure Databricks Design scalable data processing and analytics solutions, including Big data storage for various data types and large-scale data processing using Databricks Apply your skills in Azure Cognitive Services, Azure P,aaS, and Machine Learning services Use experience working with Azure Data & Storage, Azure Analytics & Azure IoT tools, and the traditional Microsoft BI Stack Provide mentorship to more junior consultants This is a remote position with occasional travel (if needed)'

In [47]:
print("The Best Job Fit for you would be in the domain of",possilbe_domains[0], "located in",possible_cities[0], ",", possible_states[0])
print("Here are the Top 3 Job Recommendations for you and the requirements!")
print('\n')
for i in range(top_jobs.shape[0]):
    x = custom_kw_extractor.extract_keywords(top_jobs.responsibility[i])
    y = custom_kw_extractor.extract_keywords(top_jobs.qualification[i])
    z = custom_kw_extractor.extract_keywords(top_jobs.description[i])
    print(f"Position {i+1}")
    print("---------------------")
    print("Postion:",top_jobs.position[i], "| Company:", top_jobs.company_name[i])
    print("Minimum Salary:",top_jobs.min_salary[i], "| Maximum Salary:", top_jobs.max_salary[i])

    print("Responsibility Top5 Keywords")
    fig1 = go.Figure([go.Bar(
    x=[a[0] for a in x],
    y=[b[1] for b in x],
    text= [t[0] for t in x])])
    fig1.update_xaxes(visible=False)
    fig1.show()

    print("Qualification Top5 Keywords")
    fig2 = go.Figure([go.Bar(
    x=[c[0] for c in x],
    y=[d[1] for d in x],
    text= [t[0] for t in x])])
    fig2.update_xaxes(visible=False)
    fig2.show()
    
    print("Description Top5 Keywords")
    fig3 = go.Figure([go.Bar(
    x=[e[0] for e in x],
    y=[f[1] for f in x],
    text= [t[0] for t in x])])
    fig3.update_xaxes(visible=False)
    fig3.show()
    print('\n')

The Best Job Fit for you would be in the domain of big data located in Chicago , CA
Here are the Top 3 Job Recommendations for you and the requirements!


Position 1
---------------------
Postion: Software engineer | Company: Google
Minimum Salary: 120000.0 | Maximum Salary: 190000.0
Responsibility Top5 Keywords


Qualification Top5 Keywords


Description Top5 Keywords




Position 2
---------------------
Postion: Design engineer | Company: Google
Minimum Salary: 120000.0 | Maximum Salary: 190000.0
Responsibility Top5 Keywords


Qualification Top5 Keywords


Description Top5 Keywords




Position 3
---------------------
Postion: Data architect | Company: Applied Information Sciences
Minimum Salary: nan | Maximum Salary: nan
Responsibility Top5 Keywords


Qualification Top5 Keywords


Description Top5 Keywords






In [48]:
x = custom_kw_extractor.extract_keywords(top_jobs.responsibility[1])

In [49]:
z

[('Azure data services', 0.00664585356973983),
 ('Azure data', 0.007472026886229037),
 ('Data', 0.008514485731433343),
 ('Azure', 0.010369276440114214),
 ('Big data', 0.013131107028711404)]