In [1]:
import pandas as pd
import numpy as np
import yake
import plotly.express as px
import plotly.graph_objects as go
import re
import streamlit as st

In [None]:
# Load the cleaned nationwide (us) and DC-only (dc) job postings.
us = pd.read_csv("./data/cleaned/us.csv")
dc = pd.read_csv("./data/cleaned/dc.csv")

# Drop the first two columns of each file by position.
# NOTE(review): presumably leftover index columns from earlier exports — confirm.
us = us.drop(us.columns[[0, 1]], axis=1)
dc = dc.drop(dc.columns[[0, 1]], axis=1)

# Stack DC rows on top of US rows with a fresh 0..n-1 index.
full = pd.concat([dc, us], axis=0, ignore_index=True)

# Vectorised strip: unlike calling `city.strip()` per element, this leaves
# missing (NaN) cities untouched instead of raising AttributeError.
full['city'] = full['city'].str.strip()

# Default-configured extractor (not referenced in the cells visible here; kept
# in case other cells use it).
extractor = yake.KeywordExtractor()

# Settings for the extractor used for the per-job keyword charts.
language = "en"
max_ngram_size = 3                # keyword phrases of up to three words
deduplication_threshold = 0.9
keywords = 5                      # number of keywords returned per text
custom_kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    top=keywords,
    features=None,
)

In [None]:
st.title("Post Graduation Data Science Job Location")
st.header("Is DC Attractive for Data Science?")

st.write("SALARY")
min_max = st.selectbox("Select Minimum or Maximum:",
                       ['Minimum', 'Maximum'])

# Mean of every numeric column per (location, domain) pair; pairs with a
# missing mean are dropped. `numeric_only=True` states explicitly what the
# previous positional list argument only achieved by being truthy.
d_p_1 = full.groupby(['location', 'domain'], as_index=False).mean(numeric_only=True).dropna()

# Styling shared by both scatter variants.
salary_marker_style = dict(size=20, line=dict(width=2))
salary_transparent_bg = {
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)',
}

if min_max == "Minimum":
    # NOTE(review): the axis label says "in Million USD" while the other charts
    # say "USD" — confirm the unit of the salary columns.
    fig1 = px.scatter(d_p_1,
           x=d_p_1.index.to_list(),      # row position only spreads the points out
           y='min_salary',
           color='location',
           hover_data=['domain'],
           labels=dict(x="", min_salary='Minimum Salary (in Million USD)'),
           title="Distribution of Minimum Salary Ranges Overall (DC vs. US)")

    fig1.update_traces(marker=salary_marker_style)
    fig1.update_layout(salary_transparent_bg)

    fig1.show()

if min_max == "Maximum":
    fig2 = px.scatter(d_p_1,
           x=d_p_1.index.to_list(),
           y='max_salary',
           color='location',
           hover_data=['domain'],
           labels=dict(x="", max_salary='Maximum Salary (in Million USD)'),
           title="Distribution of Maximum Salary Ranges Overall (DC vs. US)")

    fig2.update_traces(marker=salary_marker_style)
    fig2.update_layout(salary_transparent_bg)

    # Previously this figure was built but never displayed.
    fig2.show()

st.write("barplot minimum salaries by domain dc vs. us")

# One facet per domain, comparing the average minimum salary of DC vs. US.
# The labels previously said "Maximum Salary" although min_salary is plotted.
fig4 = px.bar(d_p_1, x="location",
             y="min_salary",
             color="location",
             barmode="group",
             facet_col="domain",
             facet_col_wrap=3,
             width=600, height=800,
             labels=dict(x="", min_salary='Minimum Salary (USD)'),
             title="Average Minimum Salary by Domain (DC vs. US)")

# Blank every per-facet x-axis title.
fig4.update_xaxes(title_text='')

# Facet headers read "domain=<value>"; keep only the value.
fig4.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

# Blank every per-facet y-axis title and replace them with one shared,
# rotated label placed just left of the plotting area.
fig4.update_yaxes(title_text='')

fig4.add_annotation(x=-0.1, y=0.5,
                    text="Minimum Salary (USD)", textangle=-90,
                    xref="paper", yref="paper")

# Transparent plot and paper background.
fig4.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

fig4.show()

st.write("VIOLIN PLOT")

# Violin (with inner box and all raw points) of min_salary for each location,
# one facet per domain, five facets per row.
fig5 = px.violin(
    full,
    x="location",
    y="min_salary",
    color="location",
    box=True,
    points="all",
    log_y=False,
    facet_col='domain',
    facet_col_wrap=5,
    width=1000,
    height=700,
    title="Distribution of Salary by Domain (DC vs. US)",
)

# Let every facet use its own y-range.
fig5.update_yaxes(matches=None)

# Clear the title of every x-axis present in the layout.
fig5.for_each_xaxis(lambda x_axis: x_axis.title.update(text=''))

fig5.layout.xaxis2.title.text = ""

# Facet headers read "domain=<value>"; keep only what follows the last "=".
fig5.for_each_annotation(
    lambda note: note.update(text=note.text.rpartition("=")[2])
)

# Clear the title of every y-axis; a single shared label is added below.
fig5.for_each_yaxis(lambda y_axis: y_axis.title.update(text=''))

fig5.add_annotation(
    text="Minimum Salary (USD)",
    textangle=-90,
    x=-0.1,
    y=0.5,
    xref="paper",
    yref="paper",
)

# Transparent plot and paper background.
fig5.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

fig5.show()

st.write("job counts by top 5 cities")

# Non-null counts of every column per (city, domain); the 'location' count is
# used below as the number of job postings.
d_p_2 = full.groupby(['city', 'domain'], as_index=False).count().dropna()

# Keep only the five hard-coded cities (vectorised instead of a row-by-row loop).
top_cities = ['New York', "Washington", "Anywhere", "San Francisco", "Annapolis Junction"]
d_p_2 = d_p_2[d_p_2['city'].str.strip().isin(top_cities)].reset_index()

# The y value is a job count, so it is labelled as such (it was previously
# labelled "Maximum Salary (USD)").
fig6 = px.bar(d_p_2, x="city",
             y="location",
             color="city",
             barmode="group",
             facet_col="domain",
             facet_col_wrap=3,
             width=1000, height=1000,
             labels=dict(x="", location='Number of Jobs'),
             title="Number of Jobs in Top 5 Job Count Cities by Domain")

fig6.update_traces(width=1)
# Let every facet use its own y-range.
fig6.update_yaxes(matches=None)

# Blank every per-facet x-axis title.
fig6.update_xaxes(title_text='')

# Facet headers read "domain=<value>"; keep only the value.
fig6.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

# Blank every per-facet y-axis title and add one shared, rotated label.
fig6.update_yaxes(title_text='')

fig6.add_annotation(x=-.01, y=0.5,
                    text="Number of Jobs", textangle=-90,
                    xref="paper", yref="paper")

# Transparent plot and paper background.
fig6.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)',
})

fig6.show()

# Guided exploration: the user picks whether to drill down by location or domain.
st.header("ARE YOU MORE ABOUT THE LOCATION? OR DOMAIN?")
path = st.selectbox("SELECT", ["LOCATION","DOMAIN"])

if path == "LOCATION":
    # Per-state non-null counts of every column, and per-state numeric means.
    dp3 = full.groupby(['state'], as_index=False).count().dropna()
    dp4 = full.groupby(['state'], as_index=False).mean(numeric_only=True)
    # Columns present in both frames get "_x" (counts) / "_y" (means) suffixes.
    # 'location' is selected without a suffix, so it comes from the counts only
    # and serves as the number of postings per state.
    dp5 = dp3.merge(dp4, how='inner', on='state')
    dp5 = dp5[['state', 'min_salary_y', 'max_salary_y', 'location']].dropna()

    fig7 = px.scatter(dp5,
            x='max_salary_y',
            y='min_salary_y',
            color='location',
            hover_data=['state'],
            labels=dict(max_salary_y="Average Maximum Salary (USD)", min_salary_y="Average Minimum Salary (USD)", location="Number of Jobs"),
            title="Number of Jobs according to Min/Max Salary by State")

    # Marker size follows the number of jobs in each state.
    fig7.update_traces(marker=dict(size=dp5['location'],
                                   line=dict(width=2)))

    fig7.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)',
    })

    # Previously this figure was built but never displayed.
    fig7.show()

    options = dp5.state.unique().tolist()
    # Stored under its own name: re-using `path` here clobbered the
    # LOCATION/DOMAIN choice that is tested again further down.
    state_select_1 = st.selectbox("select state", options)

    # All postings whose city name occurs in the selected state.
    cities = full[full['state'] == state_select_1].city.unique().tolist()
    dp6 = full[full['city'].str.strip().isin(cities)].reset_index()

    dp7 = dp6.groupby(['city', 'domain'], as_index=False).mean(numeric_only=True).dropna()

    # Title now names the selected state (it always showed the first option),
    # and the labels say "Minimum" to match the plotted min_salary column.
    fig8 = px.bar(dp7, x="city",
                y="min_salary",
                color="city",
                barmode="group",
                facet_col="domain",
                facet_col_wrap=3,
                width=800, height=800,
                labels=dict(x="", min_salary='Minimum Salary (USD)'),
                title=f"Average Minimum Salary by Domain {state_select_1}")
    fig8.update_traces(width=1)
    fig8.update_yaxes(matches=None)

    # Blank per-facet axis titles; a single shared y label is added below.
    fig8.update_xaxes(title_text='')
    fig8.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
    fig8.update_yaxes(title_text='')

    fig8.add_annotation(x=-0.1, y=0.5,
                        text="Minimum Salary (USD)", textangle=-90,
                        xref="paper", yref="paper")

    fig8.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)',
    })

    fig8.show()

    available_domains = dp7.domain.unique().tolist()
    domain_select_1 = st.selectbox("select domain", available_domains)

    dp8 = dp6[dp6['domain'] == domain_select_1]
    # Highest minimum salary first: the ascending default returned the three
    # lowest-paid postings as the "top" recommendations.
    top_jobs_1 = dp8.sort_values(by="min_salary", ascending=False)[:3].reset_index(drop=True)

    # Report the user's actual selections; the city is taken from the
    # best-ranked posting when there is one.
    best_city_1 = top_jobs_1.loc[0, "city"] if top_jobs_1.shape[0] else cities[0]
    print("The Best Job Fit for you would be in", state_select_1, "specifically", best_city_1, "working in domain of", domain_select_1)
    print("Here are the Top 3 Job Recommendations for you and the requirements!")
    print('\n')
    for i in range(top_jobs_1.shape[0]):
        print(f"Position {i+1}")
        print("---------------------")
        print("Postion:", top_jobs_1.loc[i, "position"], "| Company:", top_jobs_1.loc[i, "company_name"])
        print("Minimum Salary:", top_jobs_1.loc[i, "min_salary"], "| Maximum Salary:", top_jobs_1.loc[i, "max_salary"])

        # One keyword bar chart per text field. Each chart now uses the keywords
        # of its own field; previously all three charts plotted the
        # responsibility keywords.
        keyword_sections_1 = (
            ("Responsibility Top5 Keywords", top_jobs_1.loc[i, "responsibility"]),
            ("Qualification Top5 Keywords", top_jobs_1.loc[i, "qualification"]),
            ("Description Top5 Keywords", top_jobs_1.loc[i, "description"]),
        )
        for heading, text in keyword_sections_1:
            # A missing field is not a string; chart nothing rather than crash.
            scored = custom_kw_extractor.extract_keywords(text) if isinstance(text, str) else []
            print(heading)
            # Each entry is indexed as [0] = keyword, [1] = score.
            kw_fig = go.Figure([go.Bar(
                x=[t[0] for t in scored],
                y=[t[1] for t in scored],
                text=[t[0] for t in scored])])
            kw_fig.update_xaxes(visible=False)
            kw_fig.show()
        print('\n')

if path == "DOMAIN":
    # Per-domain numeric means joined with the per-domain posting count
    # (non-null count of 'location').
    dp9 = full.groupby(['domain'], as_index=False).mean(numeric_only=True).dropna()
    dp10 = full.groupby(['domain'], as_index=False).count()
    dp10 = dp10[['domain', 'location']]
    dp11 = dp9.merge(dp10, how='inner', on='domain')

    # Marker size is mapped through `size=`: colouring by domain yields one
    # trace per domain, so assigning the whole count array to every trace made
    # each marker take the first domain's count.
    fig12 = px.scatter(dp11,
           x='max_salary',
           y='min_salary',
           color='domain',
           size='location',
           hover_data=['domain'],
           labels=dict(max_salary="Average Maximum Salary (USD)", min_salary="Average Minimum Salary (USD)", location="Number of Jobs"),
           title="Number of Jobs according to Min/Max Salary by Domain")

    fig12.update_traces(marker=dict(line=dict(width=1)))

    fig12.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)',
    })

    # Previously this figure was built but never displayed.
    fig12.show()

    possilbe_domains_2 = dp11.domain.unique().tolist()
    domain_2 = st.selectbox("SELECT Domain", possilbe_domains_2)

    # Per-state means and posting counts within the chosen domain.
    dp12 = full[full['domain'] == domain_2].reset_index().groupby(['state'], as_index=False).mean(numeric_only=True).dropna()
    dp13 = full[full['domain'] == domain_2].reset_index().groupby(['state'], as_index=False).count()
    dp13 = dp13[['state', 'location']]
    dp14 = dp12.merge(dp13, how='inner', on='state')

    fig13 = go.Figure([go.Bar(
    x=dp14['state'],
    y=dp14['min_salary'],
    text=dp14['state'],
    )])

    fig13.update_layout(
        title=f"Minimum Salary Average for Top States of Domain: {domain_2}",
        xaxis_title="State",
        yaxis_title="Average Minimum Salary (USD)",
    )
    # The x axis is hidden; the state name is shown as text on each bar.
    fig13.update_xaxes(visible=False)

    # Show the chart built above (this used to re-display fig5).
    fig13.show()

    possible_states = dp14.state.unique().tolist()

    state_2 = st.selectbox("SELECT STATE", possible_states)
    dp14 = full[full['domain'] == domain_2].reset_index()
    # Filter the domain subset by its own 'state' column. The old loop read
    # `full.loc[i, "state"]`, whose rows do not line up with this subset.
    dp15 = dp14[dp14['state'] == state_2].reset_index()

    # Per-city means and posting counts within the chosen domain and state.
    dp16 = dp15.groupby(['city'], as_index=False).mean(numeric_only=True).dropna()
    dp17 = dp15.groupby(['city'], as_index=False).count()
    dp17 = dp17[['city', 'location']]
    dp18 = dp16.merge(dp17, how='inner', on='city')

    if dp18.shape[0] == 0:
        print("No jobs found! Try other cities :) ")
    else:
        fig = px.scatter(dp18,
                x='max_salary',
                y='min_salary',
                color=dp18['location'],
                hover_data=['city'],
                labels=dict(max_salary="Average Maximum Salary (USD)", min_salary="Average Minimum Salary (USD)", location="Number of Jobs"),
                title=f"Number of Jobs according to Min/Max Salary in {state_2} Cities")

        fig.update_traces(marker=dict(size=dp18['location']*100,
                                      line=dict(width=1)))

        fig.update_layout({
        'plot_bgcolor': 'rgba(0, 0, 0, 0)',
        'paper_bgcolor': 'rgba(0, 0, 0, 0)',
        })
        fig.show()

        # City selection and recommendations only run when at least one city
        # exists; with no options there is nothing to select and the summary
        # below would fail.
        possible_cities_3 = dp18.city.unique().tolist()
        cities_3 = st.selectbox("SELECT", possible_cities_3)

        dp19 = full[full['domain'] == domain_2].reset_index()
        # Restrict to the selected state as well, so a same-named city in
        # another state is not mixed in.
        dp20 = dp19[(dp19['city'] == cities_3) & (dp19['state'] == state_2)]
        # Highest minimum salary first: the ascending default returned the
        # three lowest-paid postings as the "top" recommendations.
        top_jobs_3 = dp20.sort_values(by="min_salary", ascending=False)[:3].reset_index(drop=True)

        # domain_2 and cities_3 are the selected strings themselves; indexing
        # them with [0] printed only their first character.
        print("The Best Job Fit for you would be in the domain of", domain_2, "located in", cities_3, ",", state_2)
        print("Here are the Top 3 Job Recommendations for you and the requirements!")
        print('\n')
        for i in range(top_jobs_3.shape[0]):
            print(f"Position {i+1}")
            print("---------------------")
            print("Postion:", top_jobs_3.loc[i, "position"], "| Company:", top_jobs_3.loc[i, "company_name"])
            print("Minimum Salary:", top_jobs_3.loc[i, "min_salary"], "| Maximum Salary:", top_jobs_3.loc[i, "max_salary"])

            # One keyword bar chart per text field. Each chart now uses the
            # keywords of its own field; previously all three charts plotted
            # the responsibility keywords.
            keyword_sections_3 = (
                ("Responsibility Top5 Keywords", top_jobs_3.loc[i, "responsibility"]),
                ("Qualification Top5 Keywords", top_jobs_3.loc[i, "qualification"]),
                ("Description Top5 Keywords", top_jobs_3.loc[i, "description"]),
            )
            for heading, text in keyword_sections_3:
                # A missing field is not a string; chart nothing rather than crash.
                scored = custom_kw_extractor.extract_keywords(text) if isinstance(text, str) else []
                print(heading)
                # Each entry is indexed as [0] = keyword, [1] = score.
                kw_fig = go.Figure([go.Bar(
                    x=[t[0] for t in scored],
                    y=[t[1] for t in scored],
                    text=[t[0] for t in scored])])
                kw_fig.update_xaxes(visible=False)
                kw_fig.show()
            print('\n')