In [1]:
%matplotlib notebook

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from iso3166 import countries
import plotly.express as px
import plotly.graph_objects as go


In [None]:
# Load the raw job-postings dataset (the path is relative to the notebook's
# working directory) and preview the first rows.
job_postings = pd.read_csv('Resources/job_postings.csv')
job_postings.head()

In [None]:
# Check how many rows and columns were loaded
job_postings.shape

In [None]:
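# Drop fraudulent postings -- currently DISABLED, so fraudulent rows stay in the data.
# NOTE(review): later cells compare `fraudulent` against the integers 0 and 1, so the
# string '1' below would presumably match nothing; use isin([1]) if this is re-enabled.
#job_postings = job_postings[~job_postings['fraudulent'].isin(['1'])]
#job_postings.head()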

In [None]:
# Re-check the shape. The fraud filter in the previous cell is commented out,
# so no rows have been dropped at this point.
job_postings.shape

In [None]:
# Keep only the postings that have a location; the string split further down
# needs a non-null value in every row.
job_postings = job_postings.dropna(subset=['location'])

# Confirm how many rows remain once the null locations are gone
job_postings.shape


In [None]:
# Split "location" on spaces into at most three pieces (n=2 means two splits);
# with expand=True each piece becomes its own column (0, 1, 2) of a new frame.
df2 = job_postings["location"].str.split(" ", n=2, expand=True)
# head has to be *called*; a bare `df2.head` only echoes the bound method.
df2.head()

In [None]:
# Show the column labels of the split frame. Only the last expression of a cell
# is echoed, so the list is printed explicitly before the frame is displayed.
print(list(df2.columns))
df2

In [None]:
# Give the positional split columns descriptive names
df2 = df2.rename(columns={0: "Country", 1: "State", 2: "City"})

In [None]:
# Copy the three location parts from df2 into job_postings, one column at a time
job_postings['City'] = df2['City']
job_postings['State'] = df2['State']
job_postings['Country'] = df2['Country']
job_postings

In [None]:
# The raw location string is redundant now that its parts have their own columns
job_postings = job_postings.drop(columns='location')

In [None]:
# List the column labels now that location has been replaced by its parts
job_postings.columns


In [None]:
# US postings. Country still carries the trailing comma left by the space split
# ("US,") at this point, so both spellings are matched, as the state filter in
# the next cell does.
job_postings[job_postings.Country.isin(["US", "US,"])]


In [None]:
# Fraud flag, state and city for US postings in New York, California and Texas.
# The matched values keep the trailing comma left over from the location split.
state_fraud = job_postings.loc[
    job_postings.Country.isin(["US", "US,"])
    & job_postings.State.isin(["NY,", "CA,", "TX,"]),
    ["fraudulent", "State", "City"],
]

In [None]:
# Number of postings for each raw Country value
job_postings.Country.value_counts()

In [None]:
# Number of postings per (state, fraud flag) pair.
# size() counts rows; count() would count non-null City values instead and
# under-report any posting whose city is missing.
state_fraud_counts = (
    state_fraud.groupby(["State", "fraudulent"])
    .size()
    .reset_index(name="Count")
    .rename(columns={"fraudulent": "Fraudulent"})
)
state_fraud_counts

In [None]:
# Distinct state labels, plus the per-state counts split by the fraud flag
labels = state_fraud_counts.State.unique()
fraud_counts = state_fraud_counts.loc[state_fraud_counts.Fraudulent == 1, "Count"]
valid_counts = state_fraud_counts.loc[state_fraud_counts.Fraudulent == 0, "Count"]
print(labels)
print(fraud_counts)  # was `fraud_Counts`, an undefined name that raised NameError
print(valid_counts)

In [None]:
# Stacked bar chart: real postings at the bottom, fraudulent postings on top.
# Heights are taken positionally, because valid_counts and fraud_counts are
# slices of state_fraud_counts and therefore carry different row labels.
real_heights = np.asarray(valid_counts)
fraud_heights = np.asarray(fraud_counts)

# Tick labels are derived from the data (minus the trailing comma left by the
# location split) so they cannot drift out of step with the bar order.
state_names = [str(label).rstrip(',') for label in labels]

N = len(state_names)
ind = np.arange(N)    # the x locations for the groups
width = 0.35          # the width of the bars

p1 = plt.bar(ind, real_heights, width)
p2 = plt.bar(ind, fraud_heights, width, bottom=real_heights)

plt.ylabel('Jobs')
plt.title('Jobs by state')
plt.xticks(ind, state_names)
plt.yticks(np.arange(0, 2200, 200))
plt.legend((p1[0], p2[0]), ('Real', 'Fraudulent'))

plt.show()

In [None]:
# Normalise the column labels: underscores become spaces, then everything is upper-cased
job_postings.columns = job_postings.columns.str.replace('_', ' ').str.upper()
job_postings

In [None]:
# Strip the commas left over from the location split.
# There is deliberately no astype(str) afterwards: it would turn missing values
# into the literal string "nan" and hide them from the null count that follows.
job_postings['COUNTRY'] = job_postings['COUNTRY'].str.replace(',', '', regex=False)
job_postings['STATE'] = job_postings['STATE'].str.replace(',', '', regex=False)
job_postings

In [None]:
# Missing values per column
job_postings.isna().sum()

In [None]:
# Write the cleaned data to a CSV file in the notebook's working directory,
# leaving out the row index.
job_postings.to_csv('job_postings_clean.csv', index = False)

In [None]:
# Group the postings by country. Taking first() of the grouping column itself
# yields a Series whose index and values are both the distinct COUNTRY values.
country_groupby = job_postings.groupby("COUNTRY")
country_list = country_groupby["COUNTRY"].first()


In [None]:
# Map each country to the row labels of its postings, then count the rows per country
country_groups = country_groupby.groups
country_counts = [len(country_groups[country]) for country in country_list]
    

In [None]:
# One row per country with its number of postings
country_job_df = pd.DataFrame({"Country": country_list, "Job Count": country_counts})

In [None]:
# Inspect the per-country job counts
country_job_df

In [None]:
# Look each country value up once in the ISO 3166 table and keep its
# three-letter code and its full name. Named fields replace the positional
# [2] / [0] indexing, and the stray `countries.get('us')` call is gone.
# countries.get raises KeyError for a value it does not know.
iso_records = [countries.get(code) for code in country_job_df['Country']]
three_letters_list = [record.alpha3 for record in iso_records]
country_name = [record.name for record in iso_records]
    


In [None]:
# Attach the ISO lookups as two extra columns and show the result
country_job_df = country_job_df.assign(**{
    "three letter country": three_letters_list,
    "country name": country_name,
})
country_job_df

In [None]:

# World map shaded by the number of postings per country
fig = px.choropleth(
    data_frame=country_job_df,
    locations="three letter country",   # column holding the three-letter codes
    hover_name="country name",          # shown as the hover title
    color="Job Count",                  # value that drives the colour scale
    color_continuous_scale=px.colors.sequential.Aggrnyl,
)
fig.show()

In [None]:
# Re-inspect the split location frame before filtering it down to US rows
df2

In [None]:
# Rows of the split frame whose country part is the US ("US," still has its comma here)
us_df = df2.loc[df2["Country"].eq("US,")]
us_df

In [None]:
# Strip the commas left over from the location split.
# assign() builds a new frame instead of writing into a slice of df2 (which
# triggers SettingWithCopyWarning), and missing values stay NaN rather than
# being turned into the string "nan" by astype(str).
us_df = us_df.assign(
    Country=us_df['Country'].str.replace(',', '', regex=False),
    State=us_df['State'].str.replace(',', '', regex=False),
)
us_df

In [None]:
# Group the US rows by state. Taking first() of the grouping column itself
# gives a Series indexed by state whose values are the same state strings.
us_groupby = us_df.groupby("State")
us_list = us_groupby["State"].first()
us_list

In [None]:
# Row labels per state, then the number of rows (postings) in each state
state_groups = us_groupby.groups
state_counts = [len(state_groups[state]) for state in us_list]
state_counts
    

In [None]:
# One row per state with its number of postings
state_counts_df = pd.DataFrame({"States": us_list, "Job Count": state_counts})
# Show the frame that was just built. This cell used to re-create us_df from
# df2 here, which silently threw away the comma clean-up applied to us_df.
state_counts_df

In [None]:
# US map shaded by the number of postings per state
fig = go.Figure()
fig.add_trace(
    go.Choropleth(
        locationmode='USA-states',                      # how `locations` entries are interpreted
        locations=state_counts_df['States'],
        z=state_counts_df['Job Count'].astype(float),   # value that drives the colour
        colorscale='greens',
        colorbar_title="Job Count",
    )
)

# Title the figure and limit the map scope to the USA
fig.update_layout(title_text='Jobs Per State', geo_scope='usa')

fig.show()

In [None]:
# Job counts for the three states of interest.
# .iloc[0] takes the first matching row by position; a bare [0] on this
# string-indexed Series relies on pandas' deprecated positional fallback.
ca_count = state_counts_df.loc[state_counts_df['States'] == "CA", 'Job Count'].iloc[0]
tx_count = state_counts_df.loc[state_counts_df['States'] == "TX", 'Job Count'].iloc[0]
ny_count = state_counts_df.loc[state_counts_df['States'] == "NY", 'Job Count'].iloc[0]


In [None]:
# Map of only California, Texas and New York, coloured by their posting counts
fig = px.choropleth(
    locations=["CA", "TX", "NY"],
    color=[ca_count, tx_count, ny_count],
    locationmode="USA-states",
    scope="usa",
)
fig.show()