In [None]:
import pandas as pd
import matplotlib.pyplot as plt
#pd.read_excel already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
salaries_df = pd.read_excel("salary_data_states_corrected.xlsx")

#Let's see what kind of information this dataframe holds:

salaries_df.info()

In [None]:
#The "JOB_TITLE_SUBGROUP" column looks like the one we can use to tell whether a job is data related,
#so let's count how many rows fall under each of its values.

salaries_df.loc[:, "JOB_TITLE_SUBGROUP"].value_counts()

In [None]:
#Indeed, there are two subgroups called "data analyst" and "data scientist" that we can use.
#Build a separate dataframe holding only those data jobs and, in the same chain, drop the rows
#that are missing the state or the paid wage, since they can't be used in our calculations.
#
#The dropna step is not strictly needed for this file (there are no NaN values in these two columns),
#but it is nice to keep in case this code is reused on different data with the same structure.

data_salaries_df = (
    salaries_df
    .loc[lambda frame: frame["JOB_TITLE_SUBGROUP"].isin(["data analyst", "data scientist"])]
    .dropna(subset=["WORK_STATE", "PAID_WAGE_PER_YEAR"])
)

In [None]:
#Let's average the numeric columns per state and sort the states by average yearly paid wage,
#highest first.
#numeric_only=True is required here: the frame still holds text columns (e.g. JOB_TITLE_SUBGROUP),
#and pandas >= 2.0 raises a TypeError when asked to average them instead of silently skipping them.

sorted_sal_df = (
    data_salaries_df
    .groupby("WORK_STATE")
    .mean(numeric_only=True)
    .sort_values(by="PAID_WAGE_PER_YEAR", ascending=False)
    .reset_index()  #WORK_STATE has to be a regular column so it can be used as the plot's x axis later
)
sorted_sal_df

In [None]:
#As expected, California pays the most when it comes to data related jobs. However, it is surprising to note that
#Utah is also close to the top. Could this be related to the massive NSA data center located around Bluffdale? Probably.

#Let's use this information to create a plot so that we can visually see the difference.
#Only the top 7 states are plotted to keep the chart comprehensible; this can be changed through the variable below.

state_count = 7
ax = sorted_sal_df.head(state_count).plot(x="WORK_STATE", y="PAID_WAGE_PER_YEAR", kind="bar")

#Give the figure a title and axis labels so that it can be understood on its own when skimming the notebook.
ax.set_title("Top {} states by average yearly paid wage (data jobs)".format(state_count))
ax.set_xlabel("Work state")
ax.set_ylabel("Average paid wage per year")
plt.show()

In [None]:
#Now, let's explore the differences between the two job subtypes, namely "data analyst" and "data scientist"