 ## THIRUMALA LAXMI TANUJA


In [495]:
# Download the data locally

import os
from urllib.request import urlretrieve
import gzip

# download the data if we don't have it locally
url = "http://faculty.cs.niu.edu/~dakoop/cs640-2023sp/a1/aam-salary-survey.csv.gz"
local_fname = "aam-salary-survey.csv.gz"
if not os.path.exists(local_fname):
    urlretrieve(url, local_fname)

In [496]:
local_fname

'aam-salary-survey.csv.gz'

## 1. Reformatting the Salary column

In [497]:
import pandas as pd
df = pd.read_csv("aam-salary-survey.csv.gz")
df['Salary'] = df['Salary'].str.replace(',', '').astype(float)

## 2. Updating the Range Columns

In [498]:
# df.iloc[:3]

In [499]:
# updating age column
# Split the Age column into left and right endpoints
df['Age'] = df['Age'].replace({'under ': '0-', 'less than': '0-', 'or over': '- 999'}, regex=True)
endpoints = df['Age'].str.split('-', expand = True)
# Convert the left and right endpoints to numeric data types
left_endpoints = endpoints[0].astype(int)
right_endpoints = endpoints[1].astype(int)
right_endpoints = right_endpoints + 1
# Create an IntervalArray from the left and right endpoints
interval_array = pd.arrays.IntervalArray.from_arrays(left_endpoints, right_endpoints, closed='left')
df = df.assign (Age = interval_array)

In [500]:
df.Age.array.overlaps(pd.Interval(18,45, closed='left'))

array([ True,  True,  True, ...,  True,  True,  True])

In [501]:
# updating ExpOverall and ExpInField
def create_range_intervale(df, col):
    df[col] = df[col].replace({'1 year or less': '0-1', 'years or more' : '-999'}, regex=True)
    endpoints = df[col].str.split('-', expand = True)
    # Convert the left and right endpoints to numeric data types
    left_endpoints = endpoints[0].astype(int)
    right_endpoints = endpoints[1].str.extract('(\d+)', expand = False).astype(int)
    right_endpoints = right_endpoints + 1

    # Create an IntervalArray from the left and right endpoints
    interval_array = pd.arrays.IntervalArray.from_arrays(left_endpoints, right_endpoints, closed='left')
    df[col] = interval_array
    return df
      

In [502]:
df = create_range_intervale(df, 'ExpOverall')
df = create_range_intervale(df, 'ExpInField')

In [503]:
df.ExpOverall.array.overlaps(pd.Interval(18,45, closed='left'))

array([False, False, False, ...,  True, False,  True])

In [504]:
df.ExpInField.array.overlaps(pd.Interval(18,45, closed='left'))

array([False, False, False, ...,  True, False, False])

## 3. Combine and Clean the Currency Columns

In [505]:
import pandas as pd
import numpy as np
excel_df = pd.read_excel('list-one.xls')
# excel_df

In [506]:
other_currency = df['CurrencyOther']
pattern = r'\b([A-Z]{3})\b'
other_currency = other_currency.str.extract(pattern, expand = True)
other_currency = other_currency.rename(columns = {0 : 'curr'})
other_currency['is_in_codes'] = other_currency['curr'].isin(excel_df['Unnamed: 2'])
other_currency.loc[other_currency['is_in_codes'] == False, 'curr'] = np.nan
# other_currency

In [507]:
df['Currency'] = df['Currency'].where(df['Currency'] != 'Other', other_currency['curr'])
df = df.drop('CurrencyOther', axis = 1)
# df[df['Currency'] == "Other"]

## 4. Salary Cleaning

In [508]:
# df[(df['Currency'] == 'USD') & (df['Salary'] > 2 ) & (df['Salary'] < 500)]


In [509]:
# """The reason why the salaries are very low is people might gave hourly income or weekly or monthly rather than yearly.
# we can check the avg income based on the job role and designation and if the salary is very low, then it might me an error.
# for this assignment, I am assuming that people might enter their pay rate in different freq and I am conevrting them to anual income.
# """

In [510]:
# df['Salary'].mean()

In [511]:
# df.loc[(df['Currency'] == 'USD') & (df['Salary'] != 0 ), 'Salary'] .mean()

In [512]:
# replacing the column values with mean if 0
df.loc[(df['Currency'] == 'USD') & (df['Salary'] == 0 ), 'Salary'] = 96478.81265583355
# assuming minimun hourly wage is 7, so replcaing values less than 7 with 7 as hourly income
df.loc[(df['Currency'] == 'USD') & (df['Salary'] <= 7 ), 'Salary'] *= (2080 * 7)
# Assuming below ranges and converting them to annual income.
df.loc[(df['Currency'] == 'USD') & (df['Salary'] > 80 ) & (df['Salary'] <= 160), 'Salary'] *= 2080
df.loc[(df['Currency'] == 'USD') & (df['Salary'] > 8 ) & (df['Salary'] <= 80), 'Salary'] *= 52
df.loc[(df['Currency'] == 'USD') & (df['Salary'] > 160 ) & (df['Salary'] <= 400), 'Salary'] *= 24
df.loc[(df['Currency'] == 'USD') & (df['Salary'] > 400 ) & (df['Salary'] <= 500), 'Salary'] *= 12


In [514]:
df

Unnamed: 0,Timestamp,Age,Industry,JobTitle,JobDetails,Salary,ExtraComp,Currency,IncomeDetails,Country,State,City,ExpOverall,ExpInField,Education,Gender,Race
0,4/27/2021 11:02:10,"[25, 35)",Education (Higher Education),Research and Instruction Librarian,,55000.000000,0.0,USD,,United States,Massachusetts,Boston,"[5, 8)","[5, 8)",Master's degree,Woman,White
1,4/27/2021 11:02:22,"[25, 35)",Computing or Tech,Change & Internal Communications Manager,,54600.000000,4000.0,GBP,,United Kingdom,,Cambridge,"[8, 11)","[5, 8)",College degree,Non-binary,White
2,4/27/2021 11:02:38,"[25, 35)","Accounting, Banking & Finance",Marketing Specialist,,34000.000000,,USD,,US,Tennessee,Chattanooga,"[2, 5)","[2, 5)",College degree,Woman,White
3,4/27/2021 11:02:41,"[25, 35)",Nonprofits,Program Manager,,62000.000000,3000.0,USD,,USA,Wisconsin,Milwaukee,"[8, 11)","[5, 8)",College degree,Woman,White
4,4/27/2021 11:02:42,"[25, 35)","Accounting, Banking & Finance",Accounting Manager,,60000.000000,7000.0,USD,,US,South Carolina,Greenville,"[8, 11)","[5, 8)",College degree,Woman,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27920,12/22/2022 17:43:19,"[25, 35)",Entertainment,Analyst,Analyst in A&R Admin Department at record label,60000.000000,,USD,,United States,New York,NYC,"[2, 5)","[2, 5)",College degree,Woman,Black or African American
27921,12/29/2022 7:54:18,"[55, 65)",Health care,Physical therapist assistant,Home health,62400.000000,,USD,,Usa,Colorado,Denver,"[31, 41)","[21, 31)",Some college,Woman,White
27922,1/4/2023 9:11:41,"[35, 45)",Retail,Marketing Automation Developer,,68000.000000,,USD,,United States,Indiana,Fort Wayne,"[11, 21)","[11, 21)",College degree,Man,White
27923,1/8/2023 12:16:15,"[18, 25)",Computing or Tech,Computer Scientist,,10000.000000,,TRY,,Turkey,,Istanbul,"[2, 5)","[2, 5)",,Man,"Middle Eastern or Northern African, White"


## 5. Salary by State

In [515]:
top_five_industries = df['Industry'].value_counts().head().index.tolist()
top_five_industries

['Computing or Tech',
 'Education (Higher Education)',
 'Nonprofits',
 'Health care',
 'Government and Public Administration']

In [516]:
filtered_df = df[df['Industry'].isin(top_five_industries)]
# filtered_df

In [517]:
unnested_df = filtered_df.assign(State = filtered_df['State'].str.split(',')).explode('State')
unnested_df['State'] = unnested_df['State'].str.strip()
# unnested_df

In [1]:
grouped = unnested_df.groupby(['State', 'Industry'])['Salary'].mean().reset_index()
# grouped

NameError: name 'unnested_df' is not defined

In [519]:
pivoted = grouped.pivot(index = 'State', columns = 'Industry', values = 'Salary')
pivoted

Industry,Computing or Tech,Education (Higher Education),Government and Public Administration,Health care,Nonprofits
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,100240.0,59901.45,101750.0,82167.2,58632.666667
Alaska,112500.0,71767.875,67080.0,91013.333333,59720.0
Arizona,104242.105263,62746.102564,67860.357143,72963.242424,50446.666667
Arkansas,81682.3,44726.666667,64578.142857,129482.857143,39970.0
California,155275.162822,80739.906832,96808.638462,114633.264901,81215.100478
Colorado,127571.707483,63277.727273,71830.05,90076.857143,68196.382979
Connecticut,128375.0,77238.733333,78730.0,155199.24,66763.666667
Delaware,116600.0,68036.25,65681.5,115025.0,54533.333333
District of Columbia,132327.929825,73573.842105,117827.828571,211568.823529,89242.438127
Florida,113047.363636,73820.363636,61968.566667,85137.166667,63111.678571
