# Calculate Salary Change - Team Gannett Peak

### Team Members: Congda Xu, Binqi Shen,  Matthew Ko, Isaac Choi

In [1]:
# load packages
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
# load main data
master = pd.read_csv(r'gwtable.csv')

In [3]:
master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16338178 entries, 0 to 16338177
Data columns (total 26 columns):
 #   Column            Dtype  
---  ------            -----  
 0   job_id            int64  
 1   vertical          float64
 2   company           object 
 3   post_date         object 
 4   salary            float64
 5   location          object 
 6   city              object 
 7   state             object 
 8   state_long        object 
 9   zip               object 
 10  county            object 
 11  region_state      object 
 12  latitude          float64
 13  longitude         float64
 14  company_ref       object 
 15  company_parent    object 
 16  sic_primary       object 
 17  naics_primary     float64
 18  ticker            object 
 19  scrape_timestamp  object 
 20  modify_timestamp  object 
 21  meta_num_roles    float64
 22  meta_num_tags     float64
 23  meta_num_titles   float64
 24  salary_modeled    float64
 25  role_primary      float64
dtypes: float64(1

In [4]:
# only take three columns interested, sort by post date ascending
salary = master[['post_date', 'salary', 'ticker']].sort_values('post_date')

In [5]:
salary

Unnamed: 0,post_date,salary,ticker
8325295,2016-04-25 20:56:41,107500.0,BSX
7106378,2016-04-26 20:55:17,112500.0,ECL
7731850,2016-04-26 20:55:17,112500.0,MDT
6830582,2016-04-26 20:55:17,81000.0,TRI
9221846,2016-04-26 20:55:17,36500.0,ELMD
...,...,...,...
4600902,,,KBR
4600929,,,UPS
4600972,,,JCP
4606316,,,JCP


In [6]:
# filter for days before 2020-01-31
salary = salary[salary.post_date < '2020-02-01']
salary['post_date']= pd.to_datetime(salary['post_date'])

In [7]:
salary.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14288604 entries, 8325295 to 3406831
Data columns (total 3 columns):
 #   Column     Dtype         
---  ------     -----         
 0   post_date  datetime64[ns]
 1   salary     float64       
 2   ticker     object        
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 436.1+ MB


In [8]:
salary

Unnamed: 0,post_date,salary,ticker
8325295,2016-04-25 20:56:41,107500.0,BSX
7106378,2016-04-26 20:55:17,112500.0,ECL
7731850,2016-04-26 20:55:17,112500.0,MDT
6830582,2016-04-26 20:55:17,81000.0,TRI
9221846,2016-04-26 20:55:17,36500.0,ELMD
...,...,...,...
3406825,2020-01-31 23:52:00,87500.0,KGHI
3406827,2020-01-31 23:53:00,133000.0,BIOC
3406830,2020-01-31 23:53:00,42500.0,NTRA
3406828,2020-01-31 23:53:00,27875.0,DPSGY


In [9]:
# change date interval
salary.post_date = salary.post_date.astype('datetime64[s]').dt.to_period('M')

In [11]:
# take average salary per ticker per month
salary = salary.groupby(['ticker', 'post_date']).mean()
salary = salary.reset_index()

In [12]:
salary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89201 entries, 0 to 89200
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype    
---  ------     --------------  -----    
 0   ticker     89201 non-null  object   
 1   post_date  89201 non-null  period[M]
 2   salary     86840 non-null  float64  
dtypes: float64(1), object(1), period[M](1)
memory usage: 2.0+ MB


In [13]:
# add one column to store previous month mean salary
salary['prev_salary'] = salary.groupby('ticker')['salary'].shift(1)

# add one column to store salary change in percentage
salary['salary_change'] = 100 * (salary.salary - salary.prev_salary)/ salary.prev_salary

In [14]:
# test
salary

Unnamed: 0,ticker,post_date,salary,prev_salary,salary_change
0,6098,2018-02,82100.800000,,
1,6098,2018-03,96600.500000,82100.800000,17.660851
2,6098,2018-04,107001.000000,96600.500000,10.766507
3,6098,2018-06,135500.666667,107001.000000,26.634954
4,6098,2018-07,104500.000000,135500.666667,-22.878608
...,...,...,...,...,...
89196,ZVLO,2017-03,48000.000000,36500.000000,31.506849
89197,ZVLO,2017-11,99001.000000,48000.000000,106.252083
89198,ZVLO,2018-01,99200.400000,99001.000000,0.201412
89199,ZVLO,2018-03,77500.000000,99200.400000,-21.875315


In [17]:
salary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89201 entries, 0 to 89200
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype    
---  ------         --------------  -----    
 0   ticker         89201 non-null  object   
 1   post_date      89201 non-null  period[M]
 2   salary         86840 non-null  float64  
 3   prev_salary    83693 non-null  float64  
 4   salary_change  82185 non-null  float64  
dtypes: float64(3), object(1), period[M](1)
memory usage: 3.4+ MB


In [15]:
# TEST: make sure the first record of prev_salary for each company is NA
salary[salary.ticker == 'ZVLO']

Unnamed: 0,ticker,post_date,salary,prev_salary,salary_change
89195,ZVLO,2017-02,36500.0,,
89196,ZVLO,2017-03,48000.0,36500.0,31.506849
89197,ZVLO,2017-11,99001.0,48000.0,106.252083
89198,ZVLO,2018-01,99200.4,99001.0,0.201412
89199,ZVLO,2018-03,77500.0,99200.4,-21.875315
89200,ZVLO,2018-05,117167.0,77500.0,51.183226


In [16]:
# export to csv
salary.to_csv(r'CleanSalary.csv', index = False, date_format='%Y-%m-%d')