# Preparation

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import scipy.spatial
import warnings
import sklearn as sk

# Silence every Python warning for the rest of the session. This also hides
# any warnings pandas would emit in the cells below.
warnings.simplefilter("ignore")
# Enable inline mode for matplotlib so that Jupyter displays graphs
%matplotlib inline

# Show the installed pandas version as the cell output.
pd.__version__

'0.24.2'

## Read and combine files

In [2]:
# Load the 2017 H-1B file (path is relative to the working directory) and
# show its shape as (rows, columns).
h1b_17 = pd.read_csv('H-1B_2017.csv')
h1b_17.shape

(624650, 53)

In [3]:
# Drop the 'Unnamed: 0' column that came in with the CSV (presumably a row
# index saved by an earlier export -- confirm).
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column has already been removed.
h1b_17 = h1b_17.drop(columns='Unnamed: 0', errors='ignore')
h1b_17.head()

Unnamed: 0,CASE_NUMBER,CASE_STATUS,CASE_SUBMITTED,DECISION_DATE,VISA_CLASS,EMPLOYMENT_START_DATE,EMPLOYMENT_END_DATE,EMPLOYER_NAME,EMPLOYER_BUSINESS_DBA,EMPLOYER_ADDRESS,...,H1B_DEPENDENT,WILLFUL_VIOLATOR,SUPPORT_H1B,LABOR_CON_AGREE,PUBLIC_DISCLOSURE_LOCATION,WORKSITE_CITY,WORKSITE_COUNTY,WORKSITE_STATE,WORKSITE_POSTAL_CODE,ORIGINAL_CERT_DATE
0,I-200-16055-173457,CERTIFIED-WITHDRAWN,2016-02-24,2016-10-01,H-1B,2016-08-10,2019-08-10,DISCOVER PRODUCTS INC.,,2500 LAKE COOK ROAD,...,N,N,,Y,,RIVERWOODS,LAKE,IL,60015,2016-03-01
1,I-200-16064-557834,CERTIFIED-WITHDRAWN,2016-03-04,2016-10-01,H-1B,2016-08-16,2019-08-16,DFS SERVICES LLC,,2500 LAKE COOK ROAD,...,N,N,,Y,,RIVERWOODS,LAKE,IL,60015,2016-03-08
2,I-200-16063-996093,CERTIFIED-WITHDRAWN,2016-03-10,2016-10-01,H-1B,2016-09-09,2019-09-09,EASTBANC TECHNOLOGIES LLC,,1211 31ST ST. NW,...,Y,N,Y,,,WASHINGTON,,DC,20007,2016-03-16
3,I-200-16272-196340,WITHDRAWN,2016-09-28,2016-10-01,H-1B,2017-01-26,2020-01-25,INFO SERVICES LLC,,17177 NORTH LAUREL PARK DR,...,Y,N,Y,,,JERSEY CITY,HUDSON,NJ,7302,
4,I-200-15053-636744,CERTIFIED-WITHDRAWN,2015-02-22,2016-10-02,H-1B,2015-03-01,2018-03-01,BB&T CORPORATION,,223 WEST NASH STREET,...,N,N,,Y,,NEW YORK,NEW YORK,NY,10036,2015-02-26


In [4]:
# Load the 2018 H-1B file (path is relative to the working directory) and
# show its shape as (rows, columns).
h1b_18 = pd.read_csv('H-1B_2018.csv')
h1b_18.shape

(654360, 52)

In [11]:
# Rename the 2018 column NEW_CONCURRENT_EMP to NEW_CONCURRENT_EMPLOYMENT
# (presumably the name used in the 2017 file, so that the columns line up when
# the two frames are concatenated -- confirm against h1b_17.columns).
# inplace=True modifies h1b_18 directly instead of returning a new frame.
h1b_18.rename(columns={'NEW_CONCURRENT_EMP': 'NEW_CONCURRENT_EMPLOYMENT'}, inplace=True)
h1b_18.head()

Unnamed: 0,CASE_NUMBER,CASE_STATUS,CASE_SUBMITTED,DECISION_DATE,VISA_CLASS,EMPLOYMENT_START_DATE,EMPLOYMENT_END_DATE,EMPLOYER_NAME,EMPLOYER_BUSINESS_DBA,EMPLOYER_ADDRESS,...,H1B_DEPENDENT,WILLFUL_VIOLATOR,SUPPORT_H1B,LABOR_CON_AGREE,PUBLIC_DISCLOSURE_LOCATION,WORKSITE_CITY,WORKSITE_COUNTY,WORKSITE_STATE,WORKSITE_POSTAL_CODE,ORIGINAL_CERT_DATE
0,I-200-18026-338377,CERTIFIED,1/29/18,2/2/18,H-1B,7/28/18,7/27/21,MICROSOFT CORPORATION,,1 MICROSOFT WAY,...,N,N,,,,REDMOND,KING,WA,98052,
1,I-200-17296-353451,CERTIFIED,10/23/17,10/27/17,H-1B,11/6/17,11/6/20,ERNST & YOUNG U.S. LLP,,200 PLAZA DRIVE,...,N,N,,,,SANTA CLARA,SAN JOSE,CA,95110,
2,I-200-18242-524477,CERTIFIED,8/30/18,9/6/18,H-1B,9/10/18,9/9/21,LOGIXHUB LLC,,320 DECKER DRIVE,...,N,N,,,,IRVING,DALLAS,TX,75062,
3,I-200-18070-575236,CERTIFIED,,3/30/18,H-1B,9/10/18,9/9/21,"HEXAWARE TECHNOLOGIES, INC.",,101 WOOD AVENUE SOUTH,...,Y,N,Y,,,NEW CASTLE,NEW CASTLE,DE,19720,
4,I-200-18243-850522,CERTIFIED,8/31/18,9/7/18,H-1B,9/7/18,9/6/21,"ECLOUD LABS,INC.",,120 S WOOD AVENUE,...,Y,N,Y,Y,,BIRMINGHAM,SHELBY,AL,35244,


### Convert the h1b_18 wage columns from strings to floats

In [12]:
import decimal

def str_to_float(field, raw_data):
    """Convert one column of ``raw_data`` to floats, in place.

    Values that are already floats (including NaN for missing entries) are
    kept unchanged. Strings such as ``"65,000.00"`` have their commas removed
    and are parsed through ``decimal.Decimal``. Any other value (e.g. an int)
    is passed to ``float``.

    Parameters
    ----------
    field : str
        Name of the column to convert.
    raw_data : pandas.DataFrame
        Frame holding the column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``raw_data`` object, so ``df = str_to_float(col, df)`` works.

    Raises
    ------
    decimal.InvalidOperation
        If a string value is not a valid number once commas are removed.
    """
    def _to_float(value):
        # Floats (and NaN) pass through untouched.
        if isinstance(value, float):
            return value
        if isinstance(value, str):
            return float(decimal.Decimal(value.replace(",", "")))
        return float(value)

    # Read from the frame that was passed in: the previous version always
    # looped over the global h1b_18 regardless of `raw_data`. Walking only the
    # requested column also avoids building a Series per row with iterrows().
    raw_data[field] = [_to_float(value) for value in raw_data[field]]
    return raw_data

In [14]:
# Run the string-to-float conversion over each wage column of the 2018 frame,
# in the same order as before.
for wage_column in ('PREVAILING_WAGE', 'WAGE_RATE_OF_PAY_FROM', 'WAGE_RATE_OF_PAY_TO'):
    h1b_18 = str_to_float(wage_column, h1b_18)

### Combine the 2017 and 2018 files

In [17]:
# Stack the 2017 and 2018 frames row-wise. sort=False leaves the column order
# unsorted. ignore_index=True gives the result a fresh 0..n-1 index; without
# it the row labels of the two files repeat, so a label lookup such as
# raw_data.loc[0] would return one row from each year.
# NOTE(review): the previews above show ISO dates (2016-02-24) in the 2017
# file and m/d/yy dates (1/29/18) in the 2018 file; no date parsing is done in
# the cells shown here, so the date columns are combined as-is.
frames = [h1b_17, h1b_18]
raw_data = pd.concat(frames, axis=0, sort=False, ignore_index=True)
print(raw_data.shape)
raw_data.head()

(1279010, 52)


Unnamed: 0,CASE_NUMBER,CASE_STATUS,CASE_SUBMITTED,DECISION_DATE,VISA_CLASS,EMPLOYMENT_START_DATE,EMPLOYMENT_END_DATE,EMPLOYER_NAME,EMPLOYER_BUSINESS_DBA,EMPLOYER_ADDRESS,...,H1B_DEPENDENT,WILLFUL_VIOLATOR,SUPPORT_H1B,LABOR_CON_AGREE,PUBLIC_DISCLOSURE_LOCATION,WORKSITE_CITY,WORKSITE_COUNTY,WORKSITE_STATE,WORKSITE_POSTAL_CODE,ORIGINAL_CERT_DATE
0,I-200-16055-173457,CERTIFIED-WITHDRAWN,2016-02-24,2016-10-01,H-1B,2016-08-10,2019-08-10,DISCOVER PRODUCTS INC.,,2500 LAKE COOK ROAD,...,N,N,,Y,,RIVERWOODS,LAKE,IL,60015,2016-03-01
1,I-200-16064-557834,CERTIFIED-WITHDRAWN,2016-03-04,2016-10-01,H-1B,2016-08-16,2019-08-16,DFS SERVICES LLC,,2500 LAKE COOK ROAD,...,N,N,,Y,,RIVERWOODS,LAKE,IL,60015,2016-03-08
2,I-200-16063-996093,CERTIFIED-WITHDRAWN,2016-03-10,2016-10-01,H-1B,2016-09-09,2019-09-09,EASTBANC TECHNOLOGIES LLC,,1211 31ST ST. NW,...,Y,N,Y,,,WASHINGTON,,DC,20007,2016-03-16
3,I-200-16272-196340,WITHDRAWN,2016-09-28,2016-10-01,H-1B,2017-01-26,2020-01-25,INFO SERVICES LLC,,17177 NORTH LAUREL PARK DR,...,Y,N,Y,,,JERSEY CITY,HUDSON,NJ,7302,
4,I-200-15053-636744,CERTIFIED-WITHDRAWN,2015-02-22,2016-10-02,H-1B,2015-03-01,2018-03-01,BB&T CORPORATION,,223 WEST NASH STREET,...,N,N,,Y,,NEW YORK,NEW YORK,NY,10036,2015-02-26


In [18]:
# Write the combined frame to h1b.csv (relative to the working directory);
# index=False leaves the row index out of the file.
raw_data.to_csv('h1b.csv', index=False)