In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

Load cleaned up version of dataset from 2012 to 2018.  Drop geographic information about employer and worksite to reduce dimensionality

In [2]:
df = pd.read_pickle('/Users/minse_chang/PycharmProjects/Udacity_ML_Capstone/data/H1B_15-18_new.pickle')
df.drop(columns=['EMPLOYER_CITY','JOB_TITLE','EMPLOYER_NAME'], inplace=True)
df.drop(columns=['EMPLOYER_STATE','WORKSITE_STATE'], inplace=True)


In [3]:
ones_df = df[df['CASE_STATUS']=='DENIED']
zeros_df = df[df['CASE_STATUS']=='CERTIFIED']

In [4]:
# df = pd.concat([ones_df, zeros_df.sample(frac=0.1, random_state=99)])
df = pd.concat([ones_df, zeros_df.sample(frac=0.02, random_state=99)])


In [5]:
ones_df.head()
print(ones_df.shape)

(37310, 11)


In [6]:
zeros_df.head()
print(zeros_df.shape)

(2424938, 11)


We will use column 'CASE_STATUS' as our label.  DENIED will be 1 and CERTIFIED will be 0 after it gets incoded by label_encoder.

Normalize PREVAILING_WAGE using minmax scaler since it varies quite a bit.

In [7]:
scaler = MinMaxScaler()
X = df.drop(columns='CASE_STATUS')

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['CASE_STATUS'])

X[['PREVAILING_WAGE']] = scaler.fit_transform(X[['PREVAILING_WAGE']])
X = pd.get_dummies(X)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

Check value counts to verify stratify split

In [9]:
# print(y_train.value_counts())
# print(y_test.value_counts())
X_train.head()

Unnamed: 0,PREVAILING_WAGE,WAGE_LOWER_THAN_PW,FULL_TIME_POSITION_N,FULL_TIME_POSITION_Y,H1B_DEPENDENT_N,H1B_DEPENDENT_Y,"NAICS_CODE_11 MM ACRES ENERGY, LLC",NAICS_CODE_11111,NAICS_CODE_111110,NAICS_CODE_11113,...,VISA_CLASS_H-1B,VISA_CLASS_H-1B1 Chile,VISA_CLASS_H-1B1 Singapore,WAGE_UNIT_OF_PAY_Bi-Weekly,WAGE_UNIT_OF_PAY_Hour,WAGE_UNIT_OF_PAY_Month,WAGE_UNIT_OF_PAY_Week,WAGE_UNIT_OF_PAY_Year,WILLFUL_VIOLATOR_N,WILLFUL_VIOLATOR_Y
450514,0.000156,False,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
27154,6.5e-05,False,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
57143,8.2e-05,True,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
261176,0.000133,False,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
218389,4.6e-05,False,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0


Train RandomForestClassifier and predict using X_test

In [10]:
from sklearn.ensemble import RandomForestClassifier


# from sklearn.feature_extraction.text import CountVectorizer

rfc = RandomForestClassifier(n_estimators=290, max_features=18, max_depth=80)
# vec = CountVectorizer()
# X_train_t = vec.fit_transform(X_train)

rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)

Print out classifer stats

In [11]:
from sklearn.metrics import classification_report, confusion_matrix
print("** Random Forest Classifier Stats (OVERALL testset)")
print(classification_report(y_test, rfc_pred))

** Random Forest Classifier Stats (OVERALL testset)
             precision    recall  f1-score   support

          0       0.74      0.91      0.82      9700
          1       0.83      0.59      0.69      7462

avg / total       0.78      0.77      0.76     17162



In [12]:
import operator
# print(len(rfc.feature_importances_))
# print(len(X_train.columns))

nonzero_features = {}
for i in range(len(rfc.feature_importances_)):
    if rfc.feature_importances_[i] > 0.0:
        nonzero_features[list(X_train.columns)[i]] = rfc.feature_importances_[i]
sorted_result = sorted(nonzero_features.items(), key=operator.itemgetter(1)) 
for i in range(len(sorted_result)):
    print(sorted_result[-(i+1)])

(u'PREVAILING_WAGE', 0.18293026445786104)
('WAGE_LOWER_THAN_PW', 0.13345569838477908)
(u'FULL_TIME_POSITION_Y', 0.10954694495141858)
(u'H1B_DEPENDENT_N', 0.05894220696409722)
(u'H1B_DEPENDENT_Y', 0.05382435880624421)
(u'VISA_CLASS_H-1B', 0.032452745474882416)
('NAICS_CODE_541511', 0.03131912082344506)
(u'VISA_CLASS_E-3 Australian', 0.030757944085887154)
(u'WAGE_UNIT_OF_PAY_Year', 0.02543273555975271)
(u'PW_SOURCE_Other', 0.02237529941055016)
(u'PW_SOURCE_OES', 0.022290768334240322)
(u'WAGE_UNIT_OF_PAY_Hour', 0.019510004469320646)
(u'WILLFUL_VIOLATOR_N', 0.01884607994961627)
(u'FULL_TIME_POSITION_N', 0.00983542437845741)
('SOC_CODE_15-1121', 0.00956304822171267)
('SOC_CODE_15-1132', 0.009293945920107013)
('SOC_CODE_15-1199', 0.006894301902283089)
('SOC_CODE_15-1131', 0.006326105150225144)
(u'WAGE_UNIT_OF_PAY_Month', 0.0034633504294871555)
(u'NAICS_CODE_CAPGEMINI AMERICA INC', 0.003153465838678722)
('SOC_CODE_29-1069', 0.0028178256960007557)
('SOC_CODE_11-1021', 0.0027027697878396224)
('

(u'NAICS_CODE_HANON SYSTEMS ALABAMA CORPORATION', 2.6688598838106976e-06)
(u'NAICS_CODE_MAXX SPORTS TECHNOLOGIES LTD.', 2.660810472821524e-06)
(u'NAICS_CODE_UNIVERSITY OF NORTH CAROLINA AT ASHEVILLE', 2.657284167152158e-06)
(u'NAICS_CODE_NO. 1 INSURANCE BROKERAGE, INC.', 2.6569089833669165e-06)
('NAICS_CODE_519000', 2.6548926094067693e-06)
(u'NAICS_CODE_PINECREST ACADEMY OF NEVADA', 2.6494016836905265e-06)
(u'NAICS_CODE_ILLUMINATION TECHNOLOGY, INC', 2.646144777858531e-06)
('NAICS_CODE_721310', 2.6423990816651414e-06)
('NAICS_CODE_221112', 2.634033171784171e-06)
(u'NAICS_CODE_GNU LEGACY L.L.C.', 2.6317358945022453e-06)
(u'NAICS_CODE_CRYPTONOMOS', 2.628910332084237e-06)
(u'NAICS_CODE_CONTRA COSTA COUNTY', 2.628491265876403e-06)
(u'NAICS_CODE_URBAN JUSTICE CENTER', 2.6275952568710567e-06)
(u'NAICS_CODE_RICK MAR LLC', 2.626934771910995e-06)
(u'NAICS_CODE_BELZONA INC.', 2.618292385092815e-06)
('NAICS_CODE_624210', 2.6151185552824493e-06)
('SOC_CODE_37-3013', 2.613146764993922e-06)
(u'NAICS

Output the feature importance vector and sort by importance

Evaluate stats for the entire test set

In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score, accuracy_score, confusion_matrix

print("** Random Forest Classifier Scores (OVERALL testset)")
print("Precision: %s"% precision_score(y_test, rfc_pred))
print("Recall: %s"% recall_score(y_test, rfc_pred))
print("Accuracy score: %s"% accuracy_score(y_test, rfc_pred))
print("F-1 score: %s"% f1_score(y_test, rfc_pred))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_test, rfc_pred, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_test, rfc_pred, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_test, rfc_pred))

** Random Forest Classifier Scores (OVERALL testset)
Precision: 0.8278982092365693
Recall: 0.5885821495577593
Accuracy score: 0.7679174921337839
F-1 score: 0.6880238113887367
F-beta score with beta=0.5: 0.7656369848685587
F-beta score with beta=0.2: 0.8151505849936111
Confusion Matrix: 
[[8787  913]
 [3070 4392]]


Evaluate stats for testset with label 1 (DENIED cases only)

In [17]:
import numpy as np
ones_index = y_test.nonzero()
x_ones = X_test.iloc[ones_index]
y_ones = np.take(y_test, ones_index)[0]

rfc_pred_oneonly = rfc.predict(x_ones)

In [18]:
print("** Random Forest Classifier Scores (Only rows with label DENIED->1)")
print("Precision: %s"% precision_score(y_ones, rfc_pred_oneonly))
print("Recall: %s"% recall_score(y_ones, rfc_pred_oneonly))
print("Accuracy score: %s"% accuracy_score(y_ones, rfc_pred_oneonly))
print("F-1 score: %s"% f1_score(y_ones, rfc_pred_oneonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_ones, rfc_pred_oneonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_ones, rfc_pred_oneonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_ones, rfc_pred_oneonly))

** Random Forest Classifier Scores (Only rows with label DENIED->1)
Precision: 1.0
Recall: 0.5885821495577593
Accuracy score: 0.5885821495577593
F-1 score: 0.7410156909060234
F-beta score with beta=0.5: 0.877347183379944
F-beta score with beta=0.2: 0.9738193106036055
Confusion Matrix: 
[[   0    0]
 [3070 4392]]


Evaluate stats for testset with label 0 (CERTIFIED cases only)

In [19]:
zeros_index = np.arange(len(y_test))[(y_test==0)]

x_zeros = X_test.iloc[zeros_index]
y_zeros = np.take(y_test, zeros_index)
rfc_pred_zeroonly = rfc.predict(x_zeros)

In [20]:
print("** Random Forest Classifier Scores (Only rows with label CERTIFIED->0)")
print("Precision: %s"% precision_score(y_zeros, rfc_pred_zeroonly))
print("Recall: %s"% recall_score(y_zeros, rfc_pred_zeroonly))
print("Accuracy score: %s"% accuracy_score(y_zeros, rfc_pred_zeroonly))
print("F-1 score: %s"% f1_score(y_zeros, rfc_pred_zeroonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_zeros, rfc_pred_zeroonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_zeros, rfc_pred_zeroonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_zeros, rfc_pred_zeroonly))


** Random Forest Classifier Scores (Only rows with label CERTIFIED->0)
Precision: 0.0
Recall: 0.0
Accuracy score: 0.9058762886597939
F-1 score: 0.0
F-beta score with beta=0.5: 0.0
F-beta score with beta=0.2: 0.0
Confusion Matrix: 
[[8787  913]
 [   0    0]]


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [21]:
import datetime
datetime.datetime.now()

datetime.datetime(2018, 12, 15, 12, 17, 5, 608957)