In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

Load cleaned up version of dataset from 2015 to 2018.  Drop geographic information about employer and worksite to reduce dimensionality

In [3]:
df = pd.read_pickle('/Users/ml/Desktop/Udacity_ML_Capstone/data/H1B_15-18_FINAL.pickle')
df.drop(columns=['EMPLOYER_CITY','EMPLOYER_STATE','WORKSITE_STATE'], inplace=True)

Show first 5 lines of the dataset

In [4]:
df.head()

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,H1B_DEPENDENT,NAICS_CODE,PREVAILING_WAGE,PW_SOURCE,SOC_CODE,VISA_CLASS,WAGE_UNIT_OF_PAY,WILLFUL_VIOLATOR,WAGE_LOWER_THAN_PW
3,CERTIFIED,Y,N,335314,70413.2662,OES,17-2072,H-1B,Year,N,False
4,CERTIFIED,Y,N,522294,103390.0783,OES,15-1131,H-1B,Year,N,False
7,CERTIFIED,Y,N,522294,68372.2865,OES,15-2031,H-1B,Year,N,False
9,DENIED,N,N,424460,23.962574,Other,13-2011,H-1B,Hour,N,False
11,DENIED,Y,N,541330,368080.5,DBA,13-2011,H-1B,Year,N,True


We will use column 'CASE_STATUS' as our label.  DENIED will be 1 and CERTIFIED will be 0 after it gets incoded by label_encoder.

Normalize PREVAILING_WAGE using minmax scaler since it varies quite a bit.

In [6]:
scaler = MinMaxScaler()
X = df.drop(columns='CASE_STATUS')

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['CASE_STATUS'])

X[['PREVAILING_WAGE']] = scaler.fit_transform(X[['PREVAILING_WAGE']])
X = pd.get_dummies(X)

In [53]:
ones_index

(array([     9,     20,     39, ..., 305219, 305246, 305279]),)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

Check value counts to verify stratify split

In [11]:
X_train.head()

Unnamed: 0,PREVAILING_WAGE,WAGE_LOWER_THAN_PW,FULL_TIME_POSITION_N,FULL_TIME_POSITION_Y,H1B_DEPENDENT_N,H1B_DEPENDENT_Y,NAICS_CODE_106888,"NAICS_CODE_11 MM ACRES ENERGY, LLC",NAICS_CODE_111100,NAICS_CODE_11111,...,VISA_CLASS_H-1B,VISA_CLASS_H-1B1 Chile,VISA_CLASS_H-1B1 Singapore,WAGE_UNIT_OF_PAY_Bi-Weekly,WAGE_UNIT_OF_PAY_Hour,WAGE_UNIT_OF_PAY_Month,WAGE_UNIT_OF_PAY_Week,WAGE_UNIT_OF_PAY_Year,WILLFUL_VIOLATOR_N,WILLFUL_VIOLATOR_Y
248404,0.000108,False,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
256348,6.5e-05,False,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
542928,8.7e-05,False,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
401956,4.9e-05,False,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
133647,8.4e-05,False,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0


Train DecisionTreeClassifier and predict using X_test

In [13]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
# print(clf.feature_importances_)
print(clf.score(X_test, y_test))
tree_pred = clf.predict(X_test)
print(tree_pred)
# for i in range(len(y_hat)):
#     if y_test[i]==1:
#         print("{}:: {}".format(y_test[i],y_hat[i]))

0.9785590053160304
[0 0 0 ... 0 0 0]


Train RandomForestClassifier and predict using X_test

In [15]:
from sklearn.ensemble import RandomForestClassifier


# from sklearn.feature_extraction.text import CountVectorizer

rfc = RandomForestClassifier()
# vec = CountVectorizer()
# X_train_t = vec.fit_transform(X_train)

rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)

  from numpy.core.umath_tests import inner1d


In [110]:
from sklearn.metrics import classification_report, confusion_matrix
print("** Decision tree Classifier Stats (OVERALL testset)")
print(classification_report(y_test, tree_pred))

print("** Random Forest Classifier Stats (OVERALL testset)")
print(classification_report(y_test, rfc_pred))

** Decision tree Classifier Stats (OVERALL testset)
             precision    recall  f1-score   support

          0       0.98      0.99      0.99    297848
          1       0.61      0.35      0.44      7455

avg / total       0.97      0.98      0.98    305303

** Random Forest Classifier Stats (OVERALL testset)
             precision    recall  f1-score   support

          0       0.98      1.00      0.99    297848
          1       0.64      0.34      0.44      7455

avg / total       0.98      0.98      0.98    305303



Output the feature importance vector and sort by importance

In [21]:
import operator
# print(len(rfc.feature_importances_))
# print(len(X_train.columns))

nonzero_features = {}
for i in range(len(rfc.feature_importances_)):
    if rfc.feature_importances_[i] > 0.0:
        nonzero_features[list(X_train.columns)[i]] = rfc.feature_importances_[i]
sorted_result = sorted(nonzero_features.items(), key=operator.itemgetter(1)) 
for i in range(len(sorted_result)):
    print(sorted_result[-(i+1)])

(u'PREVAILING_WAGE', 0.30899545494138314)
('WAGE_LOWER_THAN_PW', 0.20799220356298184)
(u'VISA_CLASS_H-1B', 0.07309553354675245)
(u'VISA_CLASS_E-3 Australian', 0.03703164660393674)
(u'WAGE_UNIT_OF_PAY_Year', 0.00723855015847052)
(u'WAGE_UNIT_OF_PAY_Hour', 0.005921642991539291)
(u'PW_SOURCE_OES', 0.004133008749398209)
('NAICS_CODE_541511', 0.00412464229878589)
(u'NAICS_CODE_CAPGEMINI AMERICA INC', 0.003896443595411902)
(u'H1B_DEPENDENT_N', 0.003580174662405468)
(u'PW_SOURCE_Other', 0.003464124034837413)
(u'H1B_DEPENDENT_Y', 0.003291746312051908)
(u'WAGE_UNIT_OF_PAY_Month', 0.0031273966585035617)
('SOC_CODE_13-2011', 0.002850158925323578)
('SOC_CODE_13-1161', 0.0023485437600034914)
('SOC_CODE_15-1132', 0.0021930398079176606)
(u'FULL_TIME_POSITION_Y', 0.00198517436002186)
(u'VISA_CLASS_H-1B1 Chile', 0.0017893808141050162)
(u'WAGE_UNIT_OF_PAY_Week', 0.0017785117487997455)
('NAICS_CODE_611310', 0.0017709225062104022)
(u'FULL_TIME_POSITION_N', 0.0016646008900755502)
('SOC_CODE_15-1121', 0.001

('NAICS_CODE_332322', 8.983143392006371e-05)
('NAICS_CODE_323110', 8.971531970851924e-05)
('NAICS_CODE_561920', 8.966183273412067e-05)
('SOC_CODE_21-2011', 8.951401065192182e-05)
(u'NAICS_CODE_GINA GROUP LLC', 8.949020328487493e-05)
('NAICS_CODE_541300', 8.945130640854293e-05)
('NAICS_CODE_333993', 8.925339512541101e-05)
('SOC_CODE_17-2121', 8.92370197837054e-05)
(u'NAICS_CODE_ADIGE LLC', 8.914111960488775e-05)
('NAICS_CODE_237120', 8.891699253120905e-05)
('NAICS_CODE_325413', 8.889177740392139e-05)
('NAICS_CODE_44711', 8.888486284747715e-05)
('NAICS_CODE_926140', 8.888293484439252e-05)
('NAICS_CODE_339914', 8.877678194493834e-05)
(u'NAICS_CODE_CURATA, INC.', 8.864044412350383e-05)
('NAICS_CODE_813312', 8.847495743897644e-05)
(u'NAICS_CODE_AIG GLOBAL SERVICES, INC.', 8.845045642455586e-05)
('SOC_CODE_21-2021', 8.827132055851378e-05)
('NAICS_CODE_72231', 8.822592691383975e-05)
(u'NAICS_CODE_GOOD SAMARITAN HOSPITAL MEDICAL CENTER', 8.793640173784482e-05)
('SOC_CODE_41-9022', 8.7884256269

(u'NAICS_CODE_JPX DENVER LLC', 4.780979712883921e-05)
(u"NAICS_CODE_CHILDREN'S HOSPITAL MEDICAL CENTER OF AKRON, INC.", 4.7809012723573635e-05)
(u'NAICS_CODE_H2M ARCHITECTS,ENGINEERS,LAND SURVEYING AND LANDSCAPE ARCHITECTURE,DP', 4.7802236901255625e-05)
('NAICS_CODE_339000', 4.779700900113533e-05)
(u'NAICS_CODE_HARRI USA, LLC', 4.779083297495542e-05)
(u'NAICS_CODE_FUSEGLOBAL PARTNERS, INC.', 4.778184193956192e-05)
(u'NAICS_CODE_ARMENTROUT MATHENY THURMOND, P.C.', 4.7780501444537064e-05)
(u'NAICS_CODE_HARRISON FRENCH & ASSOCIATES, LTD.', 4.7779393624625866e-05)
(u'NAICS_CODE_MILES MENG, LLC', 4.7768182310075674e-05)
(u'NAICS_CODE_ASCEND LEARNING, LLC', 4.775602767437483e-05)
('NAICS_CODE_332510', 4.775330793628186e-05)
(u'NAICS_CODE_NEW WEIMING LAW GROUP, PLLC', 4.774484414940638e-05)
('NAICS_CODE_326111', 4.7741606684279903e-05)
(u'NAICS_CODE_JOHN KELLY FURNITURE DESIGN, LLC', 4.771856499334459e-05)
(u'NAICS_CODE_FRACTAL THERAPEUTICS, INC.', 4.7713109197328285e-05)
(u'NAICS_CODE_UUSTAR

(u'NAICS_CODE_FOUNTAIN FOODS, INC', 2.676356008605008e-05)
(u'NAICS_CODE_HAMILTON MANUFACTURING CORP.', 2.6750247488868446e-05)
('NAICS_CODE_326160', 2.673450104399513e-05)
(u'NAICS_CODE_GARANCE DORE LLC', 2.6727789069545e-05)
('NAICS_CODE_311711', 2.6709068840026332e-05)
(u'NAICS_CODE_GATEWAY EDUCATION USA CORP.', 2.6677234659872516e-05)
('NAICS_CODE_441210', 2.662122030244506e-05)
('NAICS_CODE_339950', 2.660669740236879e-05)
('SOC_CODE_53-5021', 2.6603366796538942e-05)
('NAICS_CODE_446130', 2.6597453099120374e-05)
('SOC_CODE_27-1013', 2.654305033808243e-05)
(u'NAICS_CODE_SMARTAX GROUP INC.', 2.6513412357058864e-05)
('SOC_CODE_23-2092', 2.648287766300124e-05)
('NAICS_CODE_441110', 2.6398727195666843e-05)
(u'NAICS_CODE_LAHEY CLINIC, INC.', 2.6384849894221426e-05)
(u'NAICS_CODE_LAW OFFICES OF AMY GHOSH, APC', 2.6365934547326183e-05)
('NAICS_CODE_332813', 2.6292089741890658e-05)
('NAICS_CODE_51449', 2.6291316749643027e-05)
('NAICS_CODE_325612', 2.6283555244334823e-05)
('NAICS_CODE_331318

(u'NAICS_CODE_AGE OF LEARNING, INC.', 4.8317282808554185e-06)
('NAICS_CODE_32199', 4.8306182143331204e-06)
(u'NAICS_CODE_LEARNING TREE CENTER, INC.', 4.80966117693037e-06)
('SOC_CODE_51-9151', 4.801952707957862e-06)
(u'NAICS_CODE_SHAWNEE STATE UNIVERSITY', 4.798263638833083e-06)
(u'NAICS_CODE_INFOTECH PEOPLE', 4.791744619342171e-06)
(u'NAICS_CODE_ADVANCED SYSTEMS FOR POWER ENGINEERING, INC.', 4.791501428271111e-06)
(u'NAICS_CODE_PRACTICE BY NUMBERS INC.', 4.7833560520475635e-06)
('NAICS_CODE_111110', 4.782877471874582e-06)
(u'NAICS_CODE_URSULA COMMERCIAL SALT WATER FISHING VESSEL', 4.769269786273243e-06)
(u'NAICS_CODE_HORIZON INTERNATIONAL TRD. INC', 4.73727350774911e-06)
('SOC_CODE_25-1061', 4.7366695842229896e-06)
(u'NAICS_CODE_ALOHA SOFT LLC', 4.681071337267265e-06)
(u'NAICS_CODE_MSM LUXURY ESTATES', 4.67782166363627e-06)
('NAICS_CODE_42494', 4.671031492357318e-06)
(u'NAICS_CODE_FUSION MONKEYS LLC', 4.65743219540563e-06)
(u'NAICS_CODE_WEB THREE INC.', 4.646418217303754e-06)
(u'NAICS

('NAICS_CODE_212312', 2.895892992692134e-08)
('SOC_CODE_29-2055', 2.8919850003559727e-08)
('NAICS_CODE_33451', 2.8523231097497758e-08)
('NAICS_CODE_54219', 2.8425675294915668e-08)
('NAICS_CODE_541311', 2.8384947209319393e-08)
('NAICS_CODE_327124', 2.8212810876267414e-08)
('NAICS_CODE_311512', 2.7889676656875648e-08)
('SOC_CODE_43-4111', 2.7715565906698e-08)
(u'NAICS_CODE_DE NGUYEN', 2.736949406231547e-08)
('NAICS_CODE_71121', 2.731580420963879e-08)
('NAICS_CODE_325990', 2.695072603826017e-08)
('NAICS_CODE_55112', 2.6935547649295165e-08)
('NAICS_CODE_327410', 2.6894842905754462e-08)
('NAICS_CODE_311213', 2.6831561122156558e-08)
(u'SOC_CODE_IOS DEVELOPER', 2.6405868426400934e-08)
('NAICS_CODE_313320', 2.6168596467707464e-08)
('NAICS_CODE_42371', 2.6144881183247725e-08)
(u"NAICS_CODE_ST. JOHN'S CHURCH", 2.6056055002771796e-08)
('NAICS_CODE_11192', 2.5973510986464368e-08)
('NAICS_CODE_33633', 2.5899866981710406e-08)
('NAICS_CODE_21229', 2.5755984984327685e-08)
('NAICS_CODE_11133', 2.565690

In [None]:
import pickle
with open('/tmp/feature_importances_first_800k_rfc.pickle', 'wb') as f:
    pickle.dump(sorted_result, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('/tmp/feature_importances_first_800k_rfc_dict.pickle', 'wb') as f:
    pickle.dump(nonzero_features, f, protocol=pickle.HIGHEST_PROTOCOL)

Output the trained model to a pickle file in case we need it in the future

In [122]:
from sklearn.externals import joblib
joblib.dump(rfc, 'rfc.pkl', compress=9)
joblib.dump(clf, 'tree.pkl', compress=9)

# import cPickle
# with open('rfc_cpickle.pkl', 'wb') as pkl:
#     cPickle.dump(rfc, pkl)
# with open('tree_cpickle.pkl', 'wb') as pkl:
#     cPickle.dump(clf, pkl)    

To load the pre-trained model, run the cell below.

In [124]:
rfc = joblib.load('rfc.pkl')
clf = joblib.load('tree.pkl')

# with open('rfc_cpickle.pkl', 'rb') as pkl:
#     rfc = cPickle.load(pkl)
# with open('tree_cpickle.pkl', 'rb') as pkl:
#     clf = cPickle.load(pkl)

Evaluate stats for the entire test set

In [125]:
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score, accuracy_score, confusion_matrix
# print(rfc_pred)

print("** Decision tree Classifier Scores (OVERALL testset)")
print("Precision: %s" % precision_score(y_test, tree_pred))
print("Recall: %s"% recall_score(y_test, tree_pred))
print("Accuracy score: %s"% accuracy_score(y_test, tree_pred))
print("F-1 score: %s"% f1_score(y_test, tree_pred))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_test, tree_pred, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_test, tree_pred, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_test, tree_pred))
print("\n")
print("** Random Forest Classifier Scores (OVERALL testset)")
print("Precision: %s"% precision_score(y_test, rfc_pred))
print("Recall: %s"% recall_score(y_test, rfc_pred))
print("Accuracy score: %s"% accuracy_score(y_test, rfc_pred))
print("F-1 score: %s"% f1_score(y_test, rfc_pred))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_test, rfc_pred, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_test, rfc_pred, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_test, rfc_pred))

** Decision tree Classifier Scores (OVERALL testset)
Precision: 0.606166783461808
Recall: 0.34808853118712274
Accuracy score: 0.9785590053160304
F-1 score: 0.44222903885480574
F-beta score with beta=0.5: 0.5278896619065055
F-beta score with beta=0.2: 0.5893605870020965
Confusion Matrix: 
[[296162   1686]
 [  4860   2595]]


** Random Forest Classifier Scores (OVERALL testset)
Precision: 0.6385118149824032
Recall: 0.340710932260228
Accuracy score: 0.9791911641877086
F-1 score: 0.4443278229685997
F-beta score with beta=0.5: 0.54350151923653
F-beta score with beta=0.2: 0.617744726626444
Confusion Matrix: 
[[296410   1438]
 [  4915   2540]]


In [None]:
import numpy as np
ones_index = y_test.nonzero()
x_ones = X_test.iloc[ones_index]
y_ones = np.take(y_test, ones_index)[0]

tree_pred_oneonly = clf.predict(x_ones)
rfc_pred_oneonly = rfc.predict(x_ones)

Evaluate stats for testset with label 1 (DENIED cases only)

In [120]:
print("** Decision tree Classifier Scores (Only rows with label DENIED->1)")
print("Precision: %s" % precision_score(y_ones, tree_pred_oneonly))
print("Recall: %s"% recall_score(y_ones, tree_pred_oneonly))
print("Accuracy score: %s"% accuracy_score(y_ones, tree_pred_oneonly))
print("F-1 score: %s"% f1_score(y_ones, tree_pred_oneonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_ones, tree_pred_oneonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_ones, tree_pred_oneonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_ones, tree_pred_oneonly))
print("\n")
print("** Random Forest Classifier Scores (Only rows with label DENIED->1)")
print("Precision: %s"% precision_score(y_ones, rfc_pred_oneonly))
print("Recall: %s"% recall_score(y_ones, rfc_pred_oneonly))
print("Accuracy score: %s"% accuracy_score(y_ones, rfc_pred_oneonly))
print("F-1 score: %s"% f1_score(y_ones, rfc_pred_oneonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_ones, rfc_pred_oneonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_ones, rfc_pred_oneonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_ones, rfc_pred_oneonly))

** Decision tree Classifier Scores (Only rows with label DENIED->1)
Precision: 1.0
Recall: 0.34808853118712274
Accuracy score: 0.34808853118712274
F-1 score: 0.5164179104477612
F-beta score with beta=0.5: 0.727502102607233
F-beta score with beta=0.2: 0.9328079635006222
Confusion Matrix: 
[[   0    0]
 [4860 2595]]


** Random Forest Classifier Scores (Only rows with label DENIED->1)
Precision: 1.0
Recall: 0.340710932260228
Accuracy score: 0.340710932260228
F-1 score: 0.5082541270635318
F-beta score with beta=0.5: 0.7209764405336362
F-beta score with beta=0.2: 0.9307307448382779
Confusion Matrix: 
[[   0    0]
 [4915 2540]]


In [114]:
zeros_index = np.arange(len(y_test))[(y_test==0)]

x_zeros = X_test.iloc[zeros_index]
y_zeros = np.take(y_test, zeros_index)
tree_pred_zeroonly = clf.predict(x_zeros)
rfc_pred_zeroonly = rfc.predict(x_zeros)


Evaluate stats for testset with label 0 (CERTIFIED cases only)


In [119]:
print("** Decision tree Classifier Scores (Only rows with label CERTIFIED->0)")
print("Precision: %s" % precision_score(y_zeros, tree_pred_zeroonly))
print("Recall: %s"% recall_score(y_zeros, tree_pred_zeroonly))
print("Accuracy score: %s"% accuracy_score(y_zeros, tree_pred_zeroonly))
print("F-1 score: %s"% f1_score(y_zeros, tree_pred_zeroonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_zeros, tree_pred_zeroonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_zeros, tree_pred_zeroonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_zeros, rfc_pred_zeroonly))
print("\n")
print("** Random Forest Classifier Scores (Only rows with label CERTIFIED->0)")
print("Precision: %s"% precision_score(y_zeros, rfc_pred_zeroonly))
print("Recall: %s"% recall_score(y_zeros, rfc_pred_zeroonly))
print("Accuracy score: %s"% accuracy_score(y_zeros, rfc_pred_zeroonly))
print("F-1 score: %s"% f1_score(y_zeros, rfc_pred_zeroonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_zeros, rfc_pred_zeroonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_zeros, rfc_pred_zeroonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_zeros, rfc_pred_zeroonly))

** Decision tree Classifier Scores (Only rows with label CERTIFIED->0)
Precision: 0.0
Recall: 0.0
Accuracy score: 0.9943393945905294
F-1 score: 0.0
F-beta score with beta=0.5: 0.0
F-beta score with beta=0.2: 0.0
Confusion Matrix: 
[[296410   1438]
 [     0      0]]


** Random Forest Classifier Scores (Only rows with label CERTIFIED->0)
Precision: 0.0
Recall: 0.0
Accuracy score: 0.9951720340576401
F-1 score: 0.0
F-beta score with beta=0.5: 0.0
F-beta score with beta=0.2: 0.0
Confusion Matrix: 
[[296410   1438]
 [     0      0]]


Show ratio between the two classes in the entire dataset (before split into train and test set)

In [116]:
status_stats = df.groupby('CASE_STATUS').size().to_frame()
status_stats['percentage'] = status_stats[0] / status_stats[0].sum()
print(status_stats)

                   0  percentage
CASE_STATUS                     
CERTIFIED    1489240    0.975582
DENIED         37274    0.024418
