In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import RandomizedPCA
from sklearn.decomposition import PCA
from sklearn.svm import SVC


Load cleaned up version of dataset from 2012 to 2018.  Drop geographic information about employer and worksite to reduce dimensionality

In [4]:
df = pd.read_pickle('/Users/minse_chang/PycharmProjects/Udacity_ML_Capstone/data/H1B_15-18_new.pickle')
df.drop(columns=['EMPLOYER_CITY','JOB_TITLE','EMPLOYER_NAME'], inplace=True)
df.drop(columns=['EMPLOYER_STATE','WORKSITE_STATE'], inplace=True)


In [5]:
ones_df = df[df['CASE_STATUS']=='DENIED']
zeros_df = df[df['CASE_STATUS']=='CERTIFIED']

In [6]:
# df = pd.concat([ones_df, zeros_df.sample(frac=0.1, random_state=99)])
df = pd.concat([ones_df, zeros_df.sample(frac=0.02, random_state=99)])


In [5]:
ones_df.head()
print(ones_df.shape)

(37310, 11)


In [6]:
zeros_df.head()
print(zeros_df.shape)

(1656369, 11)


We will use column 'CASE_STATUS' as our label.  DENIED will be 1 and CERTIFIED will be 0 after it gets incoded by label_encoder.

Normalize PREVAILING_WAGE using minmax scaler since it varies quite a bit.

In [7]:
scaler = MinMaxScaler()
X = df.drop(columns='CASE_STATUS')

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['CASE_STATUS'])

X[['PREVAILING_WAGE']] = scaler.fit_transform(X[['PREVAILING_WAGE']])
X = pd.get_dummies(X)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
n_components = 400
pca = PCA(n_components=n_components, whiten=True, svd_solver='randomized')
pca = pca.fit(X_train)

In [10]:
from time import time
t0 = time()
# eigenfaces = pca.components_.reshape((n_components, h, w))
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

print("done in %0.3fs" % (time() - t0))

done in 20.352s


Check value counts to verify stratify split

In [11]:
# print(y_train.value_counts())
# print(y_test.value_counts())

Train DecisionTreeClassifier and predict using X_test

In [12]:
from sklearn.svm import SVC

svc = SVC(gamma=0.001, C=100)

svc.fit(X_train, y_train)

svc_pred = svc.predict(X_test)

Train RandomForestClassifier and predict using X_test

In [13]:
from sklearn.metrics import classification_report, confusion_matrix
print("** SV Classifier Stats (OVERALL testset)")
print(classification_report(y_test, svc_pred))

** SV Classifier Stats (OVERALL testset)
             precision    recall  f1-score   support

          0       0.69      0.80      0.74      6652
          1       0.79      0.68      0.73      7436

avg / total       0.74      0.74      0.74     14088



In [14]:
from sklearn.externals import joblib
joblib.dump(svc, 'svc_final.pkl', compress=9)

['svc_final.pkl']

Output the trained model to a pickle file in case we need it in the future

In [23]:
# svc = joblib.load('svc.pkl')

To load the pre-trained model, run the cell below.

In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score, accuracy_score, confusion_matrix

print("** SV Classifier Scores (OVERALL testset)")
print("Precision: %s"% precision_score(y_test, svc_pred))
print("Recall: %s"% recall_score(y_test, svc_pred))
print("Accuracy score: %s"% accuracy_score(y_test, svc_pred))
print("F-1 score: %s"% f1_score(y_test, svc_pred))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_test, svc_pred, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_test, svc_pred, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_test, svc_pred))

** SV Classifier Scores (OVERALL testset)
Precision: 0.7943777637397347
Recall: 0.6764389456697149
Accuracy score: 0.7367972742759795
F-1 score: 0.730679837303893
F-beta score with beta=0.5: 0.7676107923330485
F-beta score with beta=0.2: 0.7890862576627891
Confusion Matrix: 
[[5350 1302]
 [2406 5030]]


Evaluate stats for the entire test set

In [None]:
# pd.DataFrame(data=X_test)

In [17]:
import numpy as np
ones_index = y_test.nonzero()[0]
x_ones = X_test[ones_index,:]
y_ones = np.take(y_test, ones_index)

# print(x_ones.shape)
# print(y_ones.shape)

In [18]:
svc_pred_oneonly = svc.predict(x_ones)

In [19]:
print("** SV Classifier Scores (Only rows with label DENIED->1)")
print("Precision: %s"% precision_score(y_ones, svc_pred_oneonly))
print("Recall: %s"% recall_score(y_ones, svc_pred_oneonly))
print("Accuracy score: %s"% accuracy_score(y_ones, svc_pred_oneonly))
print("F-1 score: %s"% f1_score(y_ones, svc_pred_oneonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_ones, svc_pred_oneonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_ones, svc_pred_oneonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_ones, svc_pred_oneonly))

** SV Classifier Scores (Only rows with label DENIED->1)
Precision: 1.0
Recall: 0.6764389456697149
Accuracy score: 0.6764389456697149
F-1 score: 0.8069950264720038
F-beta score with beta=0.5: 0.9126868921469009
F-beta score with beta=0.2: 0.98193503821723
Confusion Matrix: 
[[   0    0]
 [2406 5030]]


Evaluate stats for testset with label 1 (DENIED cases only)

In [20]:
zeros_index = np.arange(len(y_test))[(y_test==0)]
x_zeros = X_test[zeros_index,:]
y_zeros = np.take(y_test, zeros_index)
svc_pred_zeroonly = svc.predict(x_zeros)

In [21]:
print("** SV Classifier Scores (Only rows with label CERTIFIED->0)")
print("Precision: %s"% precision_score(y_zeros, svc_pred_zeroonly))
print("Recall: %s"% recall_score(y_zeros, svc_pred_zeroonly))
print("Accuracy score: %s"% accuracy_score(y_zeros, svc_pred_zeroonly))
print("F-1 score: %s"% f1_score(y_zeros, svc_pred_zeroonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_zeros, svc_pred_zeroonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_zeros, svc_pred_zeroonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_zeros, svc_pred_zeroonly))


** SV Classifier Scores (Only rows with label CERTIFIED->0)
Precision: 0.0
Recall: 0.0
Accuracy score: 0.8042693926638604
F-1 score: 0.0
F-beta score with beta=0.5: 0.0
F-beta score with beta=0.2: 0.0
Confusion Matrix: 
[[5350 1302]
 [   0    0]]


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [22]:
import datetime
datetime.datetime.now()

datetime.datetime(2018, 12, 19, 11, 36, 58, 534773)

In [7]:
df.groupby(['VISA_CLASS','CASE_STATUS']).size()

VISA_CLASS       CASE_STATUS
E-3 Australian   CERTIFIED        878
                 DENIED          3889
H-1B             CERTIFIED      47457
                 DENIED         32924
H-1B1 Chile      CERTIFIED         75
                 DENIED           291
H-1B1 Singapore  CERTIFIED         89
                 DENIED           206
dtype: int64