# Importing Modules and Checking Directory 

In [2]:
import os
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [3]:
pd.__version__

'1.1.5'

In [4]:
os.getcwd()


'/Users/Livi/Projects/Black_mental_health/models'

In [None]:
df = pd.read_csv('../data/interim/features_and_target2_pre_get_dummies.csv', index_col = 0)

In [33]:
y = df['payrate_level']
X = pd.get_dummies(df.drop(columns='payrate_level'))

X.columns = X.columns.str.replace('majority_majority_', 'majority_').str.replace('majority_no_', 'no_')
X.columns

Index(['number_of_hcpcs', 'number_of_services', 'total_beneficiaries',
       'total_submitted_charges', 'avg_beneficiary_age', 'avg_hcc_risk_score',
       'submitted_charges_per_person', 'services_per_person',
       'submitted_charges_per_service', 'total_pop',
       ...
       'subregion_Pacific', 'subregion_South Atlantic',
       'subregion_West North Central', 'subregion_West South Central',
       'majority_asian', 'majority_black', 'majority_hispanic',
       'majority_native_amer', 'majority_white', 'no_majority'],
      dtype='object', length=188)

#### Scaling ONLY CONTINUOUS columns using PowerTransformer

In [34]:
from sklearn.preprocessing import PowerTransformer
import time

start= time.time()

scaler = PowerTransformer()

X_scaled=X.copy()

for col in X.columns:
    if col =='entity_code_I':
        break
    else:
        scaler.fit(np.array(X_scaled[col]).reshape(-1,1))
        i_scaled = scaler.transform(np.array(X_scaled[col]).reshape(-1,1))
        X_scaled[col] = i_scaled
        
        
end= time.time()

print('Time scaled:', (end-start)/60, 'minutes.')

Time scaled: 0.5751500129699707 minutes.


In [35]:
X_scaled.head()

Unnamed: 0,number_of_hcpcs,number_of_services,total_beneficiaries,total_submitted_charges,avg_beneficiary_age,avg_hcc_risk_score,submitted_charges_per_person,services_per_person,submitted_charges_per_service,total_pop,...,subregion_Pacific,subregion_South Atlantic,subregion_West North Central,subregion_West South Central,majority_asian,majority_black,majority_hispanic,majority_native_amer,majority_white,no_majority
0,-0.003948,0.540055,0.851671,1.196155,0.998501,1.105662,0.871076,-0.119979,1.063762,0.781846,...,0,1,0,0,0,0,0,0,1,0
1,-0.170174,0.576787,1.015528,0.573919,2.139463,0.994263,-0.276118,-0.3176,0.179416,-0.074416,...,0,1,0,0,0,0,0,0,1,0
2,-1.402499,-1.478753,-2.03297,-2.079729,-1.181247,-1.398919,-1.041626,0.565058,-1.326987,-0.41331,...,0,1,0,0,0,1,0,0,0,0
3,0.44735,1.158778,1.311877,1.152199,1.833739,-0.416924,0.24284,0.442297,0.106138,0.795532,...,0,1,0,0,0,0,0,0,0,1
4,0.13949,-0.443103,-0.613947,-0.657554,-0.706375,1.649911,-0.268393,0.241994,-0.223406,-0.679061,...,0,1,0,0,0,0,0,0,0,1


#### CV: Half Scaled and ONLY Selected Features

In [73]:
ridge_feats = pd.read_csv('../data/interim/ridge_features.csv',index_col=0)
kbest_feats = pd.read_csv('../data/interim/kbest_features.csv', index_col=0)
ridge_feats.head(1)

Unnamed: 0,0
provider_type_Registered Dietitian or Nutrition Professional,0.090226


In [74]:
kbest_feats.head(1)

Unnamed: 0,0
0,provider_type_Mass Immunizer Roster Biller


In [82]:
ridge_feats = list(ridge_feats.reset_index()['index'])
kbest_feats = list(kbest_feats['0'])

#### K Best Features

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled[kbest_feats], y, test_size = 0.2, random_state=42)

rf = RandomForestClassifier()

start = time.time()

scorer = make_scorer(f1_score, average = 'weighted')
rf_cv3 = cross_val_score(rf, X_train[:100000], y_train[:100000], cv=5, scoring=scorer)

end= time.time()
total = end - start

print('CV F1 Score RF:', rf_cv3)
print("Time:", total/60, "minutes")

start = time.time()

gbc = GradientBoostingClassifier()
scorer = make_scorer(f1_score, average = 'weighted')
cv_scores_gbc3= cross_val_score(gbc,X_train[:10000],y_train[:10000],cv=5,scoring=scorer)


end = time.time()

print ('CV F1 Scores GBC:', cv_scores_gbc3)
print('Time:', (end-start)/60, 'minutes')

from sklearn.linear_model import LogisticRegression

start= time.time()

lr = LogisticRegression(max_iter=500)
scorer = make_scorer(f1_score, average = 'weighted')
cv_scores_lr3 = cross_val_score(lr,X_train[:10000],y_train[:10000],cv=5,scoring=scorer)

end = time.time()

print ('CV F1 Scores LR:', cv_scores_lr3)
print('Time:', (end-start)/60)

CV F1 Score RF: [0.52296647 0.51856622 0.52146914 0.52428817 0.52079646]
Time: 0.4059203664461772 minutes
CV F1 Scores GBC: [0.52880339 0.52122645 0.53637345 0.52749809 0.53274528]
Time: 0.2726573665936788 minutes
CV F1 Scores LR: [0.52880339 0.52122645 0.5347799  0.52749809 0.53139514]
Time: 0.057468048731486004


#### Ridge Features

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled[ridge_feats], y, test_size = 0.2, random_state=42)

rf = RandomForestClassifier()

start = time.time()

scorer = make_scorer(f1_score, average = 'weighted')
rf_cv4 = cross_val_score(rf, X_train[:100000], y_train[:100000], cv=5, scoring=scorer)

end= time.time()
total = end - start

print('CV F1 Score RF:', rf_cv4)
print("Time:", total/60, "minutes")

start = time.time()

gbc = GradientBoostingClassifier()
scorer = make_scorer(f1_score, average = 'weighted')
cv_scores_gbc4= cross_val_score(gbc,X_train[:10000],y_train[:10000],cv=5,scoring=scorer)


end = time.time()

print ('CV F1 Scores GBC:', cv_scores_gbc4)
print('Time:', (end-start)/60, 'minutes')

from sklearn.linear_model import LogisticRegression

start= time.time()

lr = LogisticRegression(max_iter=500)
scorer = make_scorer(f1_score, average = 'weighted')
cv_scores_lr4 = cross_val_score(lr,X_train[:10000],y_train[:10000],cv=5,scoring=scorer)

end = time.time()

print ('CV F1 Scores LR:', cv_scores_lr4)
print('Time:', (end-start)/60)

CV F1 Score RF: [0.47419517 0.4743125  0.47322537 0.47880334 0.47434894]
Time: 1.5162093838055928 minutes
CV F1 Scores GBC: [0.46267757 0.47684456 0.46017945 0.47206531 0.46309023]
Time: 0.3053579847017924 minutes
CV F1 Scores LR: [0.46883799 0.47710803 0.46632294 0.47379844 0.46578955]
Time: 0.04967144727706909


#### Doing CV again, but WITH SCALING the binary columns

In [54]:
start= time.time()

scaler = PowerTransformer()
scaler.fit(X)
X_scaled2 = scaler.transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_scaled2, y, test_size = 0.2, random_state=42)

In [55]:
end = time.time()
print('time:', (end-start)/60, 'minutes')

time: 5.403771114349365 minutes


In [59]:
pd.DataFrame(X_scaled2, columns = X.columns).head(1)

Unnamed: 0,number_of_hcpcs,number_of_services,total_submitted_charges,avg_beneficiary_age,avg_hcc_risk_score,submitted_charges_per_person,services_per_person,submitted_charges_per_service,total_pop,percent_black,...,subregion_Pacific,subregion_South Atlantic,subregion_West North Central,subregion_West South Central,majority_asian,majority_black,majority_hispanic,majority_native_amer,majority_white,no_majority
0,-0.003948,0.540055,1.196155,0.998501,1.105662,0.871076,-0.119979,1.063762,0.781846,0.413996,...,-0.378901,2.036056,-0.285347,-0.331112,-0.0744,-0.225179,-0.234267,-0.033669,0.585053,-0.416918


#### Kbest Feats

In [108]:
X_scaled2 = pd.DataFrame(X_scaled2, columns = X.columns)
X_train, X_test, y_train, y_test = train_test_split(X_scaled2[kbest_feats], y, test_size = 0.2, random_state=42)

rf = RandomForestClassifier()

start = time.time()

scorer = make_scorer(f1_score, average = 'weighted')
rf_cv5 = cross_val_score(rf, X_train[:100000], y_train[:100000], cv=5, scoring=scorer)

end= time.time()
total = end - start

print('CV F1 Score RF:', rf_cv5)
print("Time:", total/60, "minutes")

start = time.time()

gbc = GradientBoostingClassifier()
scorer = make_scorer(f1_score, average = 'weighted')
cv_scores_gbc5= cross_val_score(gbc,X_train[:100000],y_train[:100000],cv=5,scoring=scorer)


end = time.time()

print ('CV F1 Scores GBC:', cv_scores_gbc5)
print('Time:', (end-start)/60, 'minutes')

from sklearn.linear_model import LogisticRegression

start= time.time()

lr = LogisticRegression(max_iter=500)
scorer = make_scorer(f1_score, average = 'weighted')
cv_scores_lr5 = cross_val_score(lr,X_train[:100000],y_train[:100000],cv=5,scoring=scorer)

end = time.time()

print ('CV F1 Scores LR:', cv_scores_lr5)
print('Time:', (end-start)/60)

CV F1 Score RF: [0.52296647 0.51856622 0.52146914 0.52428817 0.52079646]
Time: 0.3692087690035502 minutes
CV F1 Scores GBC: [0.52296647 0.51856622 0.52146914 0.52428817 0.52079646]
Time: 2.4240289171536764 minutes
CV F1 Scores LR: [0.52296647 0.51856622 0.52146914 0.52428817 0.52079646]
Time: 0.496477480729421


#### Ridge Feats

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled2[ridge_feats], y, test_size = 0.2, random_state=42)

rf = RandomForestClassifier()

start = time.time()

scorer = make_scorer(f1_score, average = 'weighted')
rf_cv6 = cross_val_score(rf, X_train[:100000], y_train[:100000], cv=5, scoring=scorer)

end= time.time()
total = end - start

print('CV F1 Score RF:', rf_cv6)
print("Time:", total/60, "minutes")

start = time.time()

gbc = GradientBoostingClassifier()
scorer = make_scorer(f1_score, average = 'weighted')
cv_scores_gbc6= cross_val_score(gbc,X_train[:100000],y_train[:100000],cv=5,scoring=scorer)


end = time.time()

print ('CV F1 Scores GBC:', cv_scores_gbc6)
print('Time:', (end-start)/60, 'minutes')

from sklearn.linear_model import LogisticRegression

start= time.time()

lr = LogisticRegression(max_iter=500)
scorer = make_scorer(f1_score, average = 'weighted')
cv_scores_lr6 = cross_val_score(lr,X_train[:100000],y_train[:100000],cv=5,scoring=scorer)

end = time.time()

print ('CV F1 Scores LR:', cv_scores_lr6)
print('Time:', (end-start)/60)

CV F1 Score RF: [0.47415974 0.47218686 0.47362479 0.47915108 0.47354599]
Time: 1.3058063507080078 minutes
CV F1 Scores GBC: [0.4604279  0.45706118 0.4599994  0.46110714 0.45985048]
Time: 3.534658714135488 minutes
CV F1 Scores LR: [0.46035023 0.45715765 0.46022398 0.46136849 0.45938331]
Time: 0.4018767992655436


#### Model Metrics

In [110]:
cv_metrics = pd.DataFrame({'models':['Random Forest','Gradient Boosting','Logistic Regression'],
             'CV: Scale1, kbest' : [rf_cv3.mean(), cv_scores_gbc3.mean(), cv_scores_lr3.mean()], 
            'CV: Scale1, ridge feats' : [rf_cv4.mean(), cv_scores_gbc4.mean(), cv_scores_lr4.mean()], 
             'CV: Scale2, kbest feats' : [rf_cv5.mean(), cv_scores_gbc5.mean(), cv_scores_lr5.mean()],
            'CV: Scale2, ridge feats' : [rf_cv6.mean(), cv_scores_gbc6.mean(), cv_scores_lr6.mean()], 
             })

cv_metrics

Unnamed: 0,models,"CV: Scale1, all feats","CV: Scale1, kbest","CV: Scale1, ridge feats","CV: Scale2, all feats","CV: Scale2, kbest feats","CV: Scale2, ridge feats"
0,Random Forest,0.590946,0.530178,0.474977,0.59132,0.521617,0.474534
1,Gradient Boosting,0.606442,0.529329,0.466971,0.606779,0.521617,0.459689
2,Logistic Regression,0.59683,0.528741,0.470371,0.596491,0.521617,0.459697


    If I do limit the features, I may want to use KBest features rather than Ridge features, because they scored much better. The models with limited features performed much worse than the ones with all features.