In [1]:
# This notebook is for exploratory data analysis and data cleaning

import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
from scipy.stats.stats import pearsonr
from sklearn.preprocessing import OrdinalEncoder
import warnings

pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 300)
warnings.filterwarnings("ignore")
random.seed(42)

In [2]:
filename = "../input/combine4.csv"
df = pd.read_csv(filename, nrows=0) 
all_cols = df.columns.values

n = 10_000_000 #number of records in file
s = 1_000_000 #desired sample size

skip = sorted(random.sample(range(n),n-s))
df = pd.read_csv(filename, skiprows=skip)

df = pd.read_csv(filename, nrows=s, skiprows=skip) # sample 1/10 of data
df.columns = all_cols
TARGET= "acsc_flag"

all_cols = df.columns.values.tolist()
all_cols.remove(TARGET)
all_cols.remove("ADMD")
print(f"Total number of feature (excluding target) is {len(all_cols)}")

print(f"Data fram shape is {df.shape}")
df.head()

Total number of feature (excluding target) is 229
Data fram shape is (1000000, 231)


Unnamed: 0,ADMD,LOSD1,UNIT100,UNIT110,UNIT114,UNIT115,UNIT116,UNIT117,UNIT118,UNIT119,UNIT170,UNIT172,UNIT173,UNIT174,UNIT200,UNIT203,UNIT204,UNIT206,UNIT207,UNIT210,UNIT214,CHG001,CHG260,CHG480,CHG290,CHG530,CHG490,CHG540,CHG500,CHG550,CHG510,CHG610,CHG520,CHG710,CHG570,CHG720,CHG650,CHG730,CHG820,CHG740,CHG750,CHG100,CHG760,CHG240,CHG770,CHG110,CHG771,CHG114,CHG790,CHG115,CHG800,CHG116,CHG810,CHG117,CHG900,CHG118,CHG940,CHG119,CHG943,CHG220,CHG944,CHG990,CHG960,CHG170,CHG971,CHG172,CHG981,CHG173,CHG481,CHG174,CHG670,CHG200,CHG203,CHG204,CHG206,CHG207,CHG210,CHG214,CHG230,CHG250,CHG270,CHG280,CHG300,CHG320,CHG330,CHG331,CHG333,CHG340,CHG350,CHG360,CHG370,CHG380,CHG400,CHG404,CHG410,CHG420,CHG430,CHG440,CHG450,CHG460,CHG020,CHG681,CHG682,CHG683,CHG684,CHG689,CHG930,CHG950,CHG1000,CHG2100,CHG3100,SPC1,SPC2,SPC3,DISD,PPROCD,SPROC1D,SPROC2D,SPROC3D,SPROC4D,SPROC5D,SPROC6D,SPROC7D,SPROC8D,SPROC9D,SPROC10D,SPROC11D,SPROC12D,ADM_TYPE,ADMS,DISP,PPOA,SPOA1,SPOA2,SPOA3,SPOA4,SPOA5,SPOA6,SPOA7,SPOA8,SPOA9,SPOA10,SPOA11,SPOA12,SPOA13,SPOA14,COUNTY,PDIAG10,ADM_DIAG10,SDIAG10_1,SDIAG10_2,SDIAG10_3,SDIAG10_4,SDIAG10_5,SDIAG10_6,SDIAG10_7,SDIAG10_8,SDIAG10_9,SDIAG10_10,SDIAG10_11,SDIAG10_12,SDIAG10_13,SDIAG10_14,PPROC10,SPROC10_1,SPROC10_2,SPROC10_3,SPROC10_4,SPROC10_5,SPROC10_6,SPROC10_7,SPROC10_8,SPROC10_9,SPROC10_10,SPROC10_11,SPROC10_12,PECODE10,SECODE10,MSDRG,MDC,APDRG20,ER,IP,RFA_ID,DISYEAR,ADMYEAR,INTERVAL,APDRGDSC,DISMTH,DISDAY,ADMMTH,ADMDAY,PDATE,SDATE1,SDATE2,SDATE3,SDATE4,SDATE5,SDATE6,SDATE7,SDATE8,SDATE9,SDATE10,SDATE11,SDATE12,acsc_flag,CC_GRP_1,CC_GRP_2,CC_GRP_3,CC_GRP_4,CC_GRP_5,CC_GRP_6,CC_GRP_7,CC_GRP_8,CC_GRP_9,CC_GRP_10,CC_GRP_11,CC_GRP_12,CC_GRP_13,CC_GRP_14,CC_GRP_15,CC_GRP_16,CC_GRP_17,TOT_GRP,totalcc,wgtcc,sex1,urstat1,race1,age,payor2
0,22MAR2016,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,746.029999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,8.03,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,738.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,,,,20535,,,,,,,,,,,,,,1,1,1.0,,,,,,,,,,,,,,,,7.0,K029,K088,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,12922,2016,2016,,.,3,3,3,3,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,3,1
1,30MAR2016,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3240.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,208.0,336.0,0,1131.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1565.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,,,,20543,20543.0,20543.0,,,,,,,,,,,,1,1,1.0,,,,,,,,,,,,,,,,10.0,K529,,,,,,,,,,,,,,,,XW043H4,069Y3ZZ,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,,,,,,1.0,,12904,2016,2016,0.0,.,3,4,3,4,1.0,1.0,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,3,1
2,24AUG2016,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4861.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,88.0,116.0,0,1600.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1521.0,0.0,0.0,0.0,0.0,0.0,1536.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,,,,20690,20690.0,,,,,,,,,,,,,1,1,1.0,,,,,,,,,,,,,,,,18.0,O2341,,O99331,F17200,Z3A01,,,,,,,,,,,,BY49ZZZ,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,,,,,,1.0,,12909,2016,2016,0.0,.,8,4,8,4,1.0,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,3
3,14SEP2016,8,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,39665.949951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,97.0,0.0,0,1390.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,9992.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,5950.949997,690.0,0,9754.0,0.0,0,0.0,0.0,0.0,0.0,8429.0,0.0,2920.0,0.0,0.0,0.0,284.0,159.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,IM,,,20719,20713.0,20713.0,20717.0,,,,,,,,,,,2,2,6.0,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,E,E,10.0,L89154,L89154,E43,G8221,M869,N390,K592,L89224,D638,N319,D500,K5900,R252,F1210,Z86718,Z7901,0QB10ZZ,0KBP0ZZ,02HV33Z,,,,,,,,,,,,,579.0,9.0,364.0,,1.0,12894,2016,2016,196.0,OTHER SKIN SUBCUTANEOUS TISSUE & RELATED PROCE...,9,5,9,4,3.0,3.0,7.0,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2,3,1
4,24JUN2016,14,0,4,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,426110.0,0.0,0.0,0.0,2130.0,0.0,0.0,0.0,0,0.0,0.0,0,2898.0,0,0.0,0,1004.0,0.0,0.0,0.0,0,0.0,0,0,11164.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,77470.0,0.0,0.0,0.0,0,0.0,0.0,0.0,69157.0,56763.0,0,48094.0,16503.0,0,0.0,0.0,0.0,37063.0,43757.0,4788.0,2937.0,0.0,0.0,13576.0,4250.0,1704.0,0.0,6258.0,131.0,0,26463.0,0.0,0.0,0,0.0,0,0,0,0,0,,OSM,,20643,20633.0,20630.0,20630.0,20636.0,20630.0,20629.0,20629.0,20636.0,20632.0,,,,,5,1,1.0,Y,N,Y,Y,Y,Y,Y,N,Y,Y,Y,,,,,18.0,S52182B,S52182B,J14,S272XXA,S24102A,S22039A,S2232XA,G8221,D62,R339,E8809,S51802A,,,,,0PSJ04Z,0W9D00Z,0WJG0ZZ,0W9B00Z,0W9900Z,5A1945Z,0BH17EZ,0W9B00Z,06H03DZ,,,,,W320XXA,,957.0,24.0,911.0,,1.0,12894,2016,2016,334.0,EXTENSIVE ABDOMINAL/THORACIC PROCEDURES FOR MU...,7,6,6,6,5.0,2.0,2.0,8.0,2.0,1.0,1.0,8.0,4.0,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2,3,1


In [3]:
# remove high missing rate features (missing rate > 99%)
high_miss_cols = []
for col in tqdm(all_cols):
    missing_rate = df[col].isnull().sum() / len(df)
    if missing_rate > 0.99:
        high_miss_cols.append(col)
print(f"{len(high_miss_cols)} cols are removed!")
use_cols = [col for col in all_cols if col not in high_miss_cols]
print(f"The number of cols used for now is {len(use_cols)}")

df[use_cols].head()

HBox(children=(IntProgress(value=0, max=229), HTML(value='')))


8 cols are removed!
The number of cols used for now is 221


Unnamed: 0,LOSD1,UNIT100,UNIT110,UNIT114,UNIT115,UNIT116,UNIT117,UNIT118,UNIT119,UNIT170,UNIT172,UNIT173,UNIT174,UNIT200,UNIT203,UNIT204,UNIT206,UNIT207,UNIT210,UNIT214,CHG001,CHG260,CHG480,CHG290,CHG530,CHG490,CHG540,CHG500,CHG550,CHG510,CHG610,CHG520,CHG710,CHG570,CHG720,CHG650,CHG730,CHG820,CHG740,CHG750,CHG100,CHG760,CHG240,CHG770,CHG110,CHG771,CHG114,CHG790,CHG115,CHG800,CHG116,CHG810,CHG117,CHG900,CHG118,CHG940,CHG119,CHG943,CHG220,CHG944,CHG990,CHG960,CHG170,CHG971,CHG172,CHG981,CHG173,CHG481,CHG174,CHG670,CHG200,CHG203,CHG204,CHG206,CHG207,CHG210,CHG214,CHG230,CHG250,CHG270,CHG280,CHG300,CHG320,CHG330,CHG331,CHG333,CHG340,CHG350,CHG360,CHG370,CHG380,CHG400,CHG404,CHG410,CHG420,CHG430,CHG440,CHG450,CHG460,CHG020,CHG681,CHG682,CHG683,CHG684,CHG689,CHG930,CHG950,CHG1000,CHG2100,CHG3100,SPC1,SPC2,SPC3,DISD,PPROCD,SPROC1D,SPROC2D,SPROC3D,SPROC4D,SPROC5D,SPROC6D,SPROC7D,SPROC8D,ADM_TYPE,ADMS,DISP,PPOA,SPOA1,SPOA2,SPOA3,SPOA4,SPOA5,SPOA6,SPOA7,SPOA8,SPOA9,SPOA10,SPOA11,SPOA12,SPOA13,SPOA14,COUNTY,PDIAG10,ADM_DIAG10,SDIAG10_1,SDIAG10_2,SDIAG10_3,SDIAG10_4,SDIAG10_5,SDIAG10_6,SDIAG10_7,SDIAG10_8,SDIAG10_9,SDIAG10_10,SDIAG10_11,SDIAG10_12,SDIAG10_13,SDIAG10_14,PPROC10,SPROC10_1,SPROC10_2,SPROC10_3,SPROC10_4,SPROC10_5,SPROC10_6,SPROC10_7,SPROC10_8,SPROC10_9,SPROC10_10,SPROC10_11,SPROC10_12,PECODE10,SECODE10,MSDRG,MDC,APDRG20,ER,IP,RFA_ID,DISYEAR,ADMYEAR,INTERVAL,APDRGDSC,DISMTH,DISDAY,ADMMTH,ADMDAY,PDATE,SDATE1,SDATE2,SDATE3,SDATE4,SDATE5,SDATE6,SDATE7,SDATE8,CC_GRP_1,CC_GRP_2,CC_GRP_3,CC_GRP_4,CC_GRP_5,CC_GRP_6,CC_GRP_7,CC_GRP_8,CC_GRP_9,CC_GRP_10,CC_GRP_11,CC_GRP_12,CC_GRP_13,CC_GRP_14,CC_GRP_15,CC_GRP_16,CC_GRP_17,TOT_GRP,totalcc,wgtcc,sex1,urstat1,race1,age,payor2
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,746.029999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,8.03,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,738.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,,,,20535,,,,,,,,,,1,1,1.0,,,,,,,,,,,,,,,,7.0,K029,K088,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,12922,2016,2016,,.,3,3,3,3,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,3,1
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3240.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,208.0,336.0,0,1131.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1565.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,,,,20543,20543.0,20543.0,,,,,,,,1,1,1.0,,,,,,,,,,,,,,,,10.0,K529,,,,,,,,,,,,,,,,XW043H4,069Y3ZZ,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,,,,,,1.0,,12904,2016,2016,0.0,.,3,4,3,4,1.0,1.0,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,3,1
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4861.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,88.0,116.0,0,1600.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1521.0,0.0,0.0,0.0,0.0,0.0,1536.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,,,,20690,20690.0,,,,,,,,,1,1,1.0,,,,,,,,,,,,,,,,18.0,O2341,,O99331,F17200,Z3A01,,,,,,,,,,,,BY49ZZZ,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,NO CODE,,,,,,1.0,,12909,2016,2016,0.0,.,8,4,8,4,1.0,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,3
3,8,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,39665.949951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,97.0,0.0,0,1390.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,9992.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,5950.949997,690.0,0,9754.0,0.0,0,0.0,0.0,0.0,0.0,8429.0,0.0,2920.0,0.0,0.0,0.0,284.0,159.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,IM,,,20719,20713.0,20713.0,20717.0,,,,,,,2,2,6.0,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,E,E,10.0,L89154,L89154,E43,G8221,M869,N390,K592,L89224,D638,N319,D500,K5900,R252,F1210,Z86718,Z7901,0QB10ZZ,0KBP0ZZ,02HV33Z,,,,,,,,,,,,,579.0,9.0,364.0,,1.0,12894,2016,2016,196.0,OTHER SKIN SUBCUTANEOUS TISSUE & RELATED PROCE...,9,5,9,4,3.0,3.0,7.0,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2,3,1
4,14,0,4,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,426110.0,0.0,0.0,0.0,2130.0,0.0,0.0,0.0,0,0.0,0.0,0,2898.0,0,0.0,0,1004.0,0.0,0.0,0.0,0,0.0,0,0,11164.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,77470.0,0.0,0.0,0.0,0,0.0,0.0,0.0,69157.0,56763.0,0,48094.0,16503.0,0,0.0,0.0,0.0,37063.0,43757.0,4788.0,2937.0,0.0,0.0,13576.0,4250.0,1704.0,0.0,6258.0,131.0,0,26463.0,0.0,0.0,0,0.0,0,0,0,0,0,,OSM,,20643,20633.0,20630.0,20630.0,20636.0,20630.0,20629.0,20629.0,20636.0,20632.0,5,1,1.0,Y,N,Y,Y,Y,Y,Y,N,Y,Y,Y,,,,,18.0,S52182B,S52182B,J14,S272XXA,S24102A,S22039A,S2232XA,G8221,D62,R339,E8809,S51802A,,,,,0PSJ04Z,0W9D00Z,0WJG0ZZ,0W9B00Z,0W9900Z,5A1945Z,0BH17EZ,0W9B00Z,06H03DZ,,,,,W320XXA,,957.0,24.0,911.0,,1.0,12894,2016,2016,334.0,EXTENSIVE ABDOMINAL/THORACIC PROCEDURES FOR MU...,7,6,6,6,5.0,2.0,2.0,8.0,2.0,1.0,1.0,8.0,4.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2,3,1


In [4]:
# fill null value and encode categorical features
for col in tqdm(use_cols):
    if df[col].dtype == 'O':
        df[col].fillna("NA", inplace=True)
        enc = OrdinalEncoder()
        df[col] = enc.fit_transform(df[col].values.reshape(-1, 1).astype(str))
    else:
        df[col].fillna(-1, inplace=True)
        
df[use_cols].head()

HBox(children=(IntProgress(value=0, max=221), HTML(value='')))




Unnamed: 0,LOSD1,UNIT100,UNIT110,UNIT114,UNIT115,UNIT116,UNIT117,UNIT118,UNIT119,UNIT170,UNIT172,UNIT173,UNIT174,UNIT200,UNIT203,UNIT204,UNIT206,UNIT207,UNIT210,UNIT214,CHG001,CHG260,CHG480,CHG290,CHG530,CHG490,CHG540,CHG500,CHG550,CHG510,CHG610,CHG520,CHG710,CHG570,CHG720,CHG650,CHG730,CHG820,CHG740,CHG750,CHG100,CHG760,CHG240,CHG770,CHG110,CHG771,CHG114,CHG790,CHG115,CHG800,CHG116,CHG810,CHG117,CHG900,CHG118,CHG940,CHG119,CHG943,CHG220,CHG944,CHG990,CHG960,CHG170,CHG971,CHG172,CHG981,CHG173,CHG481,CHG174,CHG670,CHG200,CHG203,CHG204,CHG206,CHG207,CHG210,CHG214,CHG230,CHG250,CHG270,CHG280,CHG300,CHG320,CHG330,CHG331,CHG333,CHG340,CHG350,CHG360,CHG370,CHG380,CHG400,CHG404,CHG410,CHG420,CHG430,CHG440,CHG450,CHG460,CHG020,CHG681,CHG682,CHG683,CHG684,CHG689,CHG930,CHG950,CHG1000,CHG2100,CHG3100,SPC1,SPC2,SPC3,DISD,PPROCD,SPROC1D,SPROC2D,SPROC3D,SPROC4D,SPROC5D,SPROC6D,SPROC7D,SPROC8D,ADM_TYPE,ADMS,DISP,PPOA,SPOA1,SPOA2,SPOA3,SPOA4,SPOA5,SPOA6,SPOA7,SPOA8,SPOA9,SPOA10,SPOA11,SPOA12,SPOA13,SPOA14,COUNTY,PDIAG10,ADM_DIAG10,SDIAG10_1,SDIAG10_2,SDIAG10_3,SDIAG10_4,SDIAG10_5,SDIAG10_6,SDIAG10_7,SDIAG10_8,SDIAG10_9,SDIAG10_10,SDIAG10_11,SDIAG10_12,SDIAG10_13,SDIAG10_14,PPROC10,SPROC10_1,SPROC10_2,SPROC10_3,SPROC10_4,SPROC10_5,SPROC10_6,SPROC10_7,SPROC10_8,SPROC10_9,SPROC10_10,SPROC10_11,SPROC10_12,PECODE10,SECODE10,MSDRG,MDC,APDRG20,ER,IP,RFA_ID,DISYEAR,ADMYEAR,INTERVAL,APDRGDSC,DISMTH,DISDAY,ADMMTH,ADMDAY,PDATE,SDATE1,SDATE2,SDATE3,SDATE4,SDATE5,SDATE6,SDATE7,SDATE8,CC_GRP_1,CC_GRP_2,CC_GRP_3,CC_GRP_4,CC_GRP_5,CC_GRP_6,CC_GRP_7,CC_GRP_8,CC_GRP_9,CC_GRP_10,CC_GRP_11,CC_GRP_12,CC_GRP_13,CC_GRP_14,CC_GRP_15,CC_GRP_16,CC_GRP_17,TOT_GRP,totalcc,wgtcc,sex1,urstat1,race1,age,payor2
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,746.029999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,8.03,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,738.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,57.0,57.0,56.0,20535,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1,0.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,7.0,3790.0,2672.0,5813.0,5020.0,4516.0,4132.0,3906.0,3580.0,3362.0,3096.0,2954.0,2788.0,2598.0,2428.0,2209.0,2060.0,6128.0,5792.0,4502.0,3457.0,2685.0,1770.0,1782.0,1393.0,1105.0,894.0,754.0,602.0,532.0,0.0,0.0,-1.0,-1.0,-1.0,1.0,-1.0,12922,2016,2016,-1.0,0.0,3,3,3,3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,3,1
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3240.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,208.0,336.0,0,1131.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1565.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,57.0,57.0,56.0,20543,20543.0,20543.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1,0.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,10.0,4068.0,4528.0,5813.0,5020.0,4516.0,4132.0,3906.0,3580.0,3362.0,3096.0,2954.0,2788.0,2598.0,2428.0,2209.0,2060.0,6144.0,935.0,4503.0,3458.0,2686.0,1771.0,1783.0,1394.0,1106.0,895.0,755.0,603.0,533.0,0.0,0.0,-1.0,-1.0,-1.0,1.0,-1.0,12904,2016,2016,0.0,0.0,3,4,3,4,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,3,1
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4861.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,88.0,116.0,0,1600.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1521.0,0.0,0.0,0.0,0.0,0.0,1536.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,57.0,57.0,56.0,20690,20690.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1,0.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,18.0,6647.0,4528.0,6395.0,1290.0,8796.0,4132.0,3906.0,3580.0,3362.0,3096.0,2954.0,2788.0,2598.0,2428.0,2209.0,2060.0,5975.0,5793.0,4503.0,3458.0,2686.0,1771.0,1783.0,1394.0,1106.0,895.0,755.0,603.0,533.0,0.0,0.0,-1.0,-1.0,-1.0,1.0,-1.0,12909,2016,2016,0.0,0.0,8,4,8,4,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,3
3,8,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,39665.949951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,97.0,0.0,0,1390.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,9992.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,5950.949997,690.0,0,9754.0,0.0,0,0.0,0.0,0.0,0.0,8429.0,0.0,2920.0,0.0,0.0,0.0,284.0,159.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,49.0,57.0,56.0,20719,20713.0,20713.0,20717.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,1.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,1.0,1.0,10.0,4653.0,3302.0,1106.0,1762.0,4141.0,3964.0,2580.0,2633.0,451.0,2949.0,324.0,1802.0,3019.0,536.0,3828.0,3444.0,3463.0,2718.0,209.0,3457.0,2685.0,1770.0,1782.0,1393.0,1105.0,894.0,754.0,602.0,532.0,0.0,0.0,579.0,9.0,364.0,-1.0,1.0,12894,2016,2016,196.0,245.0,9,5,9,4,3.0,3.0,7.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2,3,1
4,14,0,4,0,0,0,0,0,0,0,0,0,0,10,0,0,0,0,0,0,426110.0,0.0,0.0,0.0,2130.0,0.0,0.0,0.0,0,0.0,0.0,0,2898.0,0,0.0,0,1004.0,0.0,0.0,0.0,0,0.0,0,0,11164.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,77470.0,0.0,0.0,0.0,0,0.0,0.0,0.0,69157.0,56763.0,0,48094.0,16503.0,0,0.0,0.0,0.0,37063.0,43757.0,4788.0,2937.0,0.0,0.0,13576.0,4250.0,1704.0,0.0,6258.0,131.0,0,26463.0,0.0,0.0,0,0.0,0,0,0,0,0,57.0,75.0,56.0,20643,20633.0,20630.0,20630.0,20636.0,20630.0,20629.0,20629.0,20636.0,20632.0,5,0.0,1.0,6.0,2.0,6.0,6.0,6.0,6.0,6.0,2.0,6.0,6.0,6.0,3.0,3.0,3.0,3.0,18.0,10033.0,6953.0,3212.0,7089.0,6362.0,5743.0,5348.0,1227.0,448.0,3806.0,595.0,3886.0,2598.0,2428.0,2209.0,2060.0,3402.0,4358.0,3295.0,2406.0,1787.0,1375.0,405.0,839.0,140.0,894.0,754.0,602.0,532.0,985.0,0.0,957.0,24.0,911.0,-1.0,1.0,12894,2016,2016,334.0,76.0,7,6,6,6,5.0,2.0,2.0,8.0,2.0,1.0,1.0,8.0,4.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2,3,1


In [5]:
# remove features with highly imbalanced (more than 99.5%)
imbalanced_cols = []
for col in tqdm(use_cols):
    val_hist = df[col].value_counts() / len(df)
    if val_hist.iloc[0] > 0.995:
        imbalanced_cols.append(col)
print(f"{len(imbalanced_cols)} cols are removed!")

use_cols = [col for col in use_cols if col not in imbalanced_cols]
print(f"The number of cols used for now is {len(use_cols)}")

df[use_cols].head()

HBox(children=(IntProgress(value=0, max=221), HTML(value='')))


73 cols are removed!
The number of cols used for now is 148


Unnamed: 0,LOSD1,UNIT110,UNIT114,UNIT170,UNIT200,UNIT206,UNIT210,UNIT214,CHG001,CHG260,CHG480,CHG530,CHG500,CHG510,CHG610,CHG710,CHG720,CHG730,CHG740,CHG750,CHG760,CHG110,CHG771,CHG114,CHG800,CHG940,CHG960,CHG170,CHG981,CHG481,CHG200,CHG206,CHG210,CHG214,CHG250,CHG270,CHG300,CHG320,CHG340,CHG350,CHG360,CHG370,CHG380,CHG400,CHG410,CHG420,CHG430,CHG440,CHG450,CHG460,SPC1,SPC2,SPC3,DISD,PPROCD,SPROC1D,SPROC2D,SPROC3D,SPROC4D,SPROC5D,SPROC6D,SPROC7D,SPROC8D,ADM_TYPE,ADMS,DISP,PPOA,SPOA1,SPOA2,SPOA3,SPOA4,SPOA5,SPOA6,SPOA7,SPOA8,SPOA9,SPOA10,SPOA11,SPOA12,SPOA13,SPOA14,COUNTY,PDIAG10,ADM_DIAG10,SDIAG10_1,SDIAG10_2,SDIAG10_3,SDIAG10_4,SDIAG10_5,SDIAG10_6,SDIAG10_7,SDIAG10_8,SDIAG10_9,SDIAG10_10,SDIAG10_11,SDIAG10_12,SDIAG10_13,SDIAG10_14,PPROC10,SPROC10_1,SPROC10_2,SPROC10_3,SPROC10_4,SPROC10_5,SPROC10_6,SPROC10_7,SPROC10_8,SPROC10_9,SPROC10_10,SPROC10_11,SPROC10_12,PECODE10,SECODE10,MSDRG,MDC,APDRG20,ER,IP,RFA_ID,DISYEAR,ADMYEAR,INTERVAL,APDRGDSC,DISMTH,DISDAY,ADMMTH,ADMDAY,PDATE,SDATE1,SDATE2,SDATE3,SDATE4,SDATE5,SDATE6,SDATE7,SDATE8,CC_GRP_2,CC_GRP_4,CC_GRP_6,CC_GRP_10,TOT_GRP,totalcc,wgtcc,sex1,urstat1,race1,age,payor2
0,1,0,0,0,0,0,0,0,746.029999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,738.0,0.0,57.0,57.0,56.0,20535,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1,0.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,7.0,3790.0,2672.0,5813.0,5020.0,4516.0,4132.0,3906.0,3580.0,3362.0,3096.0,2954.0,2788.0,2598.0,2428.0,2209.0,2060.0,6128.0,5792.0,4502.0,3457.0,2685.0,1770.0,1782.0,1393.0,1105.0,894.0,754.0,602.0,532.0,0.0,0.0,-1.0,-1.0,-1.0,1.0,-1.0,12922,2016,2016,-1.0,0.0,3,3,3,3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0,0,0,0,0,0,1,0,1,3,1
1,1,0,0,0,0,0,0,0,3240.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,208.0,336.0,1131.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1565.0,0.0,57.0,57.0,56.0,20543,20543.0,20543.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1,0.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,10.0,4068.0,4528.0,5813.0,5020.0,4516.0,4132.0,3906.0,3580.0,3362.0,3096.0,2954.0,2788.0,2598.0,2428.0,2209.0,2060.0,6144.0,935.0,4503.0,3458.0,2686.0,1771.0,1783.0,1394.0,1106.0,895.0,755.0,603.0,533.0,0.0,0.0,-1.0,-1.0,-1.0,1.0,-1.0,12904,2016,2016,0.0,0.0,3,4,3,4,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0,0,0,0,0,0,1,0,1,3,1
2,1,0,0,0,0,0,0,0,4861.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,88.0,116.0,1600.0,0.0,0.0,0.0,0.0,0.0,0.0,1521.0,0.0,0.0,0.0,0.0,1536.0,0.0,57.0,57.0,56.0,20690,20690.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1,0.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,18.0,6647.0,4528.0,6395.0,1290.0,8796.0,4132.0,3906.0,3580.0,3362.0,3096.0,2954.0,2788.0,2598.0,2428.0,2209.0,2060.0,5975.0,5793.0,4503.0,3458.0,2686.0,1771.0,1783.0,1394.0,1106.0,895.0,755.0,603.0,533.0,0.0,0.0,-1.0,-1.0,-1.0,1.0,-1.0,12909,2016,2016,0.0,0.0,8,4,8,4,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0,0,0,0,0,0,0,1,2,3,3
3,8,8,0,0,0,0,0,0,39665.949951,0.0,0.0,0.0,0.0,97.0,0.0,1390.0,0.0,0.0,0.0,0.0,0.0,9992.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5950.949997,690.0,9754.0,0.0,0.0,0.0,8429.0,0.0,2920.0,0.0,0.0,284.0,159.0,0.0,0.0,0.0,49.0,57.0,56.0,20719,20713.0,20713.0,20717.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,1.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,1.0,1.0,10.0,4653.0,3302.0,1106.0,1762.0,4141.0,3964.0,2580.0,2633.0,451.0,2949.0,324.0,1802.0,3019.0,536.0,3828.0,3444.0,3463.0,2718.0,209.0,3457.0,2685.0,1770.0,1782.0,1393.0,1105.0,894.0,754.0,602.0,532.0,0.0,0.0,579.0,9.0,364.0,-1.0,1.0,12894,2016,2016,196.0,245.0,9,5,9,4,3.0,3.0,7.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0,0,0,0,0,0,0,1,1,2,3,1
4,14,4,0,0,10,0,0,0,426110.0,0.0,0.0,2130.0,0.0,0.0,0.0,2898.0,0.0,1004.0,0.0,0.0,0.0,11164.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77470.0,0.0,0.0,0.0,69157.0,56763.0,48094.0,16503.0,0.0,37063.0,43757.0,4788.0,2937.0,0.0,13576.0,4250.0,1704.0,0.0,6258.0,131.0,57.0,75.0,56.0,20643,20633.0,20630.0,20630.0,20636.0,20630.0,20629.0,20629.0,20636.0,20632.0,5,0.0,1.0,6.0,2.0,6.0,6.0,6.0,6.0,6.0,2.0,6.0,6.0,6.0,3.0,3.0,3.0,3.0,18.0,10033.0,6953.0,3212.0,7089.0,6362.0,5743.0,5348.0,1227.0,448.0,3806.0,595.0,3886.0,2598.0,2428.0,2209.0,2060.0,3402.0,4358.0,3295.0,2406.0,1787.0,1375.0,405.0,839.0,140.0,894.0,754.0,602.0,532.0,985.0,0.0,957.0,24.0,911.0,-1.0,1.0,12894,2016,2016,334.0,76.0,7,6,6,6,5.0,2.0,2.0,8.0,2.0,1.0,1.0,8.0,4.0,0,0,0,0,0,0,0,1,1,2,3,1


In [6]:
# remove low pearson correlation features (p-value > 0.01)
low_corr_cols = []
for col in tqdm(use_cols):
    corr_val, p_val = pearsonr(df[col].values, df[TARGET])
    if p_val > 1e-1:
        low_corr_cols.append(col)

print(f"{len(low_corr_cols)} cols are removed!")
use_cols = [col for col in use_cols if col not in low_corr_cols]
print(f"The number of cols used for now is {len(use_cols)}")

HBox(children=(IntProgress(value=0, max=148), HTML(value='')))


10 cols are removed!
The number of cols used for now is 138


In [7]:
use_cols += [TARGET]
pd.DataFrame(use_cols).to_csv("../input/use_cols.csv")