In [None]:
import os
import subprocess

java_home = subprocess.check_output(["/usr/libexec/java_home", "-v", "17"]).strip().decode('utf-8')

# Set JAVA_HOME and PATH
os.environ["JAVA_HOME"] = java_home
os.environ["PATH"] = os.path.join(java_home, "bin") + ":" + os.environ["PATH"]
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local pyspark-shell"

# Verify JAVA_HOME and Java version
print("JAVA_HOME:", os.environ['JAVA_HOME'])
!java -version

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook with the resource
# settings below passed through as string config values.
spark = (
    SparkSession.builder
    .appName("CBRFSS")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.cores", "4")
    .config("spark.executor.instances", "4")
    .getOrCreate()
)

# Read the raw file as plain text; later cells slice the resulting `value`
# column by character position.
# NOTE(review): the path ends in " / " (space, slash, space) - confirm this
# matches the actual file/directory name on disk.
file_path = "LLCP2023.ASC / "
df = spark.read.text(file_path)

# Quick look at the schema and the first five rows.
df.printSchema()
df.show(5)

We excluded in advance the features that definitely had no relation to diabetes, leaving us with 139 features to be preprocessed, and wrote them to a Parquet file for more efficient processing.

In [None]:
# `expr` is required by every column definition below; import it here so the
# cell runs on a fresh kernel.
from pyspark.sql.functions import expr

# Slice each fixed-width text record (the single `value` column) into named
# string columns. Every entry is `substring(value, start, length)` with a
# 1-based start; the trailing comment gives the column range it covers.
# Entries that are commented out are features deliberately left out.
df = df.select(
    # expr("substring(value, 1, 2) as _STATE"),  # _STATE (1-2)
    # expr("substring(value, 17, 2) as FMONTH"),  # FMONTH (17-18)
    # expr("substring(value, 19, 8) as IDATE"),  # IDATE (19-26)
    # expr("substring(value, 32, 4) as DISPCODE"),  # DISPCODE (32-35)
    # expr("substring(value, 36, 10) as SEQNO"),  # SEQNO (36-45)
    # _PSU shares the SEQNO columns in the CDC layout, so it starts at 36
    # (the previous start of 40 read a shifted window).
    expr("substring(value, 36, 10) as _PSU"),  # _PSU (36-45)
    # expr("substring(value, 63, 1) as CTELENM1"),  # CTELENM1 (63)
    # expr("substring(value, 64, 1) as PVTRESD1"),  # PVTRESD1 (64)
    # expr("substring(value, 66, 1) as STATERE1"),  # STATERE1 (66)
    # expr("substring(value, 67, 1) as CELPHON1"),  # CELPHON1 (67)
    # expr("substring(value, 68, 1) as LADULT1"),  # LADULT1 (68)
    # expr("substring(value, 69, 2) as NUMADULT"),  # NUMADULT (69-70)
    # expr("substring(value, 72, 1) as LANDSEX2"),  # LANDSEX2 (72)
    # expr("substring(value, 73, 1) as LNDSXBRT"),  # LNDSXBRT (73)
    # expr("substring(value, 74, 1) as SAFETIME"),  # SAFETIME (74)
    # expr("substring(value, 75, 1) as CTELNUM1"),  # CTELNUM1 (75)
    # expr("substring(value, 76, 1) as CELLFON5"),  # CELLFON5 (76)
    # expr("substring(value, 77, 1) as CADULT1"),  # CADULT1 (77)
    # expr("substring(value, 78, 1) as CELLSEX2"),  # CELLSEX2 (78)
    # expr("substring(value, 79, 1) as CELSXBRT"),  # CELSXBRT (79)
    # expr("substring(value, 80, 1) as PVTRESD3"),  # PVTRESD3 (80)
    # expr("substring(value, 81, 1) as CCLGHOUS"),  # CCLGHOUS (81)
    # expr("substring(value, 82, 1) as CSTATE1"),  # CSTATE1 (82)
    # expr("substring(value, 85, 1) as LANDLINE"),  # LANDLINE (85)
    # expr("substring(value, 86, 2) as HHADULT"),  # HHADULT (86-87)
    expr("substring(value, 88, 1) as SEXVAR"),  # SEXVAR (88)
    expr("substring(value,101 ,1) as GENHLTH"),  # GENHLTH (101)
    expr("substring(value ,102 ,2) as PHYSHLTH"),  # PHYSHLTH (102-103)
    expr("substring(value ,104 ,2) as MENTHLTH"),  # MENTHLTH (104-105)
    # expr("substring(value ,106 ,2) as POORHLTH"),  # POORHLTH (106-107)
    # expr("substring(value ,108 ,2) as PRIMINS1"),  # PRIMINS1 (108-109)
    # expr("substring(value ,110 ,1) as PERSDOC3"),  # PERSDOC3 (110)
    # expr("substring(value ,111 ,1) as MEDCOST1"),  # MEDCOST1 (111)
    expr("substring(value ,112 ,1) as CHECKUP1 "),  # CHECKUP1(112)
    # expr("substring(value ,113 ,1 )as EXERANY2 "),  # EXERANY2(113)
    expr(" substring( value ,114 ,2 )as EXRACT12 "),  # EXRACT12(114-115)
    expr(" substring( value ,116 ,3 )as EXEROFT "),  # EXEROFT(116-118)
    expr(" substring( value ,119 ,3 )as EXERHMM "),  # EXERHMM(119-121)
    # expr(" substring( value ,122 ,2 )as EXRACT22 "),  # EXRACT22(122-123)
    # expr(" substring( value ,124 ,3 )as EXEROFT2 "),  # EXEROFT2(124-126)
    # expr(" substring( value ,127 ,3 )as EXERHMM2 "),  # EXERHMM2(127-129)
    # expr(" substring( value ,130 ,3 )as STRENGTH "),  # STRENGTH(130-132)
    expr(" substring( value ,133 ,1 )as BPHIGH6 "),  # BPHIGH6(133)
    expr(" substring( value ,134 ,1 )as BPMEDS "),  ## BPMEDS(134),
    # expr("substring(value, 135, 1) as CHOLCHK3"),  # CHOLCHK3 (135)
    expr("substring(value, 136, 1) as TOLDHI3"),  # TOLDHI3 (136)
    expr("substring(value, 137, 1) as CHOLMED3"),  # CHOLMED3 (137)
    expr("substring(value, 138, 1) as CVDINFR4"),  # CVDINFR4 (138)
    expr("substring(value, 139, 1) as CVDCRHD4"),  # CVDCRHD4 (139)
    expr("substring(value, 140, 1) as CVDSTRK3"),  # CVDSTRK3 (140)
    expr("substring(value, 141, 1) as ASTHMA3"),  # ASTHMA3 (141)
    # expr("substring(value, 142, 1) as ASTHNOW"),  # ASTHNOW (142)
    expr("substring(value, 143, 1) as CHCSCNC1"),  # CHCSCNC1 (143)
    expr("substring(value, 144, 1) as CHCOCNC1"),  # CHCOCNC1 (144)
    expr("substring(value, 145, 1) as CHCCOPD3"),  # CHCCOPD3 (145)
    expr("substring(value, 146, 1) as ADDEPEV3"),  # ADDEPEV3 (146)
    expr("substring(value, 147, 1) as CHCKDNY2"),  # CHCKDNY2 (147)
    expr("substring(value, 148, 1) as HAVARTH4"),  # HAVARTH4 (148)
    expr("substring(value, 149, 1) as DIABETE4"),  # DIABETE4 (149)
    # expr("substring(value, 150, 2) as DIABAGE4"),  # DIABAGE4 (150-151)
    expr("substring(value, 186, 1) as MARITAL"),  # MARITAL (186)
    expr("substring(value, 187, 1) as EDUCA"),  # EDUCA (187)
    # expr("substring(value, 188, 1) as RENTHOM1"),  # RENTHOM1 (188)
    # expr("substring(value, 197, 1) as NUMHHOL4"),  # NUMHHOL4 (197)
    # expr("substring(value, 198, 1) as NUMPHON4"),  # NUMPHON4 (198)
    # expr("substring(value, 199, 1) as CPDEMO1C"),  # CPDEMO1C (199)
    # expr("substring(value, 200, 1) as VETERAN3"),  # VETERAN3 (200)
    expr("substring(value, 201, 1) as EMPLOY1"),  # EMPLOY1 (201)
    # expr("substring(value, 202, 2) as CHILDREN"),  # CHILDREN (202-203)
    expr("substring(value,204 ,2 )as INCOME3 "),  # INCOME3(204-205)
    expr(" substring( value ,206 ,1 )as PREGNANT "),  # PREGNANT(206)
    expr(" substring( value ,207 ,4 )as WEIGHT2 "),  # WEIGHT2(207-210)
    # expr(" substring( value ,211 ,4 )as HEIGHT3 "),  # HEIGHT3(211-214)
    # expr(" substring( value ,215 ,1 )as DEAF "),  # DEAF(215)
    # expr(" substring( value ,216 ,1 )as BLIND "),  # BLIND(216)
    # expr(" substring( value ,217 ,1 )as DECIDE "),  ## DECIDE(217)
    expr(" substring( value ,218 ,1 )as DIFFWALK "),  ## DIFFWALK(218)
    # expr(" substring( value ,219 ,1 )as DIFFDRES "),  ## DIFFDRES(219)
    expr(" substring( value ,220 ,1 )as DIFFALON "),  ## DIFFALON(220)
    expr(" substring( value ,221 ,2 )as FALL12MN "),  ## FALL12MN(221-222)
    # expr(" substring( value ,223 ,2 )as FALLINJ5 "),  ## FALLINJ5(223-224)
    expr(" substring( value ,225 ,1 )as SMOKE100 "),  ## SMOKE100(225)
    expr(" substring( value ,226 ,1 )as SMOKDAY2 "),  ## SMOKDAY2(226)
    expr(" substring( value ,227 ,1 )as USENOW3 "),  ## USENOW3(227)
    expr(" substring( value ,228 ,1 )as ECIGNOW2 "),  ## ECIGNOW2(228)
    expr(" substring( value ,229 ,3 )as ALCDAY4 "),  ## ALCDAY4(229-231)
    expr(" substring( value ,232 ,2 )as AVEDRNK3 "),  ## AVEDRNK3(232-233)
    # expr(" substring( value ,234 ,2 )as DRNK3GE5 "),  ## DRNK3GE5(234-235)
    # expr(" substring( value ,236 ,2 )as MAXDRNKS "),  ## MAXDRNKS(236-237)
    expr(" substring( value ,238 ,1 )as FLUSHOT7 "),  ## FLUSHOT7(238)
    # expr(" substring( value ,239 ,6 )as FLSHTMY3 "),  ## FLSHTMY3(239-244)
    expr(" substring( value ,245 ,1 )as PNEUVAC4 "),  ## PNEUVAC4(245)
    expr(" substring( value ,246 ,1 )as SHINGLE2 "),  ## SHINGLE2(246)
    expr(" substring( value ,247 ,1 )as HIVTST7 "),  ## HIVTST7(247)
    expr(" substring( value ,248 ,6 )as HIVTSTD3 "),  ## HIVTSTD3(248-253)
    # expr(" substring( value ,254 ,1 )as SEATBELT "),  ## SEATBELT(254)
    # expr("substring(value, 255, 2) as DRNKDRI2"),  # DRNKDRI2 (255-256)
    expr("substring(value, 257, 1) as COVIDPO1"),  # COVIDPO1 (257)
    expr("substring(value, 258, 1) as COVIDSM1"),  # COVIDSM1 (258)
    # expr("substring(value, 259, 1) as COVIDACT"),  # COVIDACT (259)
    expr("substring(value, 260, 1) as PDIABTS1"),  # PDIABTS1 (260)
    expr("substring(value, 261, 1) as PREDIAB2"),  # PREDIAB2 (261)
    expr("substring(value, 262, 1) as DIABTYPE"),  # DIABTYPE (262)
    expr("substring(value, 263, 1) as INSULIN1"),  # INSULIN1 (263)
    expr("substring(value, 264, 2) as CHKHEMO3"),  # CHKHEMO3 (264-265)
    expr("substring(value, 266, 1) as EYEEXAM1"),  # EYEEXAM1 (266)
    expr("substring(value, 267, 1) as DIABEYE1"),  # DIABEYE1 (267)
    expr("substring(value, 268, 1) as DIABEDU1"),  # DIABEDU1 (268)
    expr("substring(value, 269, 1) as FEETSORE"),  # FEETSORE (269)
    expr("substring(value, 270, 1) as ARTHEXER"),  # ARTHEXER (270)
    # expr("substring(value, 271, 1) as ARTHEDU"),  # ARTHEDU (271)
    # expr("substring(value, 272, 1) as LMTJOIN3"),  # LMTJOIN3 (272)
    # expr("substring(value, 273, 1) as ARTHDIS2"),  # ARTHDIS2 (273)
    # expr("substring(value, 274, 2) as JOINPAI2"),  # JOINPAI2 (274-275)
    # expr("substring(value, 276, 3) as LCSFIRST"),  # LCSFIRST (276-278)
    expr("substring(value, 279, 3) as LCSLAST"),  # LCSLAST (279-281)
    expr("substring(value, 282, 3) as LCSNUMCG"),  # LCSNUMCG (282-284)
    # expr("substring(value, 285, 1) as LCSCTSC1"),  # LCSCTSC1 (285)
    # expr("substring(value, 286, 1) as LCSSCNCR"),  # LCSSCNCR (286)
    # expr("substring(value, 287, 1) as LCSCTWHN"),  # LCSCTWHN (287)
    expr("substring(value, 288, 1) as HADMAM"),  # HADMAM (288)
    # expr("substring(value, 289, 1) as HOWLONG"),  # HOWLONG (289)
    # expr("substring(value ,290 ,1 )as CERVSCRN "),  ## CERVSCRN(290)
    # expr(" substring( value ,291 ,1 )as CRVCLCNC "),  ## CRVCLCNC(291)
    # expr(" substring( value ,292 ,1 )as CRVCLPAP "),  ## CRVCLPAP(292)
    # expr(" substring( value ,293 ,1 )as CRVCLHPV "),  ## CRVCLHPV(293)
    # expr(" substring( value ,294 ,1 )as HADHYST2 "),  ## HADHYST2(294)
    # expr(" substring( value ,295 ,1 )as PSATEST1 "),  ## PSATEST1(295)
    expr(" substring( value ,296 ,1 )as PSATIME1 "),  ## PSATIME1(296)
    # expr(" substring( value ,297 ,1 )as PCPSARS2 "),  ## PCPSARS2(297)
    # expr(" substring( value ,298 ,1 )as PSASUGS2 "),  ## PSASUGS2(298)
    # expr(" substring( value ,299 ,1 )as PCSTALK2 "),  ## PCSTALK2(299)
    # expr(" substring( value ,300 ,1 )as HADSIGM4 "),  ## HADSIGM4(300)
    expr(" substring( value ,301 ,1 )as COLNSIGM "),  ## COLNSIGM(301)
    expr(" substring( value ,302 ,1 )as COLNTES1"),  ## COLNTES1(302)
    expr("substring(value, 303, 1) as SIGMTES1"),  # SIGMTES1 (303)
    # expr("substring(value, 304, 1) as LASTSIG4"),  # LASTSIG4 (304)
    expr("substring(value, 305, 1) as COLNCNCR"),  # COLNCNCR (305)
    # expr("substring(value, 306, 1) as VIRCOLO1"),  # VIRCOLO1 (306)
    expr("substring(value, 307, 1) as VCLNTES2"),  # VCLNTES2 (307)
    # expr("substring(value, 308, 1) as SMALSTOL"),  # SMALSTOL (308)
    expr("substring(value, 309, 1) as STOLTEST"),  # STOLTEST (309)
    # expr("substring(value, 310, 1) as STOOLDN2"),  # STOOLDN2 (310)
    # expr("substring(value, 311, 1) as BLDSTFIT"),  # BLDSTFIT (311)
    expr("substring(value, 312, 1) as SDNATES1"),  # SDNATES1 (312)
    expr("substring(value, 313, 1) as CNCRDIFF"),  # CNCRDIFF (313)
    # expr("substring(value, 314, 2) as CNCRAGE"),  # CNCRAGE (314-315)
    expr("substring(value, 316, 2) as CNCRTYP2"),  # CNCRTYP2 (316-317)
    expr("substring(value, 318, 1) as CSRVTRT3"),  # CSRVTRT3 (318)
    expr("substring(value, 319, 2) as CSRVDOC1"),  # CSRVDOC1 (319-320)
    # expr("substring(value, 321, 1) as CSRVSUM"),  # CSRVSUM (321)
    # expr("substring(value, 322, 1) as CSRVRTRN"),  # CSRVRTRN (322)
    # expr("substring(value, 323, 1) as CSRVINST"),  # CSRVINST (323)
    # expr("substring(value, 324, 1) as CSRVINSR"),  # CSRVINSR (324)
    # expr("substring(value, 325, 1) as CSRVDEIN"),  # CSRVDEIN (325)
    # expr("substring(value, 326, 1) as CSRVCLIN"),  # CSRVCLIN (326)
    # expr("substring(value, 327, 1) as CSRVPAIN"),  # CSRVPAIN (327)
    expr("substring(value, 328, 1) as CSRVCTL2"),  # CSRVCTL2 (328)
    expr("substring(value, 329, 3) as INDORTAN"),  # INDORTAN (329-331)
    expr("substring(value, 332, 3) as NUMBURN"),  # NUMBURN (332-334)
    # expr("substring(value, 335, 1) as SUNPRTCT"),  # SUNPRTCT (335)
    # expr("substring(value, 336, 2) as WKDAYOUT"),  # WKDAYOUT (336-337)
    # expr("substring(value, 338, 2) as WKENDOUT"),  # WKENDOUT (338-339)
    expr("substring(value, 340, 1) as CIMEMLO1"),  # CIMEMLO1 (340)
    # expr("substring(value, 341, 1) as CDWORRY"),  # CDWORRY (341)
    # expr("substring(value, 342, 1) as CDDISCU1"),  # CDDISCU1 (342)
    expr("substring(value, 343, 1) as CDHOUS1"),  # CDHOUS1 (343)
    expr("substring(value, 344, 1) as CDSOCIA1"),  # CDSOCIA1 (344)
    # expr("substring(value, 345, 1) as CAREGIV1"),  # CAREGIV1 (345)
    # expr("substring(value, 346, 2) as CRGVREL4"),  # CRGVREL4 (346-347)
    # expr("substring(value, 348, 1) as CRGVLNG1"),  # CRGVLNG1 (348)
    # expr("substring(value, 349, 1) as CRGVHRS1"),  # CRGVHRS1 (349)
    # expr("substring(value, 350, 2) as CRGVPRB3"),  # CRGVPRB3 (350-351)
    # expr("substring(value, 352, 1) as CRGVALZD"),  # CRGVALZD (352)
    # expr("substring(value, 353, 1) as CRGVPER1"),  # CRGVPER1 (353)
    # expr("substring(value, 354, 1) as CRGVHOU1"),  # CRGVHOU1 (354)
    # expr("substring(value, 355, 2) as LASTSMK2"),  # LASTSMK2 (355-356)
    expr("substring(value, 358, 1) as STOPSMK2"),  # STOPSMK2 (358)
    # expr("substring(value, 359, 1) as MENTCIGS"),  # MENTCIGS (359)
    # # expr("substring(value, 360, 1) as MENTECIG"),  # MENTECIG (360)
    # expr("substring(value, 361, 1) as HEATTBCO"),  # HEATTBCO (361)
    # expr("substring(value, 362, 1) as FIREARM5"),  # FIREARM5 (362)
    expr("substring(value, 363, 1) as GUNLOAD"),  # GUNLOAD (363)
    # expr("substring(value, 364, 1) as LOADULK2"),  # LOADULK2 (364)
    # expr("substring(value, 565, 1) as HASYMP1"),  # HASYMP1 (565)
    # expr("substring(value, 566, 1) as HASYMP2"),  # HASYMP2 (566)
    # expr("substring(value, 567, 1) as HASYMP3"),  # HASYMP3 (567)
    # expr("substring(value, 568, 1) as HASYMP4"),  # HASYMP4 (568)
    # expr("substring(value, 569, 1) as HASYMP5"),  # HASYMP5 (569)
    # expr("substring(value, 570, 1) as HASYMP6"),  # HASYMP6 (570)
    # expr("substring(value, 571, 1) as STRSYMP1"),  # STRSYMP1 (571)
    # expr("substring(value, 572, 1) as STRSYMP2"),  # STRSYMP2 (572)
    # expr("substring(value, 573, 1) as STRSYMP3"),  # STRSYMP3 (573)
    # expr("substring(value, 574, 1) as STRSYMP4"),  # STRSYMP4 (574)
    # expr("substring(value, 575, 1) as STRSYMP5"),  # STRSYMP5 (575)
    # expr("substring(value, 576, 1) as STRSYMP6"),  # STRSYMP6 (576)
    # expr("substring(value, 577, 1) as FIRSTAID"),  # FIRSTAID (577)
    expr("substring(value, 578, 1) as ASPIRIN"),  # ASPIRIN (578)
    # expr("substring(value, 579, 1) as BIRTHSEX"),  # BIRTHSEX (579)
    # expr("substring(value, 580, 1) as SOMALE"),  # SOMALE (580)
    # expr("substring(value, 581, 1) as SOFEMALE"),  # SOFEMALE (581)
    # expr("substring(value, 582, 1) as TRNSGNDR"),  # TRNSGNDR (582)
    # expr("substring(value, 583, 2) as MARIJAN1"),  # MARIJAN1 (583-584)
    # expr("substring(value, 585, 1) as MARJSMOK"),  # MARJSMOK (585)
    # expr("substring(value, 586, 1) as MARJEAT"),  # MARJEAT (586)
    # expr("substring(value, 587, 1) as MARJVAPE"),  # MARJVAPE (587)
    # expr("substring(value, 588, 1) as MARJDAB"),  # MARJDAB (588)
    # expr("substring(value, 589, 1) as MARJOTHR"),  # MARJOTHR (589)
    # expr("substring(value, 590, 1) as USEMRJN4"),  # USEMRJN4 (590)
    # expr("substring(value, 591, 1) as ACEDEPRS"),  # ACEDEPRS (591)
    expr("substring(value, 592, 1) as ACEDRINK"),  # ACEDRINK (592)
    expr("substring(value, 593, 1) as ACEDRUGS"),  # ACEDRUGS (593)
    # expr("substring(value, 594, 1) as ACEPRISN"),  # ACEPRISN (594)
    # expr("substring(value, 595, 1) as ACEDIVRC"),  # ACEDIVRC (595)
    # expr("substring(value, 596, 1) as ACEPUNCH"),  # ACEPUNCH (596)
    # expr("substring(value, 597, 1) as ACEHURT1"),  # ACEHURT1 (597)
    # expr("substring(value, 598, 1) as ACESWEAR"),  # ACESWEAR (598)
    # expr("substring(value, 599, 1) as ACETOUCH"),  # ACETOUCH (599)
    # expr("substring(value, 600, 1) as ACETTHEM"),  # ACETTHEM (600)
    # expr("substring(value, 601, 1) as ACEHVSEX"),  # ACEHVSEX (601)
    # expr("substring(value, 602, 1) as ACEADSAF"),  # ACEADSAF (602)
    # expr("substring(value, 603, 1) as ACEADNED"),  # ACEADNED (603)
    # expr("substring(value, 604, 2) as IMFVPLA4"),  # IMFVPLA4 (604-605)
    expr("substring(value, 606, 1) as HPVADVC4"),  # HPVADVC4 (606)
    expr("substring(value, 607, 2) as HPVADSHT"),  # HPVADSHT (607-608)
    expr("substring(value, 609, 1) as TETANUS1"),  # TETANUS1 (609)
    # expr("substring(value, 610, 1) as COVIDVA1"),  # COVIDVA1 (610)
    # expr("substring(value, 611, 1) as COVACGE1"),  # COVACGE1 (611)
    expr("substring(value, 612, 1) as COVIDNU2"),  # COVIDNU2 (612)
    expr("substring(value, 613, 1) as LSATISFY"),  # LSATISFY (613)
    expr("substring(value, 614, 1) as EMTSUPRT"),  # EMTSUPRT (614)
    expr("substring(value, 615, 1) as SDLONELY"),  # SDLONELY (615)
    expr("substring(value, 616, 1) as SDHEMPLY"),  # SDHEMPLY (616)
    # expr("substring(value, 617, 1) as FOODSTMP"),  # FOODSTMP (617)
    # expr("substring(value, 618, 1) as SDHFOOD1"),  # SDHFOOD1 (618)
    # expr("substring(value, 619, 1) as SDHBILLS"),  # SDHBILLS (619)
    # expr("substring(value, 620, 1) as SDHUTILS"),  # SDHUTILS (620)
    # expr("substring(value, 621, 1) as SDHTRNSP"),  # SDHTRNSP (621)
    # expr("substring(value, 622, 1) as SDHSTRE1"),  # SDHSTRE1 (622)
    expr("substring(value, 623, 2) as RRCLASS3"),  # RRCLASS3 (623-624)
    expr("substring(value, 625, 1) as RRCOGNT2"),  # RRCOGNT2 (625)
    expr("substring(value, 626, 1) as RRTREAT"),  # RRTREAT (626)
    # expr("substring(value, 627, 1) as RRATWRK2"),  # RRATWRK2 (627)
    # expr("substring(value, 628, 1) as RRHCARE4"),  # RRHCARE4 (628)
    # expr("substring(value, 629, 1) as RRPHYSM2"),  # RRPHYSM2 (629)
    # expr("substring(value, 636, 1) as RCSGEND1"),  # RCSGEND1 (636)
    # expr("substring(value, 637, 1) as RCSXBRTH"),  # RCSXBRTH (637)
    # expr("substring(value, 670, 1) as RCSRLTN2"),  # RCSRLTN2 (670)
    # expr("substring(value, 671, 1) as CASTHDX2"),  # CASTHDX2 (671)
    # expr("substring(value, 672, 1) as CASTHNO2"),  # CASTHNO2 (672)
    # expr("substring(value, 677, 2) as QSTVER"),  # QSTVER (677-678)
    # expr("substring(value, 679, 2) as QSTLANG"),  # QSTLANG (679-680)
    # expr("substring(value, 1402, 1) as _METSTAT"),  # _METSTAT (1402)
    # expr("substring(value, 1403, 1) as _URBSTAT"),  # _URBSTAT (1403)
    # expr("substring(value, 1409, 1) as MSCODE"),  # MSCODE (1409)
    expr("substring(value, 1410, 6) as _STSTR"),  # _STSTR (1410-1415)
    # expr("substring(value, 1416, 10) as _STRWT"),  # _STRWT (1416-1425)
    # expr("substring(value, 1446, 10) as _RAWRAKE"),  # _RAWRAKE (1446-1455)
    # expr("substring(value, 1456, 10) as _WT2RAKE"),  # _WT2RAKE (1456-1465)
    expr("substring(value, 1470, 2) as _IMPRACE"),  # _IMPRACE (1470-1471)
    # expr("substring(value, 1480, 1) as _CHISPNC"),  # _CHISPNC (1480)
    # expr("substring(value, 1539, 2) as _CRACE"),  # _CRACE (1539-1540)
    # expr("substring(value, 1567, 1) as CAGEG"),  # CAGEG (1567)
    # expr("substring(value, 1583, 10) as _CLLCPWT"),  # _CLLCPWT (1583-1592)
    # expr("substring(value, 1680, 1) as _DUALUSE"),  # _DUALUSE (1680)
    # expr("substring(value, 1685, 10) as _DUALCOR"),  # _DUALCOR (1685-1694)
    # expr("substring(value, 1695, 10) as _LLCPWT2"),  # _LLCPWT2 (1695-1704)
    # expr("substring(value, 1749, 10) as _LLCPWT"),  # _LLCPWT (1749-1758)
    expr("substring(value, 1897, 1) as _RFHLTH"),  # _RFHLTH (1897)
    expr("substring(value, 1898, 1) as _PHYS14D"),  # _PHYS14D (1898)
    expr("substring(value, 1899, 1) as _MENT14D"),  # _MENT14D (1899)
    expr("substring(value, 1900, 1) as _HLTHPL1"),  # _HLTHPL1 (1900)
    expr("substring(value, 1901, 1) as _HCVU653"),  # _HCVU653 (1901)
    expr("substring(value ,1902 ,1 )as _TOTINDA "),  ## _TOTINDA(1902)
    # METVL12_ starts right after _TOTINDA (1902) and ends right before
    # METVL22_ (1906); starting at 1902 overlapped _TOTINDA by one column.
    expr("substring(value, 1903, 3) as METVL12_"),  # METVL12_ (1903-1905)
    # expr("substring(value, 1906, 3) as METVL22_"),  # METVL22_ (1906-1908)
    # expr("substring(value, 1909, 5) as MAXVO21_"),  # MAXVO21_ (1909-1913)
    expr("substring(value, 1914, 5) as FC601_"),  # FC601_ (1914-1918)
    expr("substring(value, 1919, 1) as ACTIN13_"),  # ACTIN13_ (1919)
    # expr("substring(value, 1920, 1) as ACTIN23_"),  # ACTIN23_ (1920)
    expr("substring(value, 1921, 3) as PADUR1_"),  # PADUR1_ (1921-1923)
    # expr("substring(value, 1924, 3) as PADUR2_"),  # PADUR2_ (1924-1926)
    expr("substring(value, 1927, 5) as PAFREQ1_"),  # PAFREQ1_ (1927-1931)
    # expr("substring(value, 1932, 5) as PAFREQ2_"),  # PAFREQ2_ (1932-1936)
    expr("substring(value, 1937, 5) as _MINAC12"),  # _MINAC12 (1937-1941)
    # expr("substring(value, 1942, 5) as _MINAC22"),  # _MINAC22 (1942-1946)
    expr("substring(value, 1947, 5) as STRFREQ_"),  # STRFREQ_ (1947-1951)
    expr("substring(value, 1952, 1) as PAMISS3_"),  # PAMISS3_ (1952)
    expr("substring(value, 1953, 5) as PAMIN13_"),  # PAMIN13_ (1953-1957)
    # expr("substring(value, 1958, 5) as PAMIN23_"),  # PAMIN23_ (1958-1962)
    expr("substring(value, 1963, 5) as PA3MIN_"),  # PA3MIN_ (1963-1967)
    expr("substring(value, 1968, 5) as PAVIG13_"),  # PAVIG13_ (1968-1972)
    # expr("substring(value, 1973, 5) as PAVIG23_"),  # PAVIG23_ (1973-1977)
    expr("substring(value, 1978, 5) as PA3VIGM_"),  # PA3VIGM_ (1978-1982)
    expr("substring(value ,1983 ,1 )as _PACAT3 "),  ## _PACAT3(1983)
    # expr(" substring( value ,1984 ,1 )as _PAINDX3 "),  ## _PAINDX3(1984)
    # expr(" substring( value ,1985 ,1 )as _PA150R4 "),  ## _PA150R4(1985)
    # expr(" substring( value ,1986 ,1 )as _PA300R4 "),  ## _PA300R4(1986)
    # expr(" substring( value ,1987 ,1 )as _PA30023 "),  ## _PA30023(1987)
    # expr(" substring( value ,1988 ,1 )as _PASTRNG "),  ## _PASTRNG(1988)
    # expr(" substring( value ,1989 ,1 )as _PAREC3 "),  ## _PAREC3(1989)
    # expr(" substring( value ,1990 ,1 )as _PASTAE3 "),  ## _PASTAE3(1990)
    expr(" substring( value ,1991 ,1 )as _RFHYPE6 "),  ## _RFHYPE6(1991)
    expr(" substring( value ,1992 ,1 )as _CHOLCH3 "),  ## _CHOLCH3(1992)
    expr(" substring( value ,1993 ,1 )as _RFCHOL3 "),  ## _RFCHOL3(1993)
    expr(" substring( value ,1994 ,1 )as _MICHD "),  ## _MICHD(1994)
    expr(" substring( value ,1995 ,1 )as _LTASTH1 "),  ## _LTASTH1(1995)
    expr(" substring( value ,1996 ,1 )as _CASTHM1 "),  ## _CASTHM1(1996)
    expr(" substring( value ,1997 ,1 )as _ASTHMS1 "),  ## _ASTHMS1(1997)
    expr(" substring( value ,1998 ,1 )as _DRDXAR2 "),  ## _DRDXAR2(1998)
    expr("substring(value, 2055, 2) as _MRACE1"),  # _MRACE1 (2055-2056)
    # expr("substring(value, 2059, 1) as _HISPANC"),  # _HISPANC (2059)
    expr("substring(value, 2060, 1) as _RACE"),  # _RACE (2060)
    # expr("substring(value, 2061, 1) as _RACEG21"),  # _RACEG21 (2061)
    expr("substring(value, 2062, 1) as _RACEGR3"),  # _RACEGR3 (2062)
    expr("substring(value, 2063, 1) as _RACEPRV"),  # _RACEPRV (2063)
    expr("substring(value, 2064, 1) as _SEX"),  # _SEX (2064)
    expr("substring(value, 2065, 2) as _AGEG5YR"),  # _AGEG5YR (2065-2066)
    # expr("substring(value, 2067, 1) as _AGE65YR"),  # _AGE65YR (2067)
    # expr("substring(value, 2068, 2) as _AGE80"),  # _AGE80 (2068-2069)
    # expr("substring(value, 2070, 1) as _AGE_G"),  # _AGE_G (2070)
    # expr("substring(value, 2071, 3) as HTIN4"),  # HTIN4 (2071-2073)
    expr("substring(value, 2074, 3) as HTM4"),  # HTM4 (2074-2076)
    expr("substring(value, 2077, 5) as WTKG3"),  # WTKG3 (2077-2081)
    expr("substring(value, 2082, 4) as _BMI5"),  # _BMI5 (2082-2085)
    expr("substring(value, 2086, 1) as _BMI5CAT"),  # _BMI5CAT (2086)
    expr("substring(value, 2087, 1) as _RFBMI5"),  # _RFBMI5 (2087)
    # expr("substring(value, 2088, 1) as _CHLDCNT"),  # _CHLDCNT (2088)
    expr("substring(value, 2089, 1) as _EDUCAG"),  # _EDUCAG (2089)
    expr("substring(value, 2090, 1) as _INCOMG1"),  # _INCOMG1 (2090)
    expr("substring(value, 2091, 1) as _SMOKER3"),  # _SMOKER3 (2091)
    # expr("substring(value, 2092, 1) as _RFSMOK3"),  # _RFSMOK3 (2092)
    expr("substring(value, 2093, 1) as _CURECI2"),  # _CURECI2 (2093)
    # expr("substring(value, 2094, 1) as DRNKANY6"),  # DRNKANY6 (2094)
    # expr("substring(value, 2095, 3) as DROCDY4_"),  # DROCDY4_ (2095-2097)
    # expr("substring(value, 2098, 1) as _RFBING6"),  # _RFBING6 (2098)
    expr("substring(value, 2099, 5) as _DRNKWK2"),  # _DRNKWK2 (2099-2103)
    # expr("substring(value ,2104 ,1 )as _RFDRHV8 "),  ## _RFDRHV8(2104)
    # expr(" substring( value ,2105 ,1 )as _FLSHOT7 "),  ## _FLSHOT7(2105)
    # expr(" substring( value ,2106 ,1 )as PNEUMO3 "),  ## PNEUMO3(2106)
    # NOTE(review): the CDC layout appears to name this variable _AIDTST4;
    # the alias is left as-is so downstream column references keep working.
    expr(" substring( value ,2107 ,1 )as AIDTST4 "),  ## AIDTST4(2107)
    # expr(" substring( value ,2108 ,1 )as RFSEAT2 "),  ## RFSEAT2(2108)
    # expr(" substring( value ,2109 ,1 )as RFSEAT3 "),  ## RFSEAT3(2109)
    # expr(" substring( value ,2110 ,1 )as DRNKDRV "),  ## DRNKDRV(2110)
)

# Persist the selected columns. Overwrite mode lets the notebook be re-run
# (Restart & Run All) without failing because "data.parquet" already exists.
df.write.mode("overwrite").parquet("data.parquet")