In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

ROOT = Path(".").resolve() #/CS484/project/ or however yours looks
DATA_DIR = ROOT / "csv_files"
TRAIN_DIR = DATA_DIR / "train"

print("TRAIN_DIR:", TRAIN_DIR)

# ---- Config ----
BASE_FILE = TRAIN_DIR / "train_base.csv"

READ_CHUNK_SIZE = 200_000
CASE_ID = "case_id"

TRAIN_DIR: C:\Users\tahir\CS484\project\csv_files\train


In [2]:
#loading the data in a dataframe

df_base = pd.read_csv(BASE_FILE, low_memory=False)
#print(f"df base\n{df_base}")
df_base20 = df_base.head(20).copy()
base_ids = set(df_base20[CASE_ID].unique())

print(f"Loaded {len(df_base20)} base rows")
print(f"{len(base_ids)} unique IDs exist")


Loaded 20 base rows
20 unique IDs exist


In [3]:
def read_filtered(fp: Path, case_id: str, ids: set) -> pd.DataFrame:
    out = []
    try:
        for chunk in pd.read_csv(fp, chunksize=READ_CHUNK_SIZE, low_memory=False):
            if case_id not in chunk.columns:
                return pd.DataFrame()
            filt = chunk[chunk[case_id].isin(ids)]
            if not filt.empty:
                out.append(filt)
    except pd.errors.EmptyDataError:
        return pd.DataFrame()
    return pd.concat(out, ignore_index=True) if len(out)>0 else pd.DataFrame()


In [4]:
def collapse_rows_per_id(df: pd.DataFrame, case_id: str) -> pd.DataFrame:
    """
    For each case_id, sort by a detected date column if present; otherwise by original order.
    Then, for each column, take the last non-null value across that ID's rows.
    Returns one row per case_id with maximal filled values (no new engineered features).
    """
    df = df.copy()

    # 1) try to detect a date-like column to determine row recency
    dcol = None
    for c in df.columns:
        if c == case_id:
            continue
        if "date" in c.lower():
            parsed = pd.to_datetime(df[c], errors="coerce", infer_datetime_format=True)
            if parsed.notna().any():
                df[c] = parsed
                dcol = c
                break

    # 2) sort by id and recency (if we have a date col)
    if dcol is not None:
        df = df.sort_values([case_id, dcol])
    else:
        # preserve original order within each id
        df["_row_order__"] = np.arange(len(df))
        df = df.sort_values([case_id, "_row_order__"])

    # 3) within each id, forward-fill so the last row holds the latest non-null per column
    #    then take that last row
    def _pick_last_filled(g: pd.DataFrame) -> pd.Series:
        g_ff = g.ffill()  # forward-fill within the group, column-wise
        return g_ff.iloc[-1]  # last row now contains last non-null per column

    collapsed = (
        df.groupby(case_id, as_index=False, sort=False)
          .apply(_pick_last_filled)
          .reset_index(drop=True)
    )

    # cleanup helper column if we created it
    if "_row_order__" in collapsed.columns:
        collapsed = collapsed.drop(columns=["_row_order__"], errors="ignore")

    return collapsed


In [5]:
def integrate_file_into_train_all(train_all: pd.DataFrame, df_part: pd.DataFrame, case_id: str, overwrite: bool = False) -> pd.DataFrame:
    df_pick = collapse_rows_per_id(df_part, case_id)

    train_all[case_id] = pd.to_numeric(train_all[case_id], errors="coerce")
    df_pick[case_id] = pd.to_numeric(df_pick[case_id],   errors="coerce")

    # map by ID for fast column-wise fills
    by_id = df_pick.set_index(case_id)
    for col in by_id.columns:
        if col == case_id:
            continue
        if col not in train_all.columns:
            train_all[col] = np.nan
        mapped = train_all[case_id].map(by_id[col])
        if overwrite:
            train_all[col] = mapped.where(mapped.notna(), train_all[col])
        else:
            train_all[col] = train_all[col].fillna(mapped)
    return train_all



In [6]:
train_all = df_base20.copy()
for fp in sorted(TRAIN_DIR.glob("train_*.csv")):
    #just ignore train_base.csv
    if fp.name == "train_base.csv":
        continue

    print(f"Processing {fp.name} ...")
    df_part = read_filtered(fp, CASE_ID, set(train_all[CASE_ID]))
    if df_part.empty:
        print(f"{df_part} was skipped, no matching IDs or no ID column exist")
        continue

    train_all = integrate_file_into_train_all(train_all, df_part, CASE_ID, overwrite=False)
    print("integrated all files")

print("\n\nDone.")
print(f"train_all shape:, {train_all.shape}")

Processing train_applprev_1_0.csv ...


  parsed = pd.to_datetime(df[c], errors="coerce", infer_datetime_format=True)
  g_ff = g.ffill()  # forward-fill within the group, column-wise
  .apply(_pick_last_filled)
  train_all[col] = train_all[col].fillna(mapped)


integrated all files
Processing train_applprev_1_1.csv ...
Empty DataFrame
Columns: []
Index: [] was skipped, no matching IDs or no ID column exist
Processing train_applprev_2.csv ...


  g_ff = g.ffill()  # forward-fill within the group, column-wise
  .apply(_pick_last_filled)


integrated all files
Processing train_credit_bureau_a_1_0.csv ...
Empty DataFrame
Columns: []
Index: [] was skipped, no matching IDs or no ID column exist
Processing train_credit_bureau_a_1_1.csv ...
Empty DataFrame
Columns: []
Index: [] was skipped, no matching IDs or no ID column exist
Processing train_credit_bureau_a_1_2.csv ...
Empty DataFrame
Columns: []
Index: [] was skipped, no matching IDs or no ID column exist
Processing train_credit_bureau_a_1_3.csv ...
Empty DataFrame
Columns: []
Index: [] was skipped, no matching IDs or no ID column exist
Processing train_credit_bureau_a_2_0.csv ...
Empty DataFrame
Columns: []
Index: [] was skipped, no matching IDs or no ID column exist
Processing train_credit_bureau_a_2_1.csv ...
Empty DataFrame
Columns: []
Index: [] was skipped, no matching IDs or no ID column exist
Processing train_credit_bureau_a_2_10.csv ...
Empty DataFrame
Columns: []
Index: [] was skipped, no matching IDs or no ID column exist
Processing train_credit_bureau_a_2_2.csv

  parsed = pd.to_datetime(df[c], errors="coerce", infer_datetime_format=True)
  g_ff = g.ffill()  # forward-fill within the group, column-wise
  .apply(_pick_last_filled)
  train_all[col] = train_all[col].fillna(mapped)
  train_all[col] = train_all[col].fillna(mapped)
  train_all[col] = train_all[col].fillna(mapped)
  train_all[col] = train_all[col].fillna(mapped)
  train_all[col] = train_all[col].fillna(mapped)


integrated all files
Processing train_person_2.csv ...


  g_ff = g.ffill()  # forward-fill within the group, column-wise
  .apply(_pick_last_filled)


integrated all files
Processing train_static_0_0.csv ...


  parsed = pd.to_datetime(df[c], errors="coerce", infer_datetime_format=True)
  parsed = pd.to_datetime(df[c], errors="coerce", infer_datetime_format=True)
  parsed = pd.to_datetime(df[c], errors="coerce", infer_datetime_format=True)
  parsed = pd.to_datetime(df[c], errors="coerce", infer_datetime_format=True)
  g_ff = g.ffill()  # forward-fill within the group, column-wise
  .apply(_pick_last_filled)
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.nan
  train_all[col] = np.n

integrated all files
Processing train_static_0_1.csv ...
Empty DataFrame
Columns: []
Index: [] was skipped, no matching IDs or no ID column exist
Processing train_static_cb_0.csv ...
Empty DataFrame
Columns: []
Index: [] was skipped, no matching IDs or no ID column exist
Processing train_tax_registry_a_1.csv ...
Empty DataFrame
Columns: []
Index: [] was skipped, no matching IDs or no ID column exist
Processing train_tax_registry_b_1.csv ...
Empty DataFrame
Columns: []
Index: [] was skipped, no matching IDs or no ID column exist
Processing train_tax_registry_c_1.csv ...
Empty DataFrame
Columns: []
Index: [] was skipped, no matching IDs or no ID column exist


Done.
train_all shape:, (20, 259)


In [7]:
from IPython.display import display
import pandas as pd

pd.set_option("display.max_rows", None)       # show all 20 rows
pd.set_option("display.max_columns", None)    # show all columns
pd.set_option("display.width", None)

print(f"\nDone. train_all shape: {train_all.shape}")
display(train_all)



Done. train_all shape: (20, 259)


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpd_943P,annuity_853A,approvaldate_319D,byoccupationinc_3656910L,cancelreason_3545846M,childnum_21L,creationdate_885D,credacc_actualbalance_314A,credacc_credlmt_575A,credacc_maxhisbal_375A,credacc_minhisbal_90A,credacc_status_367L,credacc_transactions_402L,credamount_590A,credtype_587L,currdebt_94A,dateactivated_425D,district_544M,downpmt_134A,dtlastpmt_581D,dtlastpmtallstes_3545839D,education_1138M,employedfrom_700D,familystate_726L,firstnonzeroinstldate_307D,inittransactioncode_279L,isbidproduct_390L,isdebitcard_527L,mainoccupationinc_437A,maxdpdtolerance_577P,num_group1,outstandingdebt_522A,pmtnum_8L,postype_4733339M,profession_152M,rejectreason_755M,rejectreasonclient_4145042M,revolvingaccount_394A,status_219L,tenor_203L,cacccardblochreas_147M,conts_type_509L,credacc_cards_status_52L,num_group2,birth_259D,birthdate_87D,childnum_185L,contaddr_district_15M,contaddr_matchlist_1032L,contaddr_smempladdr_334L,contaddr_zipcode_807M,education_927M,empl_employedfrom_271D,empl_employedtotal_800L,empl_industry_691L,empladdr_district_926M,empladdr_zipcode_114M,familystate_447L,gender_992L,housetype_905L,housingtype_772L,incometype_1044T,isreference_387L,language1_981M,mainoccupationinc_384A,maritalst_703L,personindex_1023L,persontype_1072L,persontype_792L,registaddr_district_1083M,registaddr_zipcode_184M,relationshiptoclient_415T,relationshiptoclient_642T,remitter_829L,role_1084L,role_993L,safeguarantyflag_411L,sex_738L,type_25L,addres_district_368M,addres_role_871L,addres_zip_823M,conts_role_79M,empls_economicalst_849M,empls_employedfrom_796D,empls_employer_name_740M,relatedpersons_role_762T,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,clientscnt_533L,clientscnt_887L,clientscnt_946L,cntincpaycont9m_3716944L,cntpmts24_3658933L,commnoinclast6m_3546845L,credamount_770A,credtype_322L,currdebt_22A,currdebtcredtyperange_828A,datefirstoffer_1144D,datelastinstal40dpd_247D,datelastunpaid_3546854D,daysoverduetolerancedd_3976961L,deferredmnthsnum_166L,disbursedcredamount_1113A,disbursementtype_67L,downpmt_116A,dtlastpmtallstes_4499206D,eir_270L,equalitydataagreement_891L,equalityempfrom_62L,firstclxcampaign_1125D,firstdatedue_489D,homephncnt_628L,inittransactionamount_650A,inittransactioncode_186L,interestrate_311L,interestrategrace_34L,isbidproduct_1095L,isbidproductrequest_292L,isdebitcard_729L,lastactivateddate_801D,lastapplicationdate_877D,lastapprcommoditycat_1041M,lastapprcommoditytypec_5251766M,lastapprcredamount_781A,lastapprdate_640D,lastcancelreason_561M,lastdelinqdate_224D,lastdependentsnum_448L,lastotherinc_902A,lastotherlnsexpense_631A,lastrejectcommoditycat_161M,lastrejectcommodtypec_5251769M,lastrejectcredamount_222A,lastrejectdate_50D,lastrejectreason_759M,lastrejectreasonclient_4145040M,lastrepayingdate_696D,lastst_736L,maininc_215A,mastercontrelectronic_519L,mastercontrexist_109L,maxannuity_159A,maxannuity_4075009A,maxdbddpdlast1m_3658939P,maxdbddpdtollast12m_3658940P,maxdbddpdtollast6m_4187119P,maxdebt4_972A,maxdpdfrom6mto36m_3546853P,maxdpdinstldate_3546855D,maxdpdinstlnum_3546846P,maxdpdlast12m_727P,maxdpdlast24m_143P,maxdpdlast3m_392P,maxdpdlast6m_474P,maxdpdlast9m_1059P,maxdpdtolerance_374P,maxinstallast24m_3658928A,maxlnamtstart6m_4525199A,maxoutstandbalancel12m_4187113A,maxpmtlast3m_4525190A,mindbddpdlast24m_3658935P,mindbdtollast24m_4525191P,mobilephncnt_593L,monthsannuity_845L,numactivecreds_622L,numactivecredschannel_414L,numactiverelcontr_750L,numcontrs3months_479L,numincomingpmts_3546848L,numinstlallpaidearly3d_817L,numinstls_657L,numinstlsallpaid_934L,numinstlswithdpd10_728L,numinstlswithdpd5_4187116L,numinstlswithoutdpd_562L,numinstmatpaidtearly2d_4499204L,numinstpaid_4499208L,numinstpaidearly3d_3546850L,numinstpaidearly3dest_4493216L,numinstpaidearly5d_1087L,numinstpaidearly5dest_4493211L,numinstpaidearly5dobd_4499205L,numinstpaidearly_338L,numinstpaidearlyest_4493214L,numinstpaidlastcontr_4325080L,numinstpaidlate1d_3546852L,numinstregularpaid_973L,numinstregularpaidest_4493210L,numinsttopaygr_769L,numinsttopaygrest_4493213L,numinstunpaidmax_3546851L,numinstunpaidmaxest_4493212L,numnotactivated_1143L,numpmtchanneldd_318L,numrejects9m_859L,opencred_647L,paytype1st_925L,paytype_783L,payvacationpostpone_4187118D,pctinstlsallpaidearl3d_427L,pctinstlsallpaidlat10d_839L,pctinstlsallpaidlate1d_3546856L,pctinstlsallpaidlate4d_3546849L,pctinstlsallpaidlate6d_3546844L,pmtnum_254L,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,previouscontdistrict_112M,price_1097A,sellerplacecnt_915L,sellerplacescnt_216L,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,twobodfilling_608L,typesuite_864L,validfrom_1069D
0,0,2019-01-03,201901,0,0,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,,,,,,,,1986-07-01,NaT,,a55475b1,False,False,a55475b1,a55475b1,2017-09-15,MORE_FIVE,OTHER,a55475b1,a55475b1,MARRIED,,,,SALARIED_GOVT,,a55475b1,10800.0,,2.0,5.0,5.0,a55475b1,a55475b1,COLLEAGUE,COLLEAGUE,False,PE,,True,F,PHONE,,,,,,,,,,,1917.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,30000.0,CAL,0.0,0.0,,,,,0.0,30000.0,GBA,0.0,,0.45,,,,NaT,0.0,,CASH,0.45,,False,,,,,a55475b1,a55475b1,,,a55475b1,,,,,a55475b1,a55475b1,,,a55475b1,a55475b1,,,,0.0,0.0,0.0,,,,,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,1.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,OTHER,OTHER,,,,,,,24.0,0.0,0.0,,a55475b1,,0.0,0.0,,,0.0,0.0,,BO,,
1,1,2019-01-03,201901,0,0,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,1957-08-01,NaT,,a55475b1,False,False,a55475b1,a55475b1,2008-10-29,MORE_FIVE,OTHER,a55475b1,a55475b1,DIVORCED,,,,SALARIED_GOVT,,a55475b1,10000.0,,2.0,5.0,5.0,a55475b1,a55475b1,OTHER_RELATIVE,OTHER_RELATIVE,False,PE,,True,M,PHONE,,,,,,,,,,,3134.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,19999.8,CAL,0.0,0.0,,,,,0.0,19999.8,GBA,0.0,,0.2999,,,,NaT,0.0,,CASH,0.2999,0.0,False,,,,,a55475b1,a55475b1,,,a55475b1,,,,,a55475b1,a55475b1,,,a55475b1,a55475b1,,,,0.0,0.0,0.0,,,,,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,1.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,OTHER,OTHER,,,,,,,18.0,0.0,0.0,,a55475b1,,0.0,0.0,,,0.0,0.0,,BO,,
2,2,2019-01-04,201901,0,0,0.0,1682.4,NaT,,a55475b1,0.0,2013-04-03,,0.0,,,,,16000.0,CAL,,,P136_108_173,0.0,,,P97_36_170,2010-02-15,SINGLE,2013-05-04,CASH,False,,8200.0,,1.0,,12.0,a55475b1,a55475b1,a55475b1,a55475b1,,D,12.0,,EMPLOYMENT_PHONE,,1.0,1974-12-01,NaT,,a55475b1,False,False,a55475b1,a55475b1,2010-02-15,MORE_FIVE,OTHER,a55475b1,a55475b1,MARRIED,,,,EMPLOYED,,a55475b1,14000.0,,2.0,4.0,4.0,a55475b1,a55475b1,SPOUSE,SPOUSE,False,PE,,True,F,PHONE,,,,,,,,,,,4937.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,78000.0,CAL,0.0,0.0,,,,,0.0,78000.0,GBA,0.0,,0.45,,,,NaT,1.0,,CASH,0.45,,False,,,,2013-04-03,a55475b1,a55475b1,,,a55475b1,,,,,a55475b1,a55475b1,10000.0,2013-04-03,a55475b1,a55475b1,,D,,0.0,0.0,0.0,,,,,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,2.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,False,OTHER,OTHER,,,,,,,36.0,0.0,0.0,,a55475b1,,0.0,0.0,,,0.0,0.0,,BO,AL,
3,3,2019-01-03,201901,0,0,0.0,6140.0,NaT,,P94_109_143,,2019-01-07,,0.0,,,,,59999.8,CAL,,,P131_33_167,0.0,,,P97_36_170,2018-05-15,MARRIED,2019-02-07,CASH,False,,11000.0,,0.0,,12.0,a55475b1,a55475b1,P94_109_143,a55475b1,,D,12.0,,PRIMARY_EMAIL,,2.0,1993-08-01,NaT,,a55475b1,False,False,a55475b1,a55475b1,2018-05-15,MORE_FIVE,OTHER,a55475b1,a55475b1,MARRIED,,,,EMPLOYED,,a55475b1,10000.0,,1.0,4.0,4.0,a55475b1,a55475b1,SPOUSE,SPOUSE,False,PE,,True,F,PHONE,,,,,,,,,,,4643.6,0.0,0.0,1.0,0.0,2.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,40000.0,CAL,0.0,0.0,,,,,0.0,40000.0,GBA,0.0,,0.42,True,True,,NaT,0.0,,CASH,0.42,0.0,False,,,,2019-01-07,a55475b1,a55475b1,,,P94_109_143,,,,,a55475b1,a55475b1,59999.8,2019-01-07,P94_109_143,a55475b1,,D,,0.0,0.0,0.0,,,,,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,1.0,,0.0,0.0,0.0,1.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,False,OTHER,OTHER,,,,,,,12.0,0.0,0.0,,a55475b1,,1.0,1.0,,,0.0,0.0,,BO,AL,
4,4,2019-01-04,201901,0,1,0.0,2556.6,NaT,,P24_27_36,,2019-01-08,,0.0,,,,,40000.0,CAL,,,P194_82_174,0.0,,,a55475b1,,,2019-02-08,CASH,False,,16000.0,,0.0,,24.0,a55475b1,a55475b1,a55475b1,a55475b1,,T,24.0,,HOME_PHONE,,1.0,1994-01-01,NaT,,a55475b1,False,False,a55475b1,a55475b1,2014-12-15,MORE_FIVE,OTHER,a55475b1,a55475b1,MARRIED,,,,EMPLOYED,,a55475b1,24000.0,,2.0,5.0,5.0,a55475b1,a55475b1,SIBLING,SIBLING,False,PE,,True,F,PHONE,,,,,,,,,,,3390.2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,44000.0,CAL,0.0,0.0,,,,,0.0,44000.0,GBA,0.0,,0.45,,,,NaT,1.0,,CASH,0.45,,False,,,,2019-01-08,a55475b1,a55475b1,,,P24_27_36,,,,,a55475b1,a55475b1,,,a55475b1,a55475b1,,T,,0.0,0.0,0.0,,,,,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,1.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,False,OTHER,OTHER,,,,,,,24.0,0.0,0.0,,a55475b1,,0.0,0.0,,,0.0,0.0,,BO,AL,
5,5,2019-01-02,201901,0,0,0.0,,NaT,,P85_114_140,,2019-01-16,,,,,,,,,,,P54_133_26,,,,a55475b1,,,,,False,,62000.0,,0.0,,,a55475b1,a55475b1,a55475b1,a55475b1,,T,,,PRIMARY_MOBILE,,0.0,1979-10-01,NaT,,a55475b1,False,False,a55475b1,a55475b1,2016-01-15,MORE_FIVE,OTHER,a55475b1,a55475b1,MARRIED,,,,PRIVATE_SECTOR_EMPLOYEE,,a55475b1,64000.0,,1.0,5.0,5.0,a55475b1,a55475b1,FRIEND,FRIEND,False,PE,,True,F,PHONE,a55475b1,,a55475b1,a55475b1,a55475b1,,a55475b1,,,,3600.0,0.0,0.0,1.0,0.0,8.0,2.0,1.0,,,,,,,,,,,INSTANT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,,,,60000.0,REL,0.0,0.0,,,,,0.0,0.0,DD,0.0,,,,,,NaT,0.0,0.0,NDF,,,False,,False,,2019-01-16,a55475b1,a55475b1,,,P85_114_140,,,,,a55475b1,a55475b1,,,a55475b1,a55475b1,,T,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,0.0,0.0,1.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,False,OTHER,OTHER,,,,,,,,0.0,0.0,0.0,a55475b1,,1.0,1.0,,,0.0,0.0,,FO,,
6,6,2019-01-03,201901,0,0,0.0,4189.6,NaT,1.0,P94_109_143,0.0,2017-12-28,,0.0,,,,,32000.0,CAL,0.0,,P82_154_182,0.0,,,P97_36_170,2013-09-15,SINGLE,2018-01-28,CASH,False,,35000.0,,1.0,0.0,11.0,a55475b1,a55475b1,P94_109_143,a55475b1,,D,11.0,,EMPLOYMENT_PHONE,,1.0,1991-01-01,1991-01-01,0.0,a55475b1,False,False,a55475b1,a55475b1,2013-09-15,MORE_FIVE,EDUCATION,a55475b1,a55475b1,SINGLE,F,,PARENTAL,EMPLOYED,True,a55475b1,20000.0,SINGLE,2.0,5.0,5.0,a55475b1,a55475b1,SIBLING,SIBLING,False,PE,FULL,True,F,PHONE,P204_92_178,PERMANENT,P164_28_170,a55475b1,a55475b1,,a55475b1,OTHER_RELATIVE,0.0,,3110.8,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,,,,,,,,CA,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,,,20000.0,CAL,0.0,0.0,,,,,0.0,20000.0,GBA,0.0,,0.45,,,,NaT,0.0,,CASH,0.45,,False,False,,,2018-09-12,a55475b1,a55475b1,,,P94_109_143,,,,,a55475b1,a55475b1,15980.0,2018-09-12,P94_109_143,a55475b1,,D,,0.0,0.0,0.0,,,,,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,3.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,False,OTHER,OTHER,,,,,,,12.0,0.0,0.0,0.0,a55475b1,,0.0,1.0,,,0.0,0.0,,FO,,
7,7,2019-01-03,201901,0,0,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,,,,,,,0.0,1993-09-01,NaT,,a55475b1,False,False,a55475b1,a55475b1,2018-09-15,LESS_ONE,EDUCATION,a55475b1,a55475b1,SINGLE,,,,SALARIED_GOVT,,a55475b1,46000.0,,2.0,5.0,5.0,a55475b1,a55475b1,FRIEND,FRIEND,False,PE,,True,F,PHONE,a55475b1,,a55475b1,a55475b1,a55475b1,,a55475b1,,,,1218.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,,,,,,,,,,,INSTANT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,20300.0,REL,0.0,0.0,,,,,0.0,0.0,DD,0.0,,,,,,NaT,0.0,0.0,NDF,,,False,,False,,,a55475b1,a55475b1,,,a55475b1,,,,,a55475b1,a55475b1,,,a55475b1,a55475b1,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,0.0,0.0,1.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,OTHER,OTHER,,,,,,,,0.0,0.0,0.0,a55475b1,,0.0,0.0,,,0.0,0.0,,FO,AL,
8,8,2019-01-03,201901,0,0,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,,,,,,,0.0,1982-11-01,NaT,,a55475b1,False,False,a55475b1,a55475b1,2016-05-15,MORE_FIVE,OTHER,a55475b1,a55475b1,MARRIED,,,,EMPLOYED,,a55475b1,90000.0,,2.0,5.0,5.0,a55475b1,a55475b1,FRIEND,FRIEND,False,PE,,True,M,PHONE,a55475b1,,a55475b1,a55475b1,a55475b1,,a55475b1,,,,8254.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,CA,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,40000.0,CAL,0.0,0.0,,,,,0.0,40000.0,GBA,0.0,,0.45,,,,NaT,0.0,,CASH,0.45,,False,,,,,a55475b1,a55475b1,,,a55475b1,,,,,a55475b1,a55475b1,,,a55475b1,a55475b1,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,OTHER,OTHER,,,,,,,6.0,0.0,0.0,0.0,a55475b1,,0.0,0.0,,,0.0,0.0,,FO,AL,
9,9,2019-01-03,201901,0,0,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,0.0,1949-10-01,NaT,,a55475b1,False,False,a55475b1,a55475b1,,,,a55475b1,a55475b1,MARRIED,,,,RETIRED_PENSIONER,,a55475b1,100000.0,,1.0,5.0,5.0,a55475b1,a55475b1,CHILD,CHILD,False,PE,,True,M,PHONE,a55475b1,,a55475b1,a55475b1,a55475b1,,a55475b1,,,,4929.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,CA,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,,64000.0,CAL,0.0,0.0,,,,,0.0,64000.0,GBA,0.0,,0.45,,,,NaT,0.0,,CASH,0.45,,False,,,,,a55475b1,a55475b1,,,a55475b1,,,,,a55475b1,a55475b1,,,a55475b1,a55475b1,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,OTHER,OTHER,,,,,,,30.0,0.0,0.0,0.0,a55475b1,,0.0,0.0,,,0.0,0.0,,FO,AL,


In [9]:
cont = train_all["target"]
cont.head()

0    0
1    0
2    0
3    0
4    1
Name: target, dtype: int64

In [None]:
print("Base IDs:", df_base.head(20)[CASE_ID].tolist())
print("Train_all IDs:", train_all[CASE_ID].tolist())
