In [None]:
import gc

import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
from tqdm import tqdm

In [None]:
# Read the expanded estimation dataset from the project-relative data folder.
raw = pd.read_parquet(path="data/us_estdata_expanded.parquet")

In [None]:
# Run configuration.
NUM_SAMPLE = 10_000  # number of leading rows of the raw data that are kept
# NOTE(review): ALTERNATIVE_SAMPLE is not referenced by any visible cell; the
# per-alternative loops hard-code 200 instead — confirm the two are meant to agree.
ALTERNATIVE_SAMPLE = 200

In [None]:
# Keep an independent copy of the first NUM_SAMPLE rows, then drop the name
# bound to the full frame so its memory can be released.
df = raw.head(NUM_SAMPLE).copy()
del raw


In [None]:
def condense(df):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    Non-numeric columns are skipped. For every numeric column:

    * values strictly inside (-1, 1), or any fractional / missing / infinite
      values -> ``float32``. Fractional columns are never cast to an integer
      type, because that would silently truncate them; note that ``float32``
      keeps only about seven significant digits.
    * whole values strictly inside the int8, int16 or int32 range -> the
      narrowest of those integer types.
    * whole values outside the int32 range are left at their current dtype
      instead of being wrapped around by an overflowing cast.

    Columns that are empty or entirely missing are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are replaced in place.

    Returns
    -------
    None
    """
    for col in df.columns:
        series = df[col]
        if not is_numeric_dtype(series):
            continue

        max_val = series.max()
        min_val = series.min()
        # max()/min() are NaN for an empty or all-missing column: nothing to size.
        if pd.isna(max_val) or pd.isna(min_val):
            continue

        # An integer cast is only lossless when every value is present and whole.
        if series.isna().any():
            whole = False
        elif series.dtype.kind in "biu":
            whole = True
        else:
            # NaN/inf give NaN here, which compares unequal to 0 -> not whole.
            whole = bool((series % 1 == 0).all())

        if (max_val < 1 and min_val > -1) or not whole:
            df[col] = series.astype(np.float32)
        elif max_val < 127 and min_val > -128:
            df[col] = series.astype(np.int8)
        elif max_val < 32767 and min_val > -32768:
            df[col] = series.astype(np.int16)
        elif max_val < 2147483647 and min_val > -2147483648:
            df[col] = series.astype(np.int32)
        # else: too wide for int32 -> keep the existing dtype unchanged


In [None]:
# One 0/1 indicator column per NAICS sector code, created in the order listed,
# followed by NO_IND for rows whose NAICS value is missing.
industry_codes = [
    "MED", "MFG", "RET", "EDU", "ADM",
    "FOD", "PRF", "TRN", "SRV", "FIN",
    "WHL", "AGR", "PUB", "INF", "ENT",
    "REL", "UTL", "EXT", "MNG", "CON",
]
for industry_code in industry_codes:
    df[industry_code] = np.where(df["NAICS"] == industry_code, 1, 0)
df["NO_IND"] = np.where(df["NAICS"].isna(), 1, 0)

In [None]:
# Biogeme databases accept numerical values only: keep the numeric columns and
# replace every remaining missing value with 0.
df = df.select_dtypes(include=["number"]).fillna(0)


In [None]:
# Downcast df's numeric columns in place (condense returns nothing).
condense(df)

In [None]:
# Encode each person's chosen alternative explicitly: 0 means staying, and
# 1..200 is the index i of the ALT{i}_PUMA column equal to the chosen PUMA.
df["CHOSEN_PUMA"] = df["CHOSEN"]
chosen_alt = np.zeros(len(df), dtype=np.int64)
for alt_idx in range(1, 201):
    is_match = df[f"ALT{alt_idx}_PUMA"] == df["CHOSEN_PUMA"]
    # when several alternatives match, the highest index wins
    chosen_alt = np.where(is_match, alt_idx, chosen_alt)
# rows flagged STAY == 1 are coded 0 regardless of any match above
df["CHOSEN"] = np.where(df["STAY"] == 1, 0, chosen_alt)


In [None]:
# Coarse 0/1 age bands from AGEP (the first band has no lower bound) and a
# FOREIGN flag set where NATIVITY equals 2.
person_age = df["AGEP"]
df["AGE_18_34"] = np.where(person_age.le(34), 1, 0)
df["AGE_35_64"] = np.where(person_age.between(35, 64), 1, 0)
df["AGE_OVER_65"] = np.where(person_age.ge(65), 1, 0)
df["FOREIGN"] = np.where(df["NATIVITY"].eq(2), 1, 0)

In [None]:
# Finer 0/1 age bands from AGEP; bounds are inclusive on both ends and the
# youngest band has no lower bound.
df["AGE_18_22"] = np.where(df["AGEP"].le(22), 1, 0)
fine_age_bands = {
    "AGE_23_29": (23, 29),
    "AGE_30_39": (30, 39),
    "AGE_40_49": (40, 49),
    "AGE_50_64": (50, 64),
}
for band_name, (age_from, age_to) in fine_age_bands.items():
    df[band_name] = np.where(df["AGEP"].between(age_from, age_to), 1, 0)

In [None]:
# 1 where the SCHG code is 15 or 16, otherwise 0; the column is then displayed.
df["IN_COLLEGE"] = np.where(df["SCHG"].isin([15, 16]), 1, 0)
df["IN_COLLEGE"]

In [None]:
# 0/1 attainment bands from SCHL: <= 15, 16..20 and >= 21. EDU_NOCOLLEGE is the
# complement of EDU_COLLEGE.
attainment = df["SCHL"]
df["EDU_LESS_HIGH"] = np.where(attainment.le(15), 1, 0)
df["EDU_HIGH"] = np.where(attainment.between(16, 20), 1, 0)
df["EDU_COLLEGE"] = np.where(attainment.ge(21), 1, 0)
df["EDU_NOCOLLEGE"] = np.where(df["EDU_COLLEGE"].eq(0), 1, 0)

In [None]:
# WOMAN_CHILD: PAOC between 1 and 3 inclusive.
df["WOMAN_CHILD"] = np.where(df["PAOC"].between(1, 3), 1, 0)
# UNEMPLOYED: ESR == 3. A later cell reassigns this column with ESR in {3, 6};
# this first assignment still determines where the column sits in df.
df["UNEMPLOYED"] = np.where(df["ESR"].eq(3), 1, 0)

In [None]:
# 0/1 flags derived from SEX.
# NOTE(review): FEMALE tests SEX == 0, so MALE and FEMALE are complements only
# if SEX takes the values 0/1 in this dataset — confirm the coding.
sex_code = df["SEX"]
df["MALE"] = np.where(sex_code.eq(1), 1, 0)
df["FEMALE"] = np.where(sex_code.eq(0), 1, 0)

In [None]:
# 1 where MAR equals 1, otherwise 0.
df["MARRIED"] = np.where(df["MAR"].eq(1), 1, 0)

In [None]:
# 1 where either MARHD or MARHW equals 1; the cell then shows the 0/1 tally.
marhd_flag = df["MARHD"].eq(1)
marhw_flag = df["MARHW"].eq(1)
df["REC_NO_MAR"] = np.where(marhd_flag | marhw_flag, 1, 0)
df["REC_NO_MAR"].value_counts()

In [None]:
# Copy of MARHM with the code 2 mapped to 0; all other values pass through.
marhm = df["MARHM"]
df["MARHM_new"] = np.where(marhm.eq(2), 0, marhm)
df["MARHM_new"].value_counts()

In [None]:
# MARRIED where it differs from MARHM, and 0 on rows where the two are equal.
married_flag = df["MARRIED"]
df["married_old"] = np.where(df["MARHM"].ne(married_flag), married_flag, 0)
df["married_old"].value_counts()

In [None]:
# 1 where MIL equals 1, otherwise 0.
df["MILITARY"] = np.where(df["MIL"].eq(1), 1, 0)

In [None]:
# up to debate still
# Collapse the sector dummies into broader 0/1 groups: a group is 1 when any of
# its member sector dummies is 1.
df["AGR_EXT"] = np.where(df[["AGR", "EXT"]].eq(1).any(axis=1), 1, 0)
# NOTE(review): the previous version OR-ed MED == 1 three times; the repeats
# are dropped here (same result) — confirm no other sector was intended.
high_ed_sectors = ["MED", "EDU", "PRF", "FIN", "INF"]
df["HIGH_ED"] = np.where(df[high_ed_sectors].eq(1).any(axis=1), 1, 0)
df["LICENSE"] = np.where(df[["SRV", "REL"]].eq(1).any(axis=1), 1, 0)
# OTHER_JOB: none of the groups above applies and the industry is not missing.
non_other_flags = ["AGR_EXT", "HIGH_ED", "LICENSE", "NO_IND"]
df["OTHER_JOB"] = np.where(df[non_other_flags].eq(0).all(axis=1), 1, 0)

In [None]:
# 0/1 flags for the origin area type code TYPE_ORIG: 2 or 3 -> MICRO_adj_ORIG,
# 1 -> METRO, 0 -> T34.
origin_type = df["TYPE_ORIG"]
df["MICRO_adj_ORIG"] = np.where(origin_type.isin([3, 2]), 1, 0)
df["METRO"] = np.where(origin_type.eq(1), 1, 0)
df["T34"] = np.where(origin_type.eq(0), 1, 0)

In [None]:
# UNEMPLOYED now covers ESR codes 3 and 6 (replacing the earlier ESR == 3
# version); IN_LF is 0 only where ESR equals 6.
employment_status = df["ESR"]
df["UNEMPLOYED"] = np.where(employment_status.isin([3, 6]), 1, 0)
df["IN_LF"] = np.where(employment_status.eq(6), 0, 1)

In [None]:
# 0/1 flags from FES (1 -> WORK2_MAR, 2..4 -> WORK1_MAR) and from HHT
# (2 or 3 -> OTHER_FAMILY).
fes_code = df["FES"]
df["WORK2_MAR"] = np.where(fes_code.eq(1), 1, 0)
df["WORK1_MAR"] = np.where(fes_code.between(2, 4), 1, 0)
df["OTHER_FAMILY"] = np.where(df["HHT"].isin([2, 3]), 1, 0)

In [None]:
# For each of the 200 alternatives, derive 0/1 area-type flags from
# ALT{i}_TYPE using the same coding as the origin flags:
# 2 or 3 -> MICRO, 1 -> METRO, 0 -> T34.
for alt_idx in range(1, 201):
    prefix = f"ALT{alt_idx}_"
    alt_type = df[prefix + "TYPE"]
    df[prefix + "MICRO"] = np.where(alt_type.isin([2, 3]), 1, 0)
    df[prefix + "METRO"] = np.where(alt_type.eq(1), 1, 0)
    df[prefix + "T34"] = np.where(alt_type.eq(0), 1, 0)


In [None]:
# Recode the value -1 in CBSA_NAME_ORIG to -2.
is_minus_one = df["CBSA_NAME_ORIG"].eq(-1)
df.loc[is_minus_one, "CBSA_NAME_ORIG"] = -2

In [None]:
# MIGSP_ORIG is the integer formed by the first two characters of ORIGIN once
# it is rendered as text and left-padded with zeros to seven characters.
origin_text = df["ORIGIN"].astype(str).str.zfill(7)
df["MIGSP_ORIG"] = origin_text.str[:2].astype(int)

In [None]:
# List every column name, one per line (useful for checking column positions).
for column_name in df.columns.tolist():
    print(column_name)


In [None]:
# Step 1: columns repeated on every alternative row.
# NOTE(review): the cutoff 152 is purely positional — check it against the
# printed column listing so that nothing is cut off or included by accident.
base_df = df.iloc[:, :152]

# Step 2: attribute suffixes available for every alternative, and the matching
# ALT{i}_<suffix> column names for i in 1..200.
alt_suffixes = [
    "POP",
    "DENS",
    "COLLEGE",
    "FOREIGN",
    "18_34",
    "35_64",
    "65",
    "MIL_NUM",
    "EMP",
    "ENT",
    "DIST",
    "OWN_JOB",
    "TYPE",
    "CBSA",
    "HH_COST",
    "PINCP",
    "UNEMP_PCT",
    "PERNP",
    "State",
    "PUMA",
    "MICRO",
    "METRO",
    "T34",
]
alt_columns_list = [
    [f"ALT{i}_{suffix}" for suffix in alt_suffixes] for i in range(1, 201)
]

# Step 3: build one frame per alternative and stack them vertically.
# The per-alternative prefix is replaced by a common one (ALT{i}_POP -> ALT_POP)
# so the same attribute lands in the same column for every alternative.
# Stacking the original names instead makes pd.concat align on the union of all
# 200 column sets, producing thousands of mostly-NaN columns rather than a long
# table. Because the renamed columns no longer say which alternative a row
# describes, that index is kept in an explicit "alt" column.
long_parts = []
for alt_idx, alt_columns in enumerate(alt_columns_list, start=1):
    common_names = {
        column: f"ALT_{suffix}" for column, suffix in zip(alt_columns, alt_suffixes)
    }
    part = pd.concat(
        [df.loc[:, alt_columns].rename(columns=common_names), base_df],
        axis=1,
    )
    part["alt"] = alt_idx
    long_parts.append(part)
long_df = pd.concat(long_parts, axis=0, ignore_index=True)

# Step 4: put the base rows first, tagged alt = 0; they carry no alternative
# attributes, so their ALT_ columns are NaN after the concat.
final_df = pd.concat([base_df.assign(alt=0), long_df], axis=0, ignore_index=True)


In [None]:
# TODO: create a dataframe with just the necessary columns and then evaluate the logit

In [None]:
# Columns that are kept only on rows with alt == 0 (a later cell multiplies
# each of them by the alt == 0 mask). AGE_40_49 is created earlier but is not
# listed here — presumably the reference age band; confirm.
stay_vars = [
    "ASC_STAY",
    "AGE_18_22",
    "AGE_23_29",
    "AGE_30_39",
    "AGE_50_64",
    "AGE_OVER_65",
    "child",
    "REC_CHILD",
    "MARHM_new",
    "REC_NO_MAR",
    "WORK2_MAR",
    "WORK1_MAR",
    "OTHER_FAMILY",
    "IN_COLLEGE",
    "MILITARY",
    "OWN_JOB_ORIG",
    "HH_MED_VAL_ORIG",
    "GRNTP_ORIG_ADJ",
    "SMOCP_ORIG_ADJ",
    "FOREIGN",
    "EDU_COLLEGE",
    "EDU_LESS_HIGH",
    "T34",
    "MICRO_adj_ORIG",
]
# Columns that are kept only on rows with alt != 0 (a later cell multiplies
# each of them by the alt != 0 mask).
destchoice_vars = [
    "ASC_LEAVE",
    "ALT_pop_log",
    "ALT_DIST",
    "ALT_logdist",
    "ALT_cbsadist",
    "ALT_hhcost",
    "ALT_COLLEGE",
    "ALT_FOREIGN",
    "ALT_18_34",
    "ALT_35_64",
    "ALT_65",
    "ALT_ENT_18_34",
    "ALT_ENT_35_64",
    "ALT_ENT_OVER_65",
    "ALT_unemp",
    "ALT_T34_Metro",
    "ALT_T34_Micro",
    "ALT_Metro_T34",
    "ALT_Metro_Metro",
    "ALT_Metro_Micro",
    "ALT_Micro_T34",
    "ALT_Micro_Metro",
    "ALT_Micro_Micro",
    "ALT_OWN_JOB_AGREXT",
    "ALT_OWN_JOB_HIGHED",
    "ALT_OWN_JOB_LICENSE",
    "ALT_OWN_JOB_OTHER",
    "ALT_MIL",
    "ALT_birthstate",
    "ALT_samestate",
]


In [None]:
# Zero every stay_vars column on rows whose alternative code is not 0:
# multiplying by the boolean mask keeps a value only where alt == 0.
# NOTE(review): long_data is not created in any visible cell — confirm which
# long-format frame it is meant to be before running this on a fresh kernel.
is_stay_row = long_data["alt"] == 0  # same for every column, so computed once
for col in stay_vars:
    long_data[col] = long_data[col] * is_stay_row
# One collection after the loop is enough; the replaced columns are already
# released as each assignment drops the last reference to them.
gc.collect()

In [None]:
# Zero every destchoice_vars column on rows whose alternative code is 0:
# multiplying by the boolean mask keeps a value only where alt != 0.
# NOTE(review): long_data is not created in any visible cell — confirm which
# long-format frame it is meant to be before running this on a fresh kernel.
is_move_row = long_data["alt"] != 0  # same for every column, so computed once
for col in destchoice_vars:
    long_data[col] = long_data[col] * is_move_row
# One collection after the loop is enough; the replaced columns are already
# released as each assignment drops the last reference to them.
gc.collect()

In [None]:
# Print the dtype of every long_data column, one per line.
for column_dtype in long_data.dtypes:
    print(column_dtype)

In [None]:
import numpy as np
import pandas as pd
from xlogit import MultinomialLogit

In [None]:
# Load the estimation data from the parquet file in the working directory.
estdata = pd.read_parquet(path="temp.parquet.gzip")

In [None]:
# Explanatory variables for the estimation; both lists repeat the definitions
# given earlier in this notebook. Their order matters: varnames is built as
# stay_vars + destchoice_vars and the starting values in init are listed in
# that same order.
stay_vars = [
    "ASC_STAY",
    "AGE_18_22",
    "AGE_23_29",
    "AGE_30_39",
    "AGE_50_64",
    "AGE_OVER_65",
    "child",
    "REC_CHILD",
    "MARHM_new",
    "REC_NO_MAR",
    "WORK2_MAR",
    "WORK1_MAR",
    "OTHER_FAMILY",
    "IN_COLLEGE",
    "MILITARY",
    "OWN_JOB_ORIG",
    "HH_MED_VAL_ORIG",
    "GRNTP_ORIG_ADJ",
    "SMOCP_ORIG_ADJ",
    "FOREIGN",
    "EDU_COLLEGE",
    "EDU_LESS_HIGH",
    "T34",
    "MICRO_adj_ORIG",
]
# 30 destination-side variables; they follow the 24 stay_vars in varnames.
destchoice_vars = [
    "ASC_LEAVE",
    "ALT_pop_log",
    "ALT_DIST",
    "ALT_logdist",
    "ALT_cbsadist",
    "ALT_hhcost",
    "ALT_COLLEGE",
    "ALT_FOREIGN",
    "ALT_18_34",
    "ALT_35_64",
    "ALT_65",
    "ALT_ENT_18_34",
    "ALT_ENT_35_64",
    "ALT_ENT_OVER_65",
    "ALT_unemp",
    "ALT_T34_Metro",
    "ALT_T34_Micro",
    "ALT_Metro_T34",
    "ALT_Metro_Metro",
    "ALT_Metro_Micro",
    "ALT_Micro_T34",
    "ALT_Micro_Metro",
    "ALT_Micro_Micro",
    "ALT_OWN_JOB_AGREXT",
    "ALT_OWN_JOB_HIGHED",
    "ALT_OWN_JOB_LICENSE",
    "ALT_OWN_JOB_OTHER",
    "ALT_MIL",
    "ALT_birthstate",
    "ALT_samestate",
]


In [None]:
# Starting coefficient values handed to model.fit as init_coeff. There are 54
# entries, one per name in varnames (= stay_vars + destchoice_vars); the
# trailing comment on each line is the name at the same position in that list.
# NOTE(review): the pairing is purely positional — re-check it whenever either
# variable list is edited.
init = [
    15.8,  # ASC_STAY
    -0.426,  # AGE_18_22
    -0.571,  # AGE_23_29
    -0.317,  # AGE_30_39
    0.231,  # AGE_50_64
    0.795,  # AGE_OVER_65
    -0.471,  # child
    -0.0526,  # REC_CHILD
    -1.18,  # MARHM_new
    -0.57,  # REC_NO_MAR
    1.02,  # WORK2_MAR
    0.899,  # WORK1_MAR
    0.632,  # OTHER_FAMILY
    0.692,  # IN_COLLEGE
    -0.693,  # MILITARY
    0.97,  # OWN_JOB_ORIG
    0.000000753,  # HH_MED_VAL_ORIG
    -0.000915,  # GRNTP_ORIG_ADJ
    0.000689,  # SMOCP_ORIG_ADJ
    0.204,  # FOREIGN
    -0.174,  # EDU_COLLEGE
    0.0563,  # EDU_LESS_HIGH
    0.579,  # T34
    -0.208,  # MICRO_adj_ORIG
    0,  # ASC_LEAVE
    1,  # ALT_pop_log
    -0.000000302,  # ALT_DIST
    -0.35,  # ALT_logdist
    -0.0000278,  # ALT_cbsadist
    -2.06,  # ALT_hhcost
    0.0000654,  # ALT_COLLEGE
    0.684,  # ALT_FOREIGN
    3.68,  # ALT_18_34
    1.39,  # ALT_35_64
    4.62,  # ALT_65
    1.24,  # ALT_ENT_18_34
    2.98,  # ALT_ENT_35_64
    2.58,  # ALT_ENT_OVER_65
    -2.1,  # ALT_unemp
    1.05,  # ALT_T34_Metro
    1.17,  # ALT_T34_Micro
    0.0409,  # ALT_Metro_T34
    1.25,  # ALT_Metro_Metro
    0.808,  # ALT_Metro_Micro
    -0.0367,  # ALT_Micro_T34
    0.572,  # ALT_Micro_Metro
    2.01,  # ALT_Micro_Micro
    6.35,  # ALT_OWN_JOB_AGREXT
    1.54,  # ALT_OWN_JOB_HIGHED
    5.23,  # ALT_OWN_JOB_LICENSE
    2.45,  # ALT_OWN_JOB_OTHER
    0.00018,  # ALT_MIL
    0.285,  # ALT_birthstate
    2.5,  # ALT_samestate
]


In [None]:
# Full list of explanatory variables: stay-side names first, then
# destination-side names (the order the starting values in init follow).
varnames = [*stay_vars, *destchoice_vars]

In [None]:
# Natural log of (ALT_POP + 1), stored as float32; the +1 keeps the result
# finite (0) where ALT_POP is 0.
alt_population = estdata["ALT_POP"]
estdata["ALT_pop_log"] = np.log(alt_population + 1).astype(np.float32)

In [None]:
# Create the (not yet fitted) xlogit multinomial logit estimator.
model = MultinomialLogit()

In [None]:
# Fit the model on the long-format estimation data. Columns of estdata supply
# the choice indicator (CHOICE_CODE), alternative code (alt), row grouping id
# (custom_id) and availability flag (av); init provides the starting values.
fit_arguments = dict(
    X=estdata[varnames],
    y=estdata["CHOICE_CODE"],
    varnames=varnames,
    alts=estdata["alt"],
    ids=estdata["custom_id"],
    avail=estdata["av"],
    init_coeff=init,
)
model.fit(**fit_arguments)
