In [1]:
import os
import pandas as pd

from sklearn.compose import ColumnTransformer, make_column_transformer

from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor



In [2]:
file_path = os.path.join("clean_student_data.csv")

df = pd.read_csv(file_path, index_col="STU_ID")

## Create X and y arrays

In [3]:
X = df[[column for column in df.columns if column not in ["F2HSSTAT", "F2EVERDO", "F1RGPP2"]]].values

y = df[["F2HSSTAT", "F2EVERDO", "F1RGPP2"]].values

## Create the preprocessor

In [20]:
# initialize variables for imputer

max_iterations = 20
nearest_features = 5
non_category_initial_strategy = "median"
category_initial_strategy = "most_frequent"
sparse_bool = False


# Create the preprocessing pipelines for numeric data.

BYHMWRK_feature = ["BYHMWRK"]
BYHMWRK_transformer = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy='median')),
    ("imputer", IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=non_category_initial_strategy, min_value=0, max_value=45)),
    ("scaler", StandardScaler())
])

F1RGPP2_feature = ["F1RGPP2"]
F1RGPP2_transformer = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy='median')),
    ("imputer", IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=non_category_initial_strategy, min_value=0, max_value=6)),
    ("scaler", StandardScaler())
])


# Create the preprocessing pipelines for categorical data.

# 0 and 1
zero_one_features = ["BYSTLANG", "BYS23C", "BYS84D", "BYS84I"]
zero_one_transformer = Pipeline(steps=[
    ("imputer", IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=category_initial_strategy, min_value=0, max_value=1)),
    ("onehot", OneHotEncoder(drop="first", sparse=sparse_bool))
])

# 1 and 2
one_two_features = ["BYSEX"]
one_two_transformer = Pipeline(steps=[
    ("imputer", IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=category_initial_strategy, min_value=1, max_value=2)),
    ("onehot", OneHotEncoder(drop="first", sparse=sparse_bool))
])

# 1 through 3
one_three_features = ["BYURBAN", "BYS90D", "BYS54I"]
one_three_transformer = Pipeline(steps=[
    ("imputer", IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=category_initial_strategy, min_value=1, max_value=3)),
    ("onehot", OneHotEncoder(drop="first", sparse=sparse_bool))
])

# 1 through 4
one_four_features = ["BYREGION", "BYS44C", "BYS20E", "BYS87C", "BYS20D", "BYS37", "BYS27I", "BYS38A", "BYS20J", "BYS85A"]
one_four_transformer = Pipeline(steps=[
    ("imputer", IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=category_initial_strategy, min_value=1, max_value=4)),
    ("onehot", OneHotEncoder(drop="first", sparse=sparse_bool))
])

# 0 through 5
zero_five_features = ["BYRISKFC"]
zero_five_transformer = Pipeline(steps=[
    ("imputer", IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=category_initial_strategy, min_value=0, max_value=5)),
    ("onehot", OneHotEncoder(drop="first", sparse=sparse_bool))
])

# 1 through 5
one_five_features = ["BYS24C", "BYS24D"]
one_five_transformer = Pipeline(steps=[
    ("imputer", IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=category_initial_strategy, min_value=1, max_value=5)),
    ("onehot", OneHotEncoder(drop="first", sparse=sparse_bool))
])

# 0 through 6
zero_six_features = ["BYS46B"]
zero_six_transformer = Pipeline(steps=[
    ("imputer", IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=category_initial_strategy, min_value=0, max_value=6)),
    ("onehot", OneHotEncoder(drop="first", sparse=sparse_bool))
])

# 1 through 7
one_seven_features = ["BYRACE"]
one_seven_transformer = Pipeline(steps=[
    ("imputer", IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=category_initial_strategy, min_value=1, max_value=7)),
    ("onehot", OneHotEncoder(drop="first", sparse=sparse_bool))
])

# 0 through 8
zero_eight_features = ["BYTVVIGM"]
zero_eight_transformer = Pipeline(steps=[
    ("imputer", IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=category_initial_strategy, min_value=0, max_value=8)),
    ("onehot", OneHotEncoder(drop="first", sparse=sparse_bool))
])

# 1 through 8
one_eight_features = ["BYPARED"]
one_eight_transformer = Pipeline(steps=[
    ("imputer", IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=category_initial_strategy, min_value=1, max_value=8)),
    ("onehot", OneHotEncoder(drop="first", sparse=sparse_bool))
])

# 0 through 9
zero_nine_features = ["BYWRKHRS"]
zero_nine_transformer = Pipeline(steps=[
    ("imputer", IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=category_initial_strategy, min_value=0, max_value=9)),
    ("onehot", OneHotEncoder(drop="first", sparse=sparse_bool))
])

# 1 through 13
one_thirteen_features = ["BYINCOME"]
one_thirteen_transformer = Pipeline(steps=[
    ("imputer", IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=category_initial_strategy, min_value=1, max_value=13)),
    ("onehot", OneHotEncoder(drop="first", sparse=sparse_bool))
])

# 0 through 21
zero_twentyone_features = ["BYS42", "BYS43"]
zero_twentyone_transformer = Pipeline(steps=[
    ("imputer", IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=category_initial_strategy, min_value=0, max_value=21)),
    ("onehot", OneHotEncoder(drop="first", sparse=sparse_bool))
])


# Create the preprocessor column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("BYHMWRK", BYHMWRK_transformer, BYHMWRK_feature),
        ("F1RGPP2", F1RGPP2_transformer, F1RGPP2_feature),
        ("zero_one", zero_one_transformer, zero_one_features),
        ("one_two", one_two_transformer, one_two_features),
        ("one_three", one_three_transformer, one_three_features),
        ("one_four", one_four_transformer, one_four_features),
        ("zero_five", zero_five_transformer, zero_five_features),
        ("one_five", one_five_transformer, one_five_features),
        ("zero_six", zero_six_transformer, zero_six_features),
        ("one_seven", one_seven_transformer, one_seven_features),
        ("zero_eight", zero_eight_transformer, zero_eight_features),
        ("one_eight", one_eight_transformer, one_eight_features),
        ("zero_nine", zero_nine_transformer, zero_nine_features),
        ("one_thirteen", one_thirteen_transformer, one_thirteen_features),
        ("zero_twentyone", zero_twentyone_transformer, zero_twentyone_features)
    ]
)

# preprocessor = make_column_transformer(
#     (BYHMWRK_transformer, ["BYHMWRK"]),
#     (F1RGPP2_transformer, ["F1RGPP2"]),
#     (zero_one_transformer, zero_one_features),
#     (one_two_transformer, one_two_features),
#     (one_three_transformer, one_three_features),
#     (one_four_transformer, one_four_features),
#     (zero_five_transformer, zero_five_features),
#     (one_five_transformer, one_five_features),
#     (zero_six_transformer, zero_six_features),
#     (one_seven_transformer, one_seven_features),
#     (zero_eight_transformer, zero_eight_features),
#     (one_eight_transformer, one_eight_features),
#     (zero_nine_transformer, zero_nine_features),
#     (one_thirteen_transformer, one_thirteen_features),
#     (zero_twentyone_transformer, zero_twentyone_features)
# )

# preprocessor = make_column_transformer(
#     (StandardScaler(), ["BYHMWRK", "F1RGPP2"]),
#     (OneHotEncoder(), [column for column in df.columns if column not in ["BYHMWRK", "F2HSSTAT", "BYHMWRK", "F1RGPP2"]])
# )

# initialize variables for imputer

max_iterations = 20
nearest_features = 5
non_category_initial_strategy = "median"
category_initial_strategy = "most_frequent"
sparse_bool = False


# Create the preprocessing pipelines for numeric data.

numeric_features = ["BYHMWRK", "F1RGPP2"]
numeric_transformer  = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='median')) # IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=non_category_initial_strategy, min_value=0, max_value=45)),
#     ("scaler", StandardScaler())
])


# Create the preprocessing pipelines for categorical data.
categorical_features = [
    "BYSEX",
    "BYRACE",
    "BYSTLANG",
    "BYPARED",
    "BYINCOME",
    "BYURBAN",
    "BYREGION",
    "BYRISKFC",
    "BYWRKHRS",
    "BYS42",
    "BYS43",
    "BYTVVIGM",
    "BYS46B",
    "BYS44C",
    "BYS20E",
    "BYS87C",
    "BYS20D",
    "BYS23C",
    "BYS37",
    "BYS27I",
    "BYS90D",
    "BYS38A",
    "BYS20J",
    "BYS24C",
    "BYS24D",
    "BYS54I",
    "BYS84D",
    "BYS84I",
    "BYS85A",
    "F2HSSTAT",
    "F2EVERDO"
    ]

categorical_transformer  = Pipeline(steps=[
    ("imputer", IterativeImputer(missing_values=-1, max_iter=max_iterations, n_nearest_features=nearest_features, initial_strategy=category_initial_strategy, min_value=0)),
    ("onehot", OneHotEncoder(drop="first", sparse=sparse_bool))
])


# Create the preprocessor column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# preprocessor = make_column_transformer(
#     (BYHMWRK_transformer, ["BYHMWRK"]),
#     (F1RGPP2_transformer, ["F1RGPP2"]),
#     (zero_one_transformer, zero_one_features),
#     (one_two_transformer, one_two_features),
#     (one_three_transformer, one_three_features),
#     (one_four_transformer, one_four_features),
#     (zero_five_transformer, zero_five_features),
#     (one_five_transformer, one_five_features),
#     (zero_six_transformer, zero_six_features),
#     (one_seven_transformer, one_seven_features),
#     (zero_eight_transformer, zero_eight_features),
#     (one_eight_transformer, one_eight_features),
#     (zero_nine_transformer, zero_nine_features),
#     (one_thirteen_transformer, one_thirteen_features),
#     (zero_twentyone_transformer, zero_twentyone_features)
# )

# preprocessor = make_column_transformer(
#     (StandardScaler(), ["BYHMWRK", "F1RGPP2"]),
#     (OneHotEncoder(), [column for column in df.columns if column not in ["BYHMWRK", "F2HSSTAT", "BYHMWRK", "F1RGPP2"]])
# )

## Apply the preprocessor and Create the full prediction pipeline

In [21]:
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

y_train.shape

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [29]:
## Null Model
dr = DummyRegressor()
cross_validate(dr, X, y, scoring="neg_mean_squared_error")["test_score"].mean()

-1.4456114146808647

In [21]:
## Logistic Regression
lr = LogisticRegression()
cross_validate(lr, X, y, scoring="neg_mean_squared_error")["test_score"].mean()

ValueError: bad input shape (12957, 3)

ValueError: bad input shape (12957, 3)

ValueError: bad input shape (12958, 3)

ValueError: bad input shape (12958, 3)

ValueError: bad input shape (12958, 3)



nan

In [7]:
## Linear Regression
lr = LinearRegression()
cross_validate(lr, X, y, scoring="neg_mean_squared_error")["test_score"].mean()

-1.1274961750631853

In [10]:
## PCA
pca_regression = make_pipeline(
    PCA(3),
    LinearRegression()
)
cross_validate(pca_regression, X, y, scoring="neg_mean_squared_error")["test_score"].mean()

-1.33907038114679

In [11]:
## Lasso Regression
lasso = MultiTaskLassoCV()
cross_validate(lasso, X, y, scoring="neg_mean_squared_error")["test_score"].mean()

-1.1298758698157534

In [12]:
## Lasso Regression
lasso_scaled = make_pipeline(StandardScaler(), MultiTaskLassoCV())
cross_validate(lasso_scaled, X, y, scoring="neg_mean_squared_error")["test_score"].mean()

-1.1293678080033718

In [8]:
## Decision Tree
dt = DecisionTreeRegressor()
cross_validate(dt, X, y, scoring="neg_mean_squared_error")["test_score"].mean()

-2.220596736545072

In [9]:
## Random Forest
rf = RandomForestRegressor()
cross_validate(rf, X, y, scoring="neg_mean_squared_error")["test_score"].mean()

-1.1414634681438707