<a href="https://colab.research.google.com/github/xyhan-github/STATS335_F23/blob/master/final/TabularDemo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
# install and import Huggingface Datasets
! pip install datasets
from datasets import load_dataset

# load necessary functions
# Numpy
import numpy as np

# Scikit-Learn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

# Pandas
import pandas as pd

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


# Experiment Settings and Parameters

In [2]:
# Experiment specification
#   dataset_name: 'adult_income', 'housing', or 'covertype'
#   method_name:  'Linear', 'XGBoost', or 'LightGBM'
dataset_name, method_name = 'housing', 'Linear'

# Method configs, filled in one hyperparameter at a time
config = {}
config["depth"] = 2             # forwarded as max_depth to XGBoost / LightGBM
config["lambda"] = 0.25         # regularization strength (reg_lambda, lambda_l2, or C = 1/lambda)
config["learning_rate"] = 0.25  # forwarded as learning_rate to XGBoost / LightGBM
config["num_rounds"] = 50       # forwarded as n_estimators (boosting) or max_iter (logistic regression)

# Dataloading

In [3]:
# Load Dataset

# Hugging Face Hub coordinates and target column for every supported dataset:
#   dataset_name -> (positional arguments for load_dataset, name of the target column)
dataset_specs = {
  'adult_income': (("scikit-learn/adult-census-income",), 'income'),
  'housing':      (("leostelon/california-housing",), 'median_house_value'),
  'covertype':    (("mstz/covertype", "covertype"), 'cover_type'),
}

# Fail before any download is attempted if the name is not supported
if dataset_name not in dataset_specs:
  raise ValueError(
      f"Invalid Dataset Name! Got {dataset_name!r}; "
      f"expected one of {sorted(dataset_specs)}"
  )

hub_args, target_col = dataset_specs[dataset_name]

# Every supported dataset is read from its 'train' split
dataset = load_dataset(*hub_args)['train']

# Dataset.to_pandas() converts the whole split at once, so even the large
# covertype split needs no manual chunking loop (pd.DataFrame(dataset) would
# instead iterate over the split one row at a time).
df = dataset.to_pandas()

# Separate the target column (kept as a one-column DataFrame) from the features
y_df = df[[target_col]]
X_df = df.drop(columns=target_col)


Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

# Data Cleaning

In [4]:
#@title Define `prepare_data(X_df, y_df)` function
def prepare_data(X_df, y_df):

  """
  Preprocesses and prepares the input data and target labels for machine learning tasks.

  Parameters:
      X_df (pd.DataFrame): DataFrame containing the input features, including categorical columns.
      y_df (pd.DataFrame): DataFrame containing the target labels in its first column. It must share
                      its index labels with X_df, because rows are aligned by index.

  Returns:
      X (np.ndarray): NumPy array of preprocessed input features after one-hot encoding categorical variables.
                      Dummy columns are emitted as floats so that numeric features and dummies combine
                      into a numeric array rather than an object array.
      y (np.ndarray): NumPy array of target labels, either as a numeric array (for regression) or encoded as
                      integers (for classification).
      prob_type (str): Indicates the type of the machine learning problem:
                      - 'reg' for regression (numeric target)
                      - 'bin' for binary classification
                      - 'mult' for multiclass classification
      num_classes (int or None): Number of classes for classification tasks. None for regression.

  The function drops every row that has a missing value in any feature or in the target, one-hot encodes the
  object/category columns of X_df, then converts y_df into an appropriate format based on its data type:
  - If y_df contains numeric values, it's treated as a regression problem. Note that integer-coded class
    labels are numeric and are therefore treated as regression targets too.
  - If y_df contains categorical values (including strings), it's treated as a classification problem. The target
    labels are encoded as integers, and the number of classes is determined. If there are 2 classes, it's binary
    classification; otherwise, it's multiclass classification.

  Example:
      X, y, prob_type, num_classes = prepare_data(X_df, y_df)
  """

  # Remove rows with NaNs in the features and keep the matching target rows
  X_df = X_df.dropna()
  y_df = y_df.loc[X_df.index]

  # Also remove rows whose target is missing; a NaN target would otherwise
  # reach the model (regression) or the label encoder (classification)
  target_present = y_df.iloc[:, 0].notna().to_numpy()
  X_df = X_df.loc[target_present]
  y_df = y_df.loc[target_present]

  # Convert categorical columns into 0-1 variables. dtype=float keeps the
  # dummies numeric: newer pandas defaults to bool dummies, which would turn
  # the combined array below into an object array.
  categorical_cols = X_df.select_dtypes(include=['object', 'category']).columns
  X_df_encoded = pd.get_dummies(X_df, columns=categorical_cols, dtype=float)

  # Create data array
  X = X_df_encoded.to_numpy()

  # Convert y into target array
  y_array = y_df.iloc[:, 0].to_numpy()

  # Create target vector
  if np.issubdtype(y_array.dtype, np.number):
    y = y_array
    prob_type = 'reg'
    num_classes = None
  else:
    # If y is categorical (including strings), use LabelEncoder for encoding
    encoder = LabelEncoder()
    y = encoder.fit_transform(y_array)
    num_classes = len(encoder.classes_)
    prob_type = 'bin' if num_classes == 2 else 'mult'

  return X, y, prob_type, num_classes

In [5]:
# Turn the raw feature/target frames into model-ready arrays and detect the problem type
X, y, prob_type, num_classes = prepare_data(X_df=X_df, y_df=y_df)

In [6]:
# Split the data into train and test (80% / 20%).
# A fixed random_state makes the split, and therefore the reported metric,
# identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Run Regressor or Classifier

In [7]:
# Run method

# Build the estimator selected by `method_name` for the detected `prob_type`.
# The method-specific libraries are imported lazily so that only the chosen
# method's package needs to be installed.
if method_name == "Linear":
  from sklearn.linear_model import LinearRegression, LogisticRegression

  params = {'C': 1/config["lambda"],  # Inverse of regularization strength
            'max_iter': config["num_rounds"]}  # Number of training iterations

  if prob_type == 'reg':
    # NOTE: ordinary least squares; `params` (and so config["lambda"]) is not used here
    model = LinearRegression()
  elif prob_type in ('bin', 'mult'):
    # The default lbfgs solver already fits a multinomial model for multiclass
    # targets, so the deprecated multi_class='multinomial' argument is not passed.
    model = LogisticRegression(**params)

elif method_name == "XGBoost":
  import xgboost as xgb
  params = {'learning_rate':config["learning_rate"],
            'reg_lambda':config["lambda"],
            'max_depth':config["depth"],
            'n_estimators':config["num_rounds"]}

  # set objective
  if prob_type == 'reg':
    params['objective'] = 'reg:squarederror'
  elif prob_type == 'bin':
    params['objective'] = 'binary:logistic'
  elif prob_type == 'mult':
    params['objective'] = 'multi:softprob'
    # XGBoost's parameter is spelled `num_class`; `num_classes` is not recognized
    params['num_class'] = num_classes

  model = xgb.XGBRegressor(**params) if prob_type == 'reg' else xgb.XGBClassifier(**params)

elif method_name == "LightGBM":
  import lightgbm as lgb

  params = {'learning_rate':config["learning_rate"],
            'lambda_l2':config["lambda"],
            'max_depth':config["depth"],
            'n_estimators':config["num_rounds"]}

  # set objective
  if prob_type == 'reg':
    params['objective'] = 'regression'
  elif prob_type == 'bin':
    params['objective'] = 'binary'
  elif prob_type == 'mult':
    params['objective'] = 'multiclass'
    # LightGBM accepts `num_classes` as an alias of `num_class`
    params['num_classes'] = num_classes

  model = lgb.LGBMRegressor(**params) if prob_type == 'reg' else lgb.LGBMClassifier(**params)
else:
  raise Exception("Invalid Method Name!")
model.fit(X_train, y_train)

# Make predictions on the test set
test_preds = model.predict(X_test)
if prob_type in ['bin', 'mult']:
  # predict() on these classifiers returns class labels, not probabilities.
  # Thresholding them at 0.5 would map every multiclass label >= 1 to 1 and
  # corrupt the accuracy, so the labels are scored as they are.
  test_predictions = test_preds
  result = accuracy_score(y_test, test_predictions)
elif prob_type == 'reg':
  result = mean_squared_error(y_test, test_preds)

# Accuracy for classification, mean squared error for regression
print(result)

4919230125.0656
