In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import string

def generate_synthetic_data(rows=1000, cols=30, seed=42):
    """
    Generates a synthetic DataFrame with mixed data types:
    - Boolean
    - Integer (numeric)
    - Float
    - Text
    - Categorical
    - Date

    Parameters:
    - rows: int, number of rows
    - cols: int, total number of columns

    Returns:
    - df: pd.DataFrame, synthetic dataset
    """
    np.random.seed(seed)
    random.seed(seed)

    col_types = ['bool', 'int', 'float', 'text', 'category', 'date']
    type_counts = {k: cols // len(col_types) for k in col_types}
    remainder = cols % len(col_types)
    for i in range(remainder):
        type_counts[col_types[i]] += 1

    data = {}

    for i in range(type_counts['bool']):
        data[f'bool_col_{i}'] = np.random.choice([True, False], size=rows)

    for i in range(type_counts['int']):
        data[f'int_col_{i}'] = np.random.randint(0, 1000, size=rows)

    for i in range(type_counts['float']):
        data[f'float_col_{i}'] = np.random.uniform(0, 1000, size=rows)

    for i in range(type_counts['text']):
        data[f'text_col_{i}'] = [''.join(random.choices(string.ascii_letters, k=10)) for _ in range(rows)]

    for i in range(type_counts['category']):
        data[f'cat_col_{i}'] = pd.Series(np.random.choice(['A', 'B', 'C', 'D'], size=rows)).astype("category")

    base_date = datetime(2020, 1, 1)
    for i in range(type_counts['date']):
        data[f'date_col_{i}'] = [base_date + timedelta(days=np.random.randint(0, 3650)) for _ in range(rows)]

    data[f'target'] = np.random.uniform(0, 1000, size=rows)

    df = pd.DataFrame(data)
    df['target'] = df['target'].astype(float)
    return df

# Generate and preview
main_df = generate_synthetic_data()
df = main_df.copy()
df.head()

Unnamed: 0,bool_col_0,bool_col_1,bool_col_2,bool_col_3,bool_col_4,int_col_0,int_col_1,int_col_2,int_col_3,int_col_4,...,cat_col_1,cat_col_2,cat_col_3,cat_col_4,date_col_0,date_col_1,date_col_2,date_col_3,date_col_4,target
0,True,False,True,False,False,946,839,945,801,955,...,C,D,B,B,2025-08-29,2020-05-08,2028-12-02,2025-11-30,2029-12-21,12.137458
1,False,True,False,False,True,686,123,70,588,599,...,A,D,A,A,2027-09-07,2024-01-10,2025-05-17,2026-10-08,2020-09-21,488.643897
2,True,True,False,False,True,750,992,363,809,755,...,C,A,C,C,2025-02-06,2027-04-15,2026-12-04,2028-02-13,2020-11-28,644.205482
3,True,True,False,False,True,771,112,569,648,380,...,C,B,D,C,2025-06-02,2025-09-13,2022-07-10,2026-03-27,2026-02-13,918.191773
4,True,True,False,True,False,902,11,801,617,328,...,B,A,C,D,2024-10-02,2027-11-06,2023-12-01,2029-03-08,2022-12-22,381.335817


In [2]:
def convert_str_to_numeric(df, errors='coerce'):
    """
    Converts columns with strings that look like numbers into numeric dtype.
    `errors='coerce'` will convert invalid parsing to NaN.
    """
    df_converted = df.copy()
    for col in df_converted.columns:
        if df_converted[col].dtype == object:
            try:
                df_converted[col] = pd.to_numeric(df_converted[col], errors=errors)
            except:
                pass
    return df_converted

def lowercase_string_columns(df):
    """
    Lowercases all string (object) column values.
    """
    df_cleaned = df.copy()
    str_cols = df_cleaned.select_dtypes(include='object').columns
    for col in str_cols:
        df_cleaned[col] = df_cleaned[col].astype(str).str.lower().str.strip()
    return df_cleaned

def handle_boolean_columns(df):
    """
    Converts columns with boolean-looking strings or integers into proper bool dtype.
    """
    df_bool = df.copy()
    for col in df_bool.columns:
        if df_bool[col].dtype == object:
            unique_vals = df_bool[col].dropna().astype(str).str.lower().unique()
            if set(unique_vals).issubset({'true', 'false', 'yes', 'no', '1', '0'}):
                df_bool[col] = df_bool[col].astype(str).str.lower().map({
                    'true': True, '1': True, 'yes': True,
                    'false': False, '0': False, 'no': False
                })
        elif df_bool[col].dtype in [int, float] and set(df_bool[col].dropna().unique()).issubset({0, 1}):
            df_bool[col] = df_bool[col].astype(bool)
    return df_bool
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

def normalize_minmax(df, columns=None):
    """Applies Min-Max normalization to selected numeric columns."""
    df_norm = df.copy()
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns
    scaler = MinMaxScaler()
    df_norm[columns] = scaler.fit_transform(df_norm[columns])
    return df_norm

def standardize_zscore(df, columns=None):
    """Standardizes selected numeric columns using Z-score (mean=0, std=1)."""
    df_scaled = df.copy()
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()
    df_scaled[columns] = scaler.fit_transform(df_scaled[columns])
    return df_scaled

def robust_scale(df, columns=None):
    """Applies robust scaling using median and IQR (resistant to outliers)."""
    df_robust = df.copy()
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns
    scaler = RobustScaler()
    df_robust[columns] = scaler.fit_transform(df_robust[columns])
    return df_robust

def log_transform(df, columns=None, add_constant=True):
    """
    Applies log transformation to selected numeric columns.
    - add_constant: Add 1 to avoid log(0) if True.
    """
    df_log = df.copy()
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns
    for col in columns:
        try:
            if add_constant:
                df_log[col] = np.log1p(df_log[col])  # log(1 + x)
            else:
                df_log[col] = np.log(df_log[col])
        except Exception as e:
            print(f"Skipping log transform on {col} due to error: {e}")
    return df_log
def drop_duplicates(df):
    """Remove duplicate rows from the DataFrame."""
    return df.drop_duplicates()

def drop_constant_columns(df):
    """Remove columns with only a single unique value."""
    return df.loc[:, df.apply(lambda col: col.nunique(dropna=False) > 1)]

def drop_columns_by_null_threshold(df, threshold=0.5):
    """
    Drop columns with missing values above the given threshold.
    - threshold (float): Max allowable proportion of nulls (0 to 1).
    """
    null_fraction = df.isnull().mean()
    cols_to_keep = null_fraction[null_fraction <= threshold].index
    return df[cols_to_keep]

def impute_missing_values(df, strategy="mean"):
    """
    Fill missing values in numeric and categorical columns.
    - strategy (str): 'mean', 'median', or 'mode'.
    """
    df_clean = df.copy()
    
    for col in df.columns:
        if df[col].isnull().sum() == 0:
            continue
        
        if pd.api.types.is_numeric_dtype(df[col]):
            if strategy == "mean":
                fill_value = df[col].mean()
            elif strategy == "median":
                fill_value = df[col].median()
            elif strategy == "mode":
                fill_value = df[col].mode()[0]
            else:
                raise ValueError("strategy must be one of: 'mean', 'median', 'mode'")
            df_clean[col] = df[col].fillna(fill_value)
        else:
            # Categorical or text-based column: use mode only
            fill_value = df[col].mode()[0]
            df_clean[col] = df[col].fillna(fill_value)
    
    return df_clean

def strip_whitespace_string_columns(df):
    """Trim whitespace in string or object-type columns."""
    df_clean = df.copy()
    str_cols = df_clean.select_dtypes(include=["object", "string"]).columns
    for col in str_cols:
        df_clean[col] = df_clean[col].astype(str).str.strip()
    return df_clean

numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
date_cols = df.select_dtypes(include=["datetime64"]).columns.tolist()
bool_cols = df.select_dtypes(include=["bool"]).columns.tolist()
all_cols = df.columns.tolist()

df = drop_duplicates(df)
df = drop_constant_columns(df)
df = drop_columns_by_null_threshold(df, threshold=0.4)
df = impute_missing_values(df, strategy="mode")
df = strip_whitespace_string_columns(df)
df = normalize_minmax(df, columns=numeric_cols)
df = standardize_zscore(df, columns=numeric_cols)
df = robust_scale(df, columns=numeric_cols)
df = log_transform(df, columns=numeric_cols, add_constant=True)
df = log_transform(df, columns=numeric_cols, add_constant=False)
df = convert_str_to_numeric(df, errors='coerce')
df = convert_str_to_numeric(df, errors='ignore')
df = convert_str_to_numeric(df, errors='raise')
df = lowercase_string_columns(df)
df = handle_boolean_columns(df)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

def split_regression_data(df, target_col, method="random", test_size=0.2, stratify_bins=10, time_col=None, n_clusters=5, random_state=42):
    """
    Split regression dataset using different strategies: random, stratified, time-based, or cluster-based.

    Parameters:
    - df (pd.DataFrame): Input dataset.
    - target_col (str): Name of the regression target column.
    - method (str): One of ['random', 'stratified', 'time', 'cluster'].
    - test_size (float): Proportion of test set.
    - stratify_bins (int): Number of quantile bins for stratified splitting.
    - time_col (str): Column for time-based split (required for 'time' method).
    - n_clusters (int): Number of clusters for cluster-based split.
    - random_state (int): Random seed for reproducibility.

    Returns:
    - train_df (pd.DataFrame): Training data.
    - test_df (pd.DataFrame): Testing data.
    """
    df = df.dropna(subset=[target_col]).copy()

    if method == "random":
        train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

    elif method == "stratified":
        # Bin the target column for stratification
        stratify_labels = pd.qcut(df[target_col], q=stratify_bins, duplicates="drop")
        train_df, test_df = train_test_split(df, test_size=test_size, stratify=stratify_labels, random_state=random_state)

    elif method == "time":
        if time_col is None:
            raise ValueError("`time_col` must be provided for time-based splitting.")
        df = df.sort_values(time_col)
        split_idx = int((1 - test_size) * len(df))
        train_df, test_df = df.iloc[:split_idx], df.iloc[split_idx:]

    elif method == "cluster":
        feature_cols = df.drop(columns=[target_col]).select_dtypes(include=np.number).columns
        if len(feature_cols) == 0:
            raise ValueError("Cluster-based splitting requires numerical features.")

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(df[feature_cols])

        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        df["cluster"] = kmeans.fit_predict(X_scaled)

        # Stratified split using cluster labels
        train_df, test_df = train_test_split(df, test_size=test_size, stratify=df["cluster"], random_state=random_state)
        df.drop(columns=["cluster"], inplace=True)

    else:
        raise ValueError(f"Invalid split method: {method}. Choose from ['random', 'stratified', 'time', 'cluster'].")

    return train_df, test_df

train, test = split_regression_data(df, target_col='target', method='random', test_size=0.3, random_state=42)
X_train = train.drop(columns=['target'])
y_train = train['target']
X_test = test.drop(columns=['target'])
y_test = test['target']

In [35]:
from sklearn.feature_selection import SelectKBest, chi2, f_regression, SelectFromModel, RFE, VarianceThreshold
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.inspection import permutation_importance

def select_k_best(X, y, k=10, score_func=f_regression):
    selector = SelectKBest(score_func=score_func, k=k)
    selector.fit(X, y)
    selected = selector.get_support()
    return X.loc[:, selected]

def lasso_feature_selection(X, y, alpha=0.01):
    lasso = LassoCV(alphas=[alpha]).fit(X, y)
    model = SelectFromModel(lasso, prefit=True)
    return X.loc[:, model.get_support()]

def rfe_selection(X, y, estimator=RandomForestRegressor(), n_features=10, step=1):
    selector = RFE(estimator, n_features_to_select=n_features, step=step)
    selector.fit(X, y)
    return X.loc[:, selector.get_support()]

def random_forest_importance(X, y, n_estimators=100):
    model = RandomForestRegressor(n_estimators=n_estimators)
    model.fit(X, y)
    importances = pd.Series(model.feature_importances_, index=X.columns)
    return importances.sort_values(ascending=False)

def vif_selection(X, thresh=5.0):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    vif_data = pd.DataFrame()
    vif_data['feature'] = X.columns
    vif_data['VIF'] = [variance_inflation_factor(X_scaled, i) for i in range(X.shape[1])]
    return vif_data[vif_data['VIF'] < thresh]

def knn_feature_importance(X, y, n_neighbors=5, n_features=10, random_state=42):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    feature_scores = {}
    for feature in X.columns:
        model = KNeighborsRegressor(n_neighbors=n_neighbors)
        X_feature = X_scaled[:, X.columns.get_loc(feature)].reshape(-1, 1)
        model.fit(X_feature, y)
        score = model.score(X_feature, y)
        feature_scores[feature] = score
    selected_features = pd.Series(feature_scores).nlargest(n_features).index.tolist()
    return selected_features

def variance_threshold_selector(X, threshold=0.0):
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(X)
    return X.loc[:, selector.get_support()]

def univariate_feature_selection(X, y, score_func=chi2, k=10):
    selector = SelectKBest(score_func=score_func, k=k)
    selector.fit(X, y)
    return X.loc[:, selector.get_support()]

In [None]:
from sklearn.impute import SimpleImputer

# Step 1: Drop text/date columns (optional: encode dates separately)
non_numeric_cols = X_train.select_dtypes(include=['object', 'category', 'datetime']).columns
X_train_numeric = X_train.drop(columns=non_numeric_cols)

# Step 2: Impute missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train_numeric), columns=X_train_numeric.columns)

# Step 3: Scale the features
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_imputed), columns=X_train_imputed.columns)

In [26]:
# 2. Lasso Feature Selection
X_lasso = lasso_feature_selection(X_train_scaled, y_train, alpha=0.01)
print("Lasso Selected Features:", X_lasso.columns)

Lasso Selected Features: Index(['bool_col_0', 'bool_col_1', 'bool_col_2', 'bool_col_3', 'int_col_0',
       'int_col_1', 'int_col_3', 'int_col_4', 'float_col_0', 'float_col_1',
       'float_col_2', 'float_col_3', 'float_col_4'],
      dtype='object')


In [27]:
# 3. RFE (Recursive Feature Elimination)
X_rfe = rfe_selection(X_train_scaled, y_train, estimator=RandomForestRegressor(), n_features=10, step=1)
print("RFE Selected Features:", X_rfe.columns)

RFE Selected Features: Index(['int_col_0', 'int_col_1', 'int_col_2', 'int_col_3', 'int_col_4',
       'float_col_0', 'float_col_1', 'float_col_2', 'float_col_3',
       'float_col_4'],
      dtype='object')


In [28]:
# 4. Random Forest Feature Importances
rf_importance = random_forest_importance(X_train_scaled, y_train, n_estimators=100)
print("Random Forest Feature Importances:", rf_importance)

Random Forest Feature Importances: float_col_0    0.182666
float_col_4    0.120645
float_col_2    0.112471
int_col_0      0.096471
int_col_1      0.085069
int_col_4      0.075175
float_col_3    0.074871
float_col_1    0.066909
int_col_3      0.057254
int_col_2      0.052044
bool_col_0     0.017514
bool_col_2     0.016165
bool_col_4     0.015824
bool_col_3     0.015686
bool_col_1     0.011237
text_col_0     0.000000
text_col_1     0.000000
text_col_2     0.000000
text_col_3     0.000000
text_col_4     0.000000
dtype: float64


In [29]:
# 5. Variance Inflation Factor (VIF)
X_vif = vif_selection(X_train_scaled, thresh=5.0)
print("VIF Selected Features:", X_vif['feature'].values)

VIF Selected Features: ['bool_col_0' 'bool_col_1' 'bool_col_2' 'bool_col_3' 'bool_col_4'
 'int_col_0' 'int_col_1' 'int_col_2' 'int_col_3' 'int_col_4' 'float_col_0'
 'float_col_1' 'float_col_2' 'float_col_3' 'float_col_4']


  return 1 - self.ssr/self.uncentered_tss


In [None]:
# 6. Variance Threshold Selector
X_variance_threshold = variance_threshold_selector(X_train_scaled, threshold=0.0)
print("Variance Threshold Selected Features:", X_variance_threshold.columns)

Variance Threshold Selected Features: Index(['bool_col_0', 'bool_col_1', 'bool_col_2', 'bool_col_3', 'bool_col_4',
       'int_col_0', 'int_col_1', 'int_col_2', 'int_col_3', 'int_col_4',
       'float_col_0', 'float_col_1', 'float_col_2', 'float_col_3',
       'float_col_4'],
      dtype='object')


In [None]:
# 7. Univariate Feature Selection (using Chi-Square)
X_univariate = univariate_feature_selection(X_train_scaled, y_train, score_func=f_regression, k=10)
print("Univariate (f_regression) Selected Features:", X_univariate.columns)

Univariate (f_regression) Selected Features: Index(['bool_col_2', 'bool_col_3', 'int_col_0', 'int_col_1', 'int_col_3',
       'int_col_4', 'float_col_0', 'float_col_2', 'float_col_3',
       'float_col_4'],
      dtype='object')


  correlation_coefficient /= X_norms


In [36]:
# 8. KNN Feature Importance
X_knn = knn_feature_importance(X_train_scaled, y_train, n_neighbors=5, n_features=10, random_state=42)
print("KNN Selected Features:", X_knn)

KNN Selected Features: ['float_col_2', 'float_col_0', 'int_col_3', 'int_col_0', 'int_col_4', 'float_col_1', 'float_col_4', 'int_col_2', 'bool_col_2', 'bool_col_4']
