In [17]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import string

def generate_synthetic_data(rows=1000, cols=30, seed=42):
    """
    Builds a reproducible synthetic DataFrame with mixed column types.

    The `cols` feature columns are split as evenly as possible across six
    kinds, in this order: bool, int, float, text, category, date. When `cols`
    is not a multiple of six, the leftover columns go to the earliest kinds.
    A float `target` column is appended after the feature columns.

    Parameters:
    - rows: int, number of rows
    - cols: int, number of feature columns (the `target` column is extra)
    - seed: int, seed applied to the global NumPy and `random` generators

    Returns:
    - df: pd.DataFrame, synthetic dataset
    """
    # NumPy drives every column except the text ones, which use `random`.
    random.seed(seed)
    np.random.seed(seed)

    kinds = ['bool', 'int', 'float', 'text', 'category', 'date']
    share, leftover = divmod(cols, len(kinds))
    per_kind = {
        kind: share + (1 if position < leftover else 0)
        for position, kind in enumerate(kinds)
    }

    # The blocks below must stay in this order: each one consumes values from
    # the seeded global generators, so reordering would change the dataset.
    columns = {}

    columns.update(
        (f'bool_col_{n}', np.random.choice([True, False], size=rows))
        for n in range(per_kind['bool'])
    )

    columns.update(
        (f'int_col_{n}', np.random.randint(0, 1000, size=rows))
        for n in range(per_kind['int'])
    )

    columns.update(
        (f'float_col_{n}', np.random.uniform(0, 1000, size=rows))
        for n in range(per_kind['float'])
    )

    # Ten random ASCII letters per cell.
    for n in range(per_kind['text']):
        words = []
        for _ in range(rows):
            words.append(''.join(random.choices(string.ascii_letters, k=10)))
        columns[f'text_col_{n}'] = words

    for n in range(per_kind['category']):
        labels = np.random.choice(['A', 'B', 'C', 'D'], size=rows)
        columns[f'cat_col_{n}'] = pd.Series(labels).astype("category")

    # Dates are the base date plus a random offset of 0..3649 days.
    first_day = datetime(2020, 1, 1)
    for n in range(per_kind['date']):
        columns[f'date_col_{n}'] = [
            first_day + timedelta(days=np.random.randint(0, 3650))
            for _ in range(rows)
        ]

    columns['target'] = np.random.uniform(0, 1000, size=rows)

    return pd.DataFrame(columns).astype({'target': float})

# Build the reference dataset once (arguments spelled out for the reader),
# then work on a deep copy so `main_df` stays untouched by later cells.
main_df = generate_synthetic_data(rows=1000, cols=30, seed=42)
df = main_df.copy(deep=True)
df.head(n=5)


Unnamed: 0,bool_col_0,bool_col_1,bool_col_2,bool_col_3,bool_col_4,int_col_0,int_col_1,int_col_2,int_col_3,int_col_4,...,cat_col_1,cat_col_2,cat_col_3,cat_col_4,date_col_0,date_col_1,date_col_2,date_col_3,date_col_4,target
0,True,False,True,False,False,946,839,945,801,955,...,C,D,B,B,2025-08-29,2020-05-08,2028-12-02,2025-11-30,2029-12-21,12.137458
1,False,True,False,False,True,686,123,70,588,599,...,A,D,A,A,2027-09-07,2024-01-10,2025-05-17,2026-10-08,2020-09-21,488.643897
2,True,True,False,False,True,750,992,363,809,755,...,C,A,C,C,2025-02-06,2027-04-15,2026-12-04,2028-02-13,2020-11-28,644.205482
3,True,True,False,False,True,771,112,569,648,380,...,C,B,D,C,2025-06-02,2025-09-13,2022-07-10,2026-03-27,2026-02-13,918.191773
4,True,True,False,True,False,902,11,801,617,328,...,B,A,C,D,2024-10-02,2027-11-06,2023-12-01,2029-03-08,2022-12-22,381.335817


In [18]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LogisticRegression

def extract_datetime_features(df, datetime_column, prefix=None):
    """
    Extracts calendar features from a datetime column: year, month, day,
    dayofweek, and a weekend flag. The source column is removed from the result.

    Parameters:
    - df (pd.DataFrame): Input dataframe (not modified).
    - datetime_column (str): Column to parse with `pd.to_datetime`.
    - prefix (str, optional): Text prepended to every generated column name.
      With the default (None) the names are `year`, `month`, `day`,
      `dayofweek` and `is_weekend`, so calling the function for a second
      datetime column overwrites the features of the first one. Pass a
      distinct prefix per column (e.g. f"{datetime_column}_") to keep them all.

    Returns:
    - pd.DataFrame: Copy of `df` without `datetime_column`, with the five
      feature columns added.
    """
    label = "" if prefix is None else str(prefix)

    # Parse into a detached Series and drop the source column up front, so a
    # source column that is itself called e.g. "year" can neither break the
    # later `.dt` lookups nor cause its own feature to be dropped afterwards.
    timestamps = pd.to_datetime(df[datetime_column])
    df_copy = df.drop(columns=[datetime_column])

    weekday = timestamps.dt.dayofweek
    df_copy[f'{label}year'] = timestamps.dt.year
    df_copy[f'{label}month'] = timestamps.dt.month
    df_copy[f'{label}day'] = timestamps.dt.day
    df_copy[f'{label}dayofweek'] = weekday
    df_copy[f'{label}is_weekend'] = (weekday >= 5).astype(int)  # 5=Saturday, 6=Sunday

    return df_copy


def log_transform_skewed_features(df, skew_threshold=0.75):
    """
    Replaces strongly skewed numeric columns with log(x + 1) of their values.

    A numeric column is transformed only when none of its values is <= 0 and
    the absolute value of its skewness exceeds `skew_threshold`. Columns that
    contain any non-positive value are left untouched as a whole. If
    processing a column raises, the column is skipped and a message is printed.

    Parameters:
    - df (pd.DataFrame): Input dataframe (not modified).
    - skew_threshold (float): Minimum absolute skewness that triggers the transform.

    Returns:
    - pd.DataFrame: Copy of `df` with the qualifying columns transformed.
    """
    transformed = df.copy()

    for name in transformed.select_dtypes(include=[np.number]).columns:
        try:
            series = transformed[name]
            has_non_positive = (series <= 0).any()
            if not has_non_positive:
                skew_value = series.skew()
                is_skewed = isinstance(skew_value, (int, float)) and abs(skew_value) > skew_threshold
                if is_skewed:
                    transformed[name] = np.log1p(series)  # log(x + 1)
        except Exception as e:
            print(f"Skipped {name} due to error: {e}")

    return transformed


def generate_polynomial_features(df, degree=2):
    """
    Generates polynomial features for numeric columns in the dataframe.

    Only the newly created terms (total degree greater than one, e.g. `a^2`
    and `a b`) are appended. The degree-1 terms that PolynomialFeatures also
    emits are copies of the input columns; appending them would give every
    numeric column a duplicate label in the result.

    Parameters:
    - df (pd.DataFrame): Input dataframe (not modified).
    - degree (int): Degree of polynomial features.

    Returns:
    - pd.DataFrame: Original dataframe with polynomial features appended.
      If there are no numeric columns, an unchanged copy is returned.
    """
    df_copy = df.copy()
    numeric_cols = df_copy.select_dtypes(include=[np.number]).columns

    if len(numeric_cols) == 0:
        print("No numeric columns found for polynomial feature generation.")
        return df_copy

    poly = PolynomialFeatures(degree=degree, include_bias=False)

    # Use only numeric columns to fit and transform
    X_numeric = df_copy[numeric_cols]
    poly_features = poly.fit_transform(X_numeric)

    # Feature names are derived from the fitted input columns
    feature_names = np.asarray(poly.get_feature_names_out(input_features=X_numeric.columns))

    # powers_[i, j] is the exponent of input j in output term i, so the row
    # sum is the term's total degree; degree-1 rows are the inputs themselves.
    is_new_term = np.asarray(poly.powers_).sum(axis=1) > 1

    df_poly = pd.DataFrame(
        poly_features[:, is_new_term],
        columns=feature_names[is_new_term],
        index=df_copy.index,
    )

    return pd.concat([df_copy, df_poly], axis=1)


def create_interaction_terms(df, cols):
    """
    Adds one product column for every unordered pair of the requested columns.

    Names in `cols` that are not columns of `df` are ignored. For each pair
    (taken in the order the names appear in `cols`) the element-wise product
    is stored under `<first>_x_<second>`.

    Parameters:
    - df (pd.DataFrame): Input dataframe (not modified).
    - cols (list): Column names to combine pairwise.

    Returns:
    - pd.DataFrame: Copy of `df` with the interaction columns added.
    """
    result = df.copy()

    # Keep only the names that actually exist in the frame
    usable = [name for name in cols if name in result.columns]

    # Pair each column only with the ones after it: no self-products, no mirrored pairs
    for position, left in enumerate(usable):
        for right in usable[position + 1:]:
            product = result[left].values * result[right].values
            result[f'{left}_x_{right}'] = list(product)

    return result


def frequency_encoding(df, col):
    """
    Adds `<col>_encoded`, holding each row's category share.

    The share is the number of rows carrying that category divided by the
    total number of rows in `df`. The input frame is not modified.
    """
    encoded = df.copy()
    source = encoded[col]
    share_by_category = source.value_counts() / len(encoded)
    encoded[f'{col}_encoded'] = source.map(share_by_category)
    return encoded


def apply_pca(df, n_components=2):
    """
    Applies PCA to the numeric columns and appends the components as new columns.

    The numeric columns are passed to PCA as they are; no scaling is done here.

    Parameters:
    - df (pd.DataFrame): Input dataframe (not modified).
    - n_components: Forwarded unchanged to `PCA(n_components=...)`.

    Returns:
    - pd.DataFrame: Copy of `df` with columns `PCA_1`, `PCA_2`, ... appended,
      one per component returned by the fitted PCA.
    """
    df_copy = df.copy()
    numeric_cols = df_copy.select_dtypes(include=[np.number]).columns
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(df_copy[numeric_cols])

    # Name the columns from the width of the result rather than from
    # `n_components`, which may not be a plain integer.
    component_names = [f'PCA_{i + 1}' for i in range(pca_result.shape[1])]

    # Reuse the input index: with a default RangeIndex the concat below would
    # align on labels and produce NaN-padded rows for any non-default index.
    pca_df = pd.DataFrame(pca_result, columns=component_names, index=df_copy.index)

    return pd.concat([df_copy, pca_df], axis=1)


In [5]:
# Rebuild from the untouched dataset so this cell gives the same result every
# time it runs, instead of stacking transforms on an already-transformed `df`.
df = main_df.copy()

# Hold the label out of feature engineering; otherwise it would leak into the
# log / polynomial / PCA features. It is re-attached at the end of the cell.
target_values = df.pop("target")

# Auto-identify column types ("number" covers every integer/float width,
# not just int64/float64; booleans are not included)
numeric_cols = df.select_dtypes(include="number").columns.tolist()
categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
date_cols = df.select_dtypes(include=["datetime"]).columns.tolist()

# Create interaction terms from the first two numeric columns
# (you could rank pairs by correlation instead if needed)
interaction_cols = numeric_cols[:2]
df = create_interaction_terms(df, interaction_cols)

# Frequency encode high cardinality categorical columns
for col in categorical_cols:
    if df[col].nunique() > 10:
        df = frequency_encoding(df, col)

# Extract datetime features. Each date column is processed on its own and its
# features are prefixed with the column name; otherwise every column would
# write to the same `year` / `month` / ... names and only the last would survive.
for date_col in date_cols:
    date_features = extract_datetime_features(df[[date_col]], date_col)
    df = df.drop(columns=[date_col]).join(date_features.add_prefix(f"{date_col}_"))

# Log transform skewed features
df[numeric_cols] = log_transform_skewed_features(df[numeric_cols], skew_threshold=0.75)

# Generate polynomial features from the base numeric columns and append only
# the new terms, so the columns engineered above are kept and the base
# numeric columns are not duplicated.
poly_df = generate_polynomial_features(df[numeric_cols], degree=2)
df = pd.concat([df, poly_df.drop(columns=numeric_cols)], axis=1)

# PCA over all numeric features (note: no standardization is applied first)
df = apply_pca(df, n_components=5)

# Put the label back as the last column
df["target"] = target_values
