In [13]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression , Ridge
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split




data = pd.read_csv('50_Startups.csv')
df = data.copy()



common_targets = ['Exited', 'Profit', 'Purchased', 'Target', 'Class', 'Label', 'y','Balance']
target_col = None


for name in common_targets:
    if name in df.columns:
        target_col = name
        break

if target_col is None:
    target_col = df.columns[-1]  # fallback

y = df[target_col]

# -----------------------------
# Drop ID-like columns
# -----------------------------
id_patterns = ['RowNumber', 'CustomerId', 'CustomerID', 'Surname', 'Id', 'ID', 'index']
id_columns = [c for c in id_patterns if c in df.columns]

# -----------------------------
# Feature matrix
# -----------------------------
drop_cols = [target_col] + id_columns
X = df.drop(columns=drop_cols, errors='ignore')






X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# -----------------------------
# Column type detection
# -----------------------------



def build_preprocessor(X):
 

    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

    transformers = []

    if categorical_cols:
        categorical_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OrdinalEncoder(
                handle_unknown='use_encoded_value',
                unknown_value=-1
            ))
        ])
        transformers.append(('cat', categorical_pipeline, categorical_cols))

    if numerical_cols:
        numerical_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        transformers.append(('num', numerical_pipeline, numerical_cols))

    if not transformers:
        raise ValueError("No valid features found for preprocessing")

    return ColumnTransformer(transformers)


preprocessor = build_preprocessor(X)




# Decide task type
is_classification = (
    y.dtype.kind in 'biu' and y.nunique() <= 10
)





if is_classification:
    print("Task detected: CLASSIFICATION")

    model = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000))
    ])

    param_grid = [
        {
            'classifier': [LogisticRegression(solver='liblinear')],
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C': [0.01, 0.1, 1.0, 10.0]
        }
    ]

    scoring = 'accuracy'

else:
    print("Task detected: REGRESSION")

    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', Ridge())
    ])

    param_grid = [
        {
            'regressor': [LinearRegression()]
        },
        {
            'regressor': [Ridge()],
            'regressor__alpha': [0.01, 0.1, 1.0, 10.0, 100.0]
        }
    ]

    scoring = 'r2'



grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=scoring,
    cv=5,
    n_jobs=-1

)



grid.fit(X_train, y_train)




y_pred = grid.predict(X_test)



Task detected: REGRESSION


In [14]:
y_pred = grid.predict(X_test)

np.set_printoptions(precision=2)

comparison = np.concatenate(
    (y_pred.reshape(-1, 1),
     y_test.values.reshape(-1, 1)),
    axis=1
)


In [15]:
comparison

array([[126698.1 , 134307.35],
       [ 84921.06,  81005.76],
       [ 98743.76,  99937.59],
       [ 46522.68,  64926.08],
       [129157.96, 125370.37],
       [ 50860.36,  35673.41],
       [108926.29, 105733.54],
       [100844.06, 107404.34],
       [ 97514.84,  97427.84],
       [112776.06, 122776.86]])

In [16]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

ValueError: continuous is not supported

In [11]:

best_model = grid.best_estimator_



def get_transformed_X(model, X):
    return model.named_steps['preprocessor'].transform(X)

Xt = get_transformed_X(best_model, X)