In [1]:
# train_pipeline.py
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import r2_score

In [5]:
# 1. Load
df = pd.read_csv("50_Startups.csv")
# 2. Define raw feature names (these are columns as in the original CSV)
numeric_features = ["R&D Spend", "Administration", "Marketing Spend"]
categorical_features = ["State"]
target_column = "Profit"

In [6]:

# 3. Train/test split (use raw df columns)
X = df[numeric_features + categorical_features]
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [9]:
# 4. Preprocessor: scale numeric, one-hot encode categorical (drop='first' optional)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False), categorical_features),
    ],
    remainder="drop"
)



In [14]:
preprocessor

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [11]:
# 5. Pipeline: preprocessor + estimator
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimator", SVR(kernel="rbf", C=1000))   # tune C via CV in practice
])


In [16]:
# pipeline is already fitted
num_transformer = pipeline.named_steps["preprocessor"].named_transformers_["num"]
X_test_scaled_numeric = num_transformer.transform(X_test[numeric_features])
print("Scaled numeric shape:", X_test_scaled_numeric.shape)
print("First row scaled numeric:", X_test_scaled_numeric[0])


Scaled numeric shape: (13, 3)
First row scaled numeric: [-0.13911607  2.26499766 -0.66798559]


In [17]:
preprocessor = pipeline.named_steps["preprocessor"]
X_test_transformed = preprocessor.transform(X_test)   # returns numpy array (dense, because we set sparse=False)
print("Preprocessed shape:", X_test_transformed.shape)


Preprocessed shape: (13, 5)


In [12]:
# 6. Fit
pipeline.fit(X_train, y_train)   # y_train is 1D series — fine

# 7. Evaluate
y_pred = pipeline.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R2 score on test set: {r2:.4f}")


R2 score on test set: 0.0695


In [13]:
# 8. Save the pipeline and metadata
bundle = {
    "pipeline": pipeline,
    "meta": {
        "raw_features": numeric_features + categorical_features,
        "target": target_column
    }
}
joblib.dump(bundle, "pipeline_v1.joblib")
print("Saved pipeline to pipeline_v1.joblib")

Saved pipeline to pipeline_v1.joblib


In [None]:
#we normally scale only the inputs (X), not the outputs (y) — unless you explicitly use something like TransformedTargetRegressor.