In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
import pickle

In [20]:
np.__version__

'1.26.4'

In [22]:
import flask
flask.__version__

  flask.__version__


'3.1.0'

In [8]:
# Load data with header names from the CSV file.
# Ensure that 'Loan_data_ver2.csv' is in the same directory as this script.
data = np.genfromtxt('Loan_data_ver2.csv', delimiter=',', names=True, dtype=None, encoding='utf-8')

# Get the header names (fields) from the structured array.
header_names = data.dtype.names

# Assume that the last column is the target variable.
# Create the feature array by stacking all columns except the last.
X = np.column_stack([data[name] for name in header_names[:-1]])

# Extract the target variable.
y = data[header_names[-1]]

In [11]:
X[0,:]

array(['No', 'Graduate', '5849', '-1', '1'], dtype='<U12')

In [16]:
# ---------------------------
# Step 2: Process Features
# ---------------------------
# Since we don't have pandas DataFrame, we need to work with numpy arrays.
# You need to know which columns are numeric and which are categorical.
# For this example, let's assume:
# - Numeric columns: indices 0, 1, 2
# - Categorical columns: indices 3, 4

# # Extract numeric and categorical features using numpy indexing
# X_numeric = X[:, [2, 3, 4]].astype(float)  # converting to float if needed
# X_categorical = X[:, [0, 1]]  # these remain as strings

# Extract numeric and categorical features using numpy indexing
X_numeric = X[:, [2, 3, 4]].astype(float)  # Ensure numeric columns are float
X_categorical = X[:, [0, 1]].astype(object)  # Convert to object dtype for categorical processing


In [17]:
# ---------------------------
# Step 3: Build Preprocessing Pipelines
# ---------------------------
# Numeric pipeline: impute missing values and scale features
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # or use KNNImputer
    ('scaler', StandardScaler())
])

# Categorical pipeline: impute missing values and one-hot encode
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Since ColumnTransformer expects column indices or names, we'll build separate pipelines.
# We'll fit each pipeline on the corresponding numpy arrays.
X_numeric_transformed = numeric_pipeline.fit_transform(X_numeric)
X_categorical_transformed = categorical_pipeline.fit_transform(X_categorical).toarray()

#X_categorical_transformed = categorical_pipeline.fit_transform(X_categorical).toarray()

# Combine the processed numeric and categorical features using numpy concatenation
X_processed = np.concatenate([X_numeric_transformed, X_categorical_transformed], axis=1)

In [18]:

# ---------------------------
# Step 4: Split Data
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# ---------------------------
# Step 5: Build and Train the Model
# ---------------------------
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# ---------------------------
# Step 6: Save the Pipeline
# ---------------------------
# Since we manually processed the data, you can save both the preprocessing components and the model.
# Here, we save a dictionary containing the pipelines and the model.
pipeline = {
    'numeric_pipeline': numeric_pipeline,
    'categorical_pipeline': categorical_pipeline,
    'categorical_indices': [0, 1],  # indices for categorical features in original X
    'numeric_indices': [2, 3, 4],
    'model': model
}

with open('full_pipeline', 'wb') as f:
    pickle.dump(pipeline, f)

print("Pipeline saved successfully!")


Pipeline saved successfully!
