# In [7]:
import csv
import numpy as np
import pickle

# Load dataset
def load_csv(filename):
    """Read a CSV file and split it into a header row and data rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(headers, rows)`` tuple. ``headers`` is the first row of the file
        as a list of strings and ``rows`` is a list holding every remaining
        row. An empty file yields ``([], [])``.
    """
    # newline='' hands line-ending handling to the csv module, so newlines
    # embedded inside quoted fields are kept as written instead of being
    # translated by the text layer.
    with open(filename, 'r', newline='') as file:
        reader = csv.reader(file)
        headers = next(reader, None)
        if headers is None:
            # Empty file: there is no header row to return.
            return [], []
        return headers, list(reader)

# Encode categorical variables manually
def label_encode_column(column):
    """Map each distinct value in ``column`` to an integer code.

    Codes are assigned in order of first appearance (0 for the first distinct
    value, 1 for the next, ...). This keeps the mapping reproducible: building
    it from a ``set`` would depend on set iteration order, which for strings
    can differ from one interpreter run to the next.

    Args:
        column: Iterable of hashable values (may be iterated more than once).

    Returns:
        A ``(codes, encoding)`` tuple. ``codes`` is a list with the integer
        code of every element of ``column`` in order; ``encoding`` is the
        ``{value: code}`` dict that was used.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    encoding = {val: idx for idx, val in enumerate(dict.fromkeys(column))}
    return [encoding[val] for val in column], encoding

# Standardize numerical features
def standardize_features(X):
    """Scale every column of ``X`` to zero mean and unit standard deviation.

    Args:
        X: Array-like of numeric values; it is converted to a float array
            and statistics are taken along axis 0.

    Returns:
        A ``(X_scaled, means, stds)`` tuple. ``means`` and ``stds`` are the
        per-column statistics used for scaling. For a column with zero
        standard deviation the returned divisor is 1.0, so that column is
        only centred (all zeros) instead of becoming NaN.
    """
    X = np.array(X, dtype=float)
    means = np.mean(X, axis=0)
    stds = np.std(X, axis=0)
    # A constant column has std == 0; dividing by it would produce NaN.
    stds = np.where(stds == 0, 1.0, stds)
    return (X - means) / stds, means, stds

# Train-test split function
def train_test_split(X, y, test_size=0.3):
    """Cut ``X`` and ``y`` into a leading training part and a trailing test part.

    The data is split positionally, without shuffling: the first
    ``int(len(X) * (1 - test_size))`` items form the training portion and the
    rest form the test portion.

    Args:
        X: Sliceable sequence of samples.
        y: Sliceable sequence of targets, cut at the same position as ``X``.
        test_size: Fraction of the samples assigned to the test portion.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    train_fraction = 1 - test_size
    cut = int(len(X) * train_fraction)
    X_train, X_test = X[:cut], X[cut:]
    y_train, y_test = y[:cut], y[cut:]
    return X_train, X_test, y_train, y_test

# Custom Decision Tree Regressor
class DecisionTreeRegressorCustom:
    """Regression tree grown by exhaustive search over feature thresholds.

    After ``fit`` the tree is stored in ``self.tree`` as nested tuples
    ``(feature_index, threshold, left_subtree, right_subtree)``. A leaf is
    not a tuple: it is the mean target value of the training samples that
    reached it.
    """

    def __init__(self, min_samples_split=2):
        """Create an unfitted tree.

        Args:
            min_samples_split: A node holding fewer samples than this is
                turned into a leaf instead of being split further.
        """
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like with one target value per sample.

        Raises:
            ValueError: If ``X`` is not 2-D, ``y`` is not 1-D, their lengths
                differ, or there are no samples.
        """
        # Tree building relies on NumPy indexing (X[:, f], boolean masks),
        # so plain Python lists are converted up front.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if y.ndim != 1:
            raise ValueError("y must be 1-dimensional")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        if len(y) == 0:
            raise ValueError("cannot fit a tree on an empty dataset")
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y):
        """Recursively build and return the subtree for the samples ``X, y``."""
        # Stop when the node is too small to split or all targets are equal.
        if len(y) < self.min_samples_split or len(set(y)) == 1:
            return np.mean(y)
        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            # No threshold separates the samples into two non-empty groups.
            return np.mean(y)
        column = X[:, best_feature]
        left_idx = column <= best_threshold
        right_idx = column > best_threshold
        left_subtree = self._build_tree(X[left_idx], y[left_idx])
        right_subtree = self._build_tree(X[right_idx], y[right_idx])
        return (best_feature, best_threshold, left_subtree, right_subtree)

    def _find_best_split(self, X, y):
        """Return the ``(feature, threshold)`` with the lowest split cost.

        The cost of a split is the size-weighted sum of the target variances
        of both sides. Every distinct value of every feature is tried as a
        threshold. Returns ``(None, None)`` when no candidate leaves samples
        on both sides.
        """
        best_feature, best_threshold, best_variance = None, None, float('inf')
        for feature in range(X.shape[1]):
            column = X[:, feature]
            for threshold in np.unique(column):
                y_left = y[column <= threshold]
                y_right = y[column > threshold]
                if len(y_left) == 0 or len(y_right) == 0:
                    continue
                variance = np.var(y_left) * len(y_left) + np.var(y_right) * len(y_right)
                # Strict '<' keeps the first candidate found on ties.
                if variance < best_variance:
                    best_feature, best_threshold, best_variance = feature, threshold, variance
        return best_feature, best_threshold

    def predict_one(self, x, node):
        """Return the prediction for one sample ``x`` starting at ``node``."""
        if not isinstance(node, tuple):
            # Leaves are stored as plain values rather than tuples.
            return node
        feature, threshold, left, right = node
        if x[feature] <= threshold:
            return self.predict_one(x, left)
        else:
            return self.predict_one(x, right)

    def predict(self, X):
        """Predict a target value for every sample in ``X``.

        Args:
            X: Iterable of samples, each indexable by feature position.

        Returns:
            A NumPy array with one prediction per sample.

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        if self.tree is None:
            # Without this check every prediction would silently be None.
            raise RuntimeError("fit must be called before predict")
        return np.array([self.predict_one(x, self.tree) for x in X])

# Load and process dataset
headers, data = load_csv('Housing_Data.csv')
data = np.array(data)

# Separate features and target
price_idx = headers.index('price')
y = data[:, price_idx].astype(float)
X = np.delete(data, price_idx, axis=1)
# Names aligned with the columns of X. Removing 'price' shifts every later
# column one position to the left, so headers[i] only names X[:, i] when
# 'price' happens to be the last column.
feature_names = [name for idx, name in enumerate(headers) if idx != price_idx]

# Encode categorical columns
label_encoders = {}
for i in range(X.shape[1]):
    # A column is treated as categorical when its first value is not made of
    # digits with at most one '.'. NOTE: only the first row is inspected, and
    # signed or exponent-style numbers fail this check.
    if not X[:, i][0].replace('.', '', 1).isdigit():
        X[:, i], label_encoders[feature_names[i]] = label_encode_column(X[:, i])
X = X.astype(float)

# Standardize features
# NOTE: the statistics are computed over all rows, i.e. before the split below.
X_scaled, means, stds = standardize_features(X)

# Split dataset (positional split; rows keep their file order)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3)

# Train model
model = DecisionTreeRegressorCustom()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
# Coefficient of determination: 1 - (residual sum of squares / total sum of squares)
r2 = 1 - (np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2))

# Save model and preprocessing objects
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump({'means': means, 'stds': stds}, f)
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

print("✅ Model, Scaler, and Label Encoders saved successfully.")
print(f"🎯 Model Accuracy (R² Score): {r2 * 100:.2f}%")


# Output:
# ✅ Model, Scaler, and Label Encoders saved successfully.
# 🎯 Model Accuracy (R² Score): 99.95%
