In [1]:
import numpy as np
import pandas as pd

# Data preprocessing
def preprocess_data(data):
    # Convert categorical variables to numerical using one-hot encoding
    data = pd.get_dummies(data)
    return data

# Splitting the data
def train_test_split(inputs, target, test_size):
    indices = np.arange(len(inputs))
    np.random.shuffle(indices)
    split_index = int(len(inputs) * (1 - test_size))
    train_indices = indices[:split_index]
    test_indices = indices[split_index:]
    X_train = inputs.iloc[train_indices].values
    X_test = inputs.iloc[test_indices].values
    y_train = target[train_indices]
    y_test = target[test_indices]
    return X_train, X_test, y_train, y_test

# Linear regression
def linear_regression(X_train, y_train, X_test):
    # Add a column of ones to the inputs for the intercept term
    ones = np.ones((X_train.shape[0], 1))
    X_train = np.concatenate((ones, X_train), axis=1)
    ones = np.ones((X_test.shape[0], 1))
    X_test = np.concatenate((ones, X_test), axis=1)
    
    # Calculate the coefficients using the normal equation
    X_transpose = np.transpose(X_train)
    theta = np.linalg.inv(X_transpose.dot(X_train)).dot(X_transpose).dot(y_train)
    
    # Make predictions
    predictions = X_test.dot(theta)
    return predictions

# Load and preprocess the data
data = pd.read_csv('Medical Price Dataset.csv')
data = preprocess_data(data)

# Split the data into inputs and target
inputs = data.drop('charges', axis=1)
target = data['charges']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2)


# Run linear regression
predictions = linear_regression(X_train, y_train, X_test)
print("Predictions:", predictions)


Predictions: [ 744870.26477117 1085949.70587379  941478.26477117  840189.70587379
  930301.70587379  194610.00141112 1226802.00141112  691148.86780791
  695718.26477117 1023398.26477117 1056166.26477117 1167869.70587379
 1184253.70587379  502732.86780791  969676.86780791 1316914.00141112
 1117132.86780791  805836.86780791  867750.26477117  848381.70587379
  449958.26477117  990630.26477117  756684.86780791 1054770.00141112
 1233405.70587379 1072550.26477117  628786.00141112 1053181.70587379
  522290.00141112  805836.86780791  712102.26477117 1039782.26477117
  872957.70587379  974246.26477117  677938.00141112 1144882.00141112
  938493.70587379  841778.00141112  818598.26477117 1430013.70587379
 1004029.70587379  916902.26477117 1146278.26477117 1020413.70587379
 1138086.26477117  907314.00141112  712102.26477117 1117132.86780791
 1010636.86780791  620594.00141112  695718.26477117  817202.00141112
  854988.86780791  617420.86780791  292914.00141112  669746.00141112
  872957.70587379  64

In [3]:
# Root Mean Squared Error (RMSE)
def calculate_rmse(predictions, targets):
    mse = np.mean((predictions - targets) ** 2)
    rmse = np.sqrt(mse)
    return rmse

# Calculate RMSE of the model
rmse = calculate_rmse(predictions, y_test)
print("RMSE:", rmse)

RMSE: 867621.243061723
