In [None]:
'''Program to implement decision trees using any standard dataset available in the public
domain and find the accuracy of the algorithm.'''

# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# One seed shared by the train/test split and the tree itself, so that a
# fresh "Restart Kernel -> Run All" reproduces the same accuracy figure.
RANDOM_STATE = 42

# Load the Iris dataset bundled with scikit-learn
data = load_iris()

X = data.data      # feature matrix
y = data.target    # class labels

# Split the dataset into training and testing sets (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Create a Decision Tree Classifier.
# random_state is passed explicitly: scikit-learn permutes the candidate
# features at every split, so without a seed ties between equally good
# splits can be broken differently from one run to the next.
dt_model = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Train the Decision Tree model on the training portion only
dt_model.fit(X_train, y_train)

# Make predictions on the held-out test data
y_pred = dt_model.predict(X_test)

# Calculate the accuracy (fraction of test samples classified correctly)
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy
print(f'Accuracy of Decision Tree Classifier: {accuracy}')

# plot the decision tree
# from sklearn.tree import plot_tree
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12,8))
# plot_tree(dt_model, feature_names=data.feature_names, class_names=data.target_names, filled=True)
# plt.title('Decision Tree on Iris Dataset')
# plt.show()

Accuracy of Decision Tree Classifier: 1.0


In [None]:
'''
1. Write a program to predict the percentage of heart disease (the dependent variable) based
on two independent variables, percentage of people biking to town and percentage of people
smoking using Multiple Linear Regression Technique and evaluate its performance. Given a
data set of 498 items.
data set - heart.csv
'''
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np

# Read the raw CSV into a DataFrame.
# NOTE: the task statement above names the file "heart.csv"; this cell reads
# 'heart.data.csv'.
heart_data = pd.read_csv('heart.data.csv')

# Show a preview of the frame exactly as loaded (index column still present)
print("First 5 rows of the dataset:")
print(heart_data.head())

# The first column is a saved row index rather than a feature; remove it by
# position so the frame holds only the variables used by the model.
index_column = heart_data.columns[0]
heart_data = heart_data.drop(columns=[index_column])

# Independent variables (features) and dependent variable (target)
feature_names = ['biking', 'smoking']
target_name = 'heart.disease'
X = heart_data[feature_names]
y = heart_data[target_name]

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the Multiple Linear Regression model (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Compute each metric once, then report them in a fixed order.
mse = metrics.mean_squared_error(y_test, y_pred)
performance_report = [
    ('R-squared:', metrics.r2_score(y_test, y_pred)),
    ('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_pred)),
    ('Mean Squared Error (MSE):', mse),
    ('Root Mean Squared Error (RMSE):', np.sqrt(mse)),
]

print("\nModel Performance Evaluation:")
for label, value in performance_report:
    print(label, value)

# Fitted parameters; coefficients follow the column order of X
# (biking first, then smoking).
print("\nModel Coefficients:")
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

First 5 rows of the dataset:
   Unnamed: 0     biking    smoking  heart.disease
0           1  30.801246  10.896608      11.769423
1           2  65.129215   2.219563       2.854081
2           3   1.959665  17.588331      17.177803
3           4  44.800196   2.802559       6.816647
4           5  69.428454  15.974505       4.062224

Model Performance Evaluation:
R-squared: 0.9751106638392245
Mean Absolute Error (MAE): 0.5210966863808624
Mean Squared Error (MSE): 0.4522479819108037
Root Mean Squared Error (RMSE): 0.6724938526936909

Model Coefficients:
Intercept: 15.01718202768216
Coefficients: [-0.20076448  0.17827087]


In [None]:
'''
2. Write a program to predict medical expenses (the dependent variable) based on the independent
variables, age, sex, bmi, children, smoker and region using Multiple Linear Regression
Technique and evaluate its performance. Given a data set of 1338 items.
data set - insurance.csv
'''

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Load the dataset
DATA_PATH = 'insurance.csv'
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Raise instead of print + exit(): calling exit() inside a notebook asks
    # the kernel to shut down, discarding every variable from earlier cells.
    raise FileNotFoundError(
        f"'{DATA_PATH}' not found. Please ensure the dataset is in the correct directory."
    ) from err

# --- 1. Data Preprocessing ---
target_column = 'expenses'

# Fail early, and show the columns that ARE present, if the target is missing.
if target_column not in df.columns:
    raise KeyError(
        f"The target column '{target_column}' was not found in the DataFrame. "
        f"Available columns: {list(df.columns)}. "
        "Please update the 'target_column' variable."
    )

# Define independent (X) and dependent (y) variables
X = df.drop(columns=[target_column])
y = df[target_column]

# Identify categorical features to be encoded
categorical_features = ['sex', 'smoker', 'region']

# One-hot encode the categorical columns and pass every other column through
# unchanged. drop='first' omits one dummy column per category so the encoded
# columns are not perfectly collinear with the regression intercept.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ],
    remainder='passthrough'
)

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# --- 2. Model Training ---

# Bundling the encoder and the regressor in one pipeline means the encoder is
# fitted on the training rows only and then reused unchanged at predict time.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

# Train the model
model.fit(X_train, y_train)

print("Multiple Linear Regression model trained successfully.\n")


# --- 3. Model Prediction ---

# Make predictions on the test set
y_pred = model.predict(X_test)


# --- 4. Model Evaluation ---

# Calculate the performance metrics on the held-out test set
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# NOTE(review): MSE is expressed in squared target units, so the "$" prefix on
# that line is nominal only; the report text is left unchanged on purpose.
print("--- Model Performance Evaluation ---")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"Mean Squared Error (MSE): ${mse:,.2f}")
print(f"Root Mean Squared Error (RMSE): ${rmse:,.2f}")
print(f"R-squared (R²): {r2:.4f}")


Multiple Linear Regression model trained successfully.

--- Model Performance Evaluation ---
Mean Absolute Error (MAE): $4,181.56
Mean Squared Error (MSE): $33,600,065.36
Root Mean Squared Error (RMSE): $5,796.56
R-squared (R²): 0.7836
