In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import os

# Location of the insurance CSV. The original cell hard-coded an absolute path
# that only exists on one machine; set the INSURANCE_CSV environment variable
# to point somewhere else. The previous location is kept as the default so
# existing setups keep working unchanged.
DATA_PATH = os.environ.get('INSURANCE_CSV', 'D:\\Datasets\\insurance.csv')

df = pd.read_csv(DATA_PATH)
# Preview the first rows to confirm the file parsed as expected.
df.head()

In [None]:
# Print the frame's structural summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Show descriptive statistics for the frame as the cell's output.
df.describe()

In [None]:
# Count missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

In [None]:
# A notebook cell only displays its final expression, so listing three
# separate `.unique()` calls showed the `region` values and silently threw
# away the other two. Collect all three into one mapping so every column's
# distinct values appear in the cell output.
{col: df[col].unique() for col in ('sex', 'smoker', 'region')}

In [None]:
# Only the last expression of a cell is rendered, so the original three bare
# `.value_counts()` calls displayed the `region` counts alone. Stack the
# per-column counts into a single Series indexed by (column, value) so all
# three distributions are shown together.
pd.concat(
    {col: df[col].value_counts() for col in ('sex', 'smoker', 'region')},
    names=['column', 'value'],
)

In [None]:
# Encode the categorical columns as numbers: `sex` and `smoker` become 0/1 and
# `region` is one-hot encoded with the first level dropped.
#
# The original cell overwrote `sex`/`smoker` in place, so running it a second
# time mapped the already-encoded 0/1 values to NaN and then raised KeyError
# because `region` no longer existed. The presence of `region` is used as the
# "not yet encoded" marker, which makes re-running the cell a no-op.
if 'region' in df.columns:
    encoded = df.assign(
        sex=df['sex'].map({'male': 0, 'female': 1}),
        smoker=df['smoker'].map({'no': 0, 'yes': 1}),
    )
    # `Series.map` converts any label that is not a key of the dict into NaN;
    # fail here with a clear message instead of carrying hidden NaNs forward.
    for col in ('sex', 'smoker'):
        assert encoded[col].isna().sum() == df[col].isna().sum(), (
            f'unexpected label in column {col!r}'
        )
    df = pd.get_dummies(encoded, columns=['region'], drop_first=True)

In [None]:
# Pairwise correlation of every column in `df`, drawn as an annotated heatmap.
# Compared with the original: the figure now has a title so it stands alone,
# a larger canvas so the per-cell annotations have room, and a fixed
# two-decimal annotation format.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='Reds', ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()

In [None]:
# Scatter of charges against age, with points coloured by the `smoker` column.
ax = sns.scatterplot(x='age', y='charges', hue='smoker', data=df)
ax.set_title('Charges vs Age')
plt.show()

In [None]:
# Histogram of the `age` column using 30 bins.
ax = sns.histplot(data=df, x='age', bins=30)
ax.set_title('Age Distribution')
plt.show()

In [None]:
# Histogram of the `bmi` column using 30 bins.
ax = sns.histplot(data=df, x='bmi', bins=30)
ax.set_title('BMI Distribution')
plt.show()

In [None]:
# Bar chart of `charges` grouped by the `smoker` column.
ax = sns.barplot(data=df, x='smoker', y='charges')
ax.set_title('Average Charges by Smoking Status')
plt.show()

In [None]:
# Target is the `charges` column; the features are every remaining column.
y = df['charges']
X = df.drop(columns=['charges'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed `random_state` keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression

# `fit` returns the estimator itself, so construction and fitting can be
# chained; the trailing bare name keeps the fitted model as the cell output.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Predict charges for the held-out test features.
y_pred = model.predict(X=X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Compute each error measure on the test split; RMSE is the square root of
# the MSE, so the MSE value is reused for it.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)
print('R2:', r2)

In [None]:
# Scatter of the true test targets (x) against the model's predictions (y).
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel('Actual Charges')
ax.set_ylabel('Predicted Charges')
ax.set_title('Actual vs Predicted Charges')
plt.show()