In [84]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [37]:
insurance = pd.read_csv("insurance.csv")
print(insurance.info)
print("now data types -------")
print(insurance.dtypes)

<bound method DataFrame.info of       age     sex   bmi  children smoker     region  expenses
0      19  female  27.9         0    yes  southwest  16884.92
1      18    male  33.8         1     no  southeast   1725.55
2      28    male  33.0         3     no  southeast   4449.46
3      33    male  22.7         0     no  northwest  21984.47
4      32    male  28.9         0     no  northwest   3866.86
...   ...     ...   ...       ...    ...        ...       ...
1333   50    male  31.0         3     no  northwest  10600.55
1334   18  female  31.9         0     no  northeast   2205.98
1335   18  female  36.9         0     no  southeast   1629.83
1336   21  female  25.8         0     no  southwest   2007.95
1337   61  female  29.1         0    yes  northwest  29141.36

[1338 rows x 7 columns]>
now data types -------
age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
expenses    float64
dtype: object


from above
age, bmi, children, expenses are numeric

sex,smoker region are categorical in nature

In [38]:
categorical_columns = ["sex","smoker","region"]
insurance[categorical_columns] = insurance[categorical_columns].astype("category")
print("Data types for different features")
print(insurance.dtypes)
print(".......................................")
print("Basic Info")
print(insurance.info())

Data types for different features
age            int64
sex         category
bmi          float64
children       int64
smoker      category
region      category
expenses     float64
dtype: object
.......................................
Basic Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       1338 non-null   int64   
 1   sex       1338 non-null   category
 2   bmi       1338 non-null   float64 
 3   children  1338 non-null   int64   
 4   smoker    1338 non-null   category
 5   region    1338 non-null   category
 6   expenses  1338 non-null   float64 
dtypes: category(3), float64(2), int64(2)
memory usage: 46.3 KB
None


1. expenses - continuous
2. age - discrete
3. children - discrete
4. sex - categorical
5. smoker - categorical (binary)
6. bmi - continuous (ratio)

In [39]:
""" 4. Consider all categorical available as character/string 
 and try to measure the number of
 observations in each category (i.e., number of observations in MALE)
"""

observation_count = dict()

for column in categorical_columns:
    observation_count[column] = insurance[column].value_counts().rename_axis("category").reset_index(name = "count")

observation_count

{'sex':   category  count
 0     male    676
 1   female    662,
 'smoker':   category  count
 0       no   1064
 1      yes    274,
 'region':     category  count
 0  southeast    364
 1  northwest    325
 2  southwest    325
 3  northeast    324}

In [40]:
print(insurance.nunique())

#getting the number of unique values of categorical data
for i in categorical_columns:
    print(insurance[i].unique())

#number of values of each feature value of categorical data
for i in categorical_columns:
    print(insurance[i].value_counts())

age           47
sex            2
bmi          275
children       6
smoker         2
region         4
expenses    1337
dtype: int64
['female', 'male']
Categories (2, object): ['female', 'male']
['yes', 'no']
Categories (2, object): ['no', 'yes']
['southwest', 'southeast', 'northwest', 'northeast']
Categories (4, object): ['northeast', 'northwest', 'southeast', 'southwest']
sex
male      676
female    662
Name: count, dtype: int64
smoker
no     1064
yes     274
Name: count, dtype: int64
region
southeast    364
northwest    325
southwest    325
northeast    324
Name: count, dtype: int64


In [41]:
# Select only numeric columns for correlation calculation
numeric_columns = insurance.select_dtypes(include=['int64', 'float64']).columns
insurance_corr = insurance[numeric_columns].corr()

# Calculate statistical properties for dataset-1
insurance_stats = insurance.describe().transpose()

# Print the results
print("insurance Stats:-")
print(insurance_stats)

print("\nCorrelation Coefficients for insurance")
print(insurance_corr)


insurance Stats:-
           count          mean           std      min        25%      50%  \
age       1338.0     39.207025     14.049960    18.00    27.0000    39.00   
bmi       1338.0     30.665471      6.098382    16.00    26.3000    30.40   
children  1338.0      1.094918      1.205493     0.00     0.0000     1.00   
expenses  1338.0  13270.422414  12110.011240  1121.87  4740.2875  9382.03   

                75%       max  
age          51.000     64.00  
bmi          34.700     53.10  
children      2.000      5.00  
expenses  16639.915  63770.43  

Correlation Coefficients for insurance
               age       bmi  children  expenses
age       1.000000  0.109341  0.042469  0.299008
bmi       0.109341  1.000000  0.012645  0.198576
children  0.042469  0.012645  1.000000  0.067998
expenses  0.299008  0.198576  0.067998  1.000000


## using some linear regression model here

In [87]:
#assuming "expenses" is the target variable, and the rest are input variables
X = insurance.drop(['expenses'], axis=1)
Y = insurance['expenses']

X_encoded = pd.get_dummies(X, drop_first=True)

#split the data
X_train_encoded, X_test_encoded, Y_train, Y_test = train_test_split(X_encoded, Y, test_size=0.2, random_state=22)

model1 = LinearRegression().fit(X_train_encoded, Y_train)

Y_predict = 

LinearRegression()
