In [207]:
import pandas as pd

### Load file

In [208]:
# Read the raw insurance-cost CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('../data/raw/medical_insurance_cost.csv')

# Preview the first rows to confirm the columns were parsed as expected.
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### EDA steps

In [209]:
# Column dtypes and non-null counts: shows which columns are text (object) and whether any values are missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [210]:
# (number of rows, number of columns) of the raw frame.
data.shape

(1338, 7)

In [211]:
# Distinct ages in ascending order, to check the covered range and spot implausible values.
sorted(data['age'].unique())

[18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64]

In [212]:
# Distinct region labels, listed in order of first appearance in the data.
data['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [213]:
# Number of rows for each distinct value of `children`, most frequent first.
data['children'].value_counts()

children
0    574
1    324
2    240
3    157
4     25
5     18
Name: count, dtype: int64

In [214]:
# Remove fully duplicated rows; ignore_index=True renumbers the remaining rows 0..n-1 in the same call.
data = data.drop_duplicates(ignore_index=True)
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [215]:
# Count missing values per column (all zeros means no imputation is needed).
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

#### Factorize the categorical columns and normalize all values to a 0-1 scale for better interpretation

In [216]:
from sklearn.preprocessing import MinMaxScaler

# Integer-encode each text column into a new numeric column used later on.
# pd.factorize numbers the categories in order of first appearance, so the
# code given to a label depends on the row order of the data.
encoded_names = {'sex': 'sex_n', 'smoker': 'smoker_n', 'region': 'region_n'}
for raw_column, encoded_column in encoded_names.items():
    data[encoded_column] = pd.factorize(data[raw_column])[0]

features = ['age', 'sex_n', 'bmi', 'children', 'smoker_n', 'region_n', 'charges']

# Bring every listed column (the target `charges` included) onto the same scale.
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/test split done further down, so test rows influence the scaling
# bounds — confirm this is acceptable.
scaler = MinMaxScaler()  # kept in a variable so it can be reused later
features_scaled = scaler.fit_transform(data[features])  # learn the min/max per column and rescale in one step

# Wrap the scaled array in a DataFrame again, reusing the original index and the feature names.
data_scaled = pd.DataFrame(data=features_scaled, columns=features, index=data.index)
data_scaled.head()

Unnamed: 0,age,sex_n,bmi,children,smoker_n,region_n,charges
0,0.021739,0.0,0.321227,0.0,0.0,0.0,0.251611
1,0.0,1.0,0.47915,0.2,1.0,0.333333,0.009636
2,0.217391,1.0,0.458434,0.6,1.0,0.333333,0.053115
3,0.326087,1.0,0.181464,0.0,1.0,0.666667,0.33301
4,0.304348,1.0,0.347592,0.0,1.0,0.666667,0.043816


## Feature Selection

1. We choose our features and our target and store them as X and y
2. We then split the data into a training set and a test set, typically keeping 20% for the test set
3. After splitting, we choose the most relevant features using SelectKBest
4. Since we want to save the data for later, we add the y_train and y_test values as a new column in the X_train_sel and X_test_sel dataframes
5. After feature selection we can initialize and train the model

In [217]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split

# The scaled `charges` column is the target; every other scaled column is a candidate feature.
y = data_scaled['charges']
X = data_scaled.drop(columns='charges')

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


#### Selection steps

In [218]:
# Keep only the columns SelectKBest ranks as most relevant: each feature is scored
# against the target with f_regression and the 4 best-scoring ones are retained.
# The selector is fitted on the training split only.
selection_model = SelectKBest(score_func=f_regression, k=4).fit(X_train, y_train)

# get_support() is a boolean mask over the input columns; use it to recover the kept column names.
selected_columns = X_train.columns[selection_model.get_support()]

# Rebuild DataFrames from the transformed data with the kept column names.
# No index is passed, so both frames get a fresh 0..n-1 index.
X_train_sel = pd.DataFrame(selection_model.transform(X_train), columns=selected_columns)
X_test_sel = pd.DataFrame(selection_model.transform(X_test), columns=selected_columns)

X_train_sel.head()

Unnamed: 0,age,bmi,children,smoker_n
0,0.108696,0.230024,0.0,1.0
1,0.065217,0.26325,0.4,1.0
2,0.73913,0.580172,0.4,1.0
3,0.978261,0.686306,0.0,1.0
4,0.630435,0.286252,0.4,1.0


We now have the 4 best features from our X_train set

In [219]:
# Preview the test features after selection; the columns should match those of X_train_sel.
X_test_sel.head()

Unnamed: 0,age,bmi,children,smoker_n
0,0.673913,0.176352,0.0,1.0
1,0.23913,0.259349,0.8,1.0
2,0.717391,0.549502,0.6,1.0
3,0.282609,0.49583,0.6,0.0
4,0.282609,0.603444,0.4,1.0


X_test now contains the same relevant features

##### Creating a new dataset after splitting and feature selection

We add a column with the target values to the dataframes that hold our most relevant features, so that we have a new, clean dataframe for each of the train and test sets.
Keeping only the selected features reduces the dimensionality of the feature space, which can make the data easier to visualize, understand, and analyze, and can also improve model performance.

In [220]:
# Append the target as a `charges` column so each split is a single self-contained table.
# The plain array (not the Series) is assigned so values are attached by position:
# the selected-feature frames have a fresh 0..n-1 index, while y_train / y_test
# still carry the row labels from before the split.
X_train_sel['charges'] = y_train.to_numpy()
X_test_sel['charges'] = y_test.to_numpy()

In [221]:
from pathlib import Path

# Persist the processed train/test tables (selected features + `charges`).
# to_csv does not create missing folders, so make sure the output directory
# exists first; otherwise a fresh checkout without data/processed fails here.
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# index=False: the 0..n-1 index carries no information and would come back as an extra column on reload.
X_train_sel.to_csv(processed_dir / 'data_train.csv', index=False)
X_test_sel.to_csv(processed_dir / 'data_test.csv', index=False)

# Linear Regression model

In [222]:
# Reload the processed splits from disk so the modelling section works from the saved files.
data_train = pd.read_csv('../data/processed/data_train.csv')
data_test = pd.read_csv('../data/processed/data_test.csv')

In [223]:
# Sanity-check the reloaded training table: four selected features plus the `charges` target.
data_train.head()

Unnamed: 0,age,bmi,children,smoker_n,charges
0,0.108696,0.230024,0.0,1.0,0.020339
1,0.065217,0.26325,0.4,1.0,0.034446
2,0.73913,0.580172,0.4,1.0,0.516374
3,0.978261,0.686306,0.0,1.0,0.19607
4,0.630435,0.286252,0.4,1.0,0.137177


In [224]:
# Separate the target column from the predictors for each reloaded split.
y_train = data_train['charges']
X_train = data_train.drop(columns=['charges'])

y_test = data_test['charges']
X_test = data_test.drop(columns=['charges'])

In [225]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split (fit() returns the fitted estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Show the fitted estimator as the cell output.
model

#### The third step, also very important: predicting on our X_test

In [226]:
# Predict the target for the held-out rows. The model was trained on the
# min-max-scaled `charges`, so predictions are on that scaled scale and are not
# clipped to [0, 1] (slightly negative values can occur).
y_pred = model.predict(X_test)
y_pred

array([ 0.10702315,  0.07685187,  0.2083241 ,  0.48685684,  0.12995459,
        0.19726353,  0.45755859,  0.00832554,  0.1556988 ,  0.16287349,
        0.14781084,  0.51037008,  0.4699569 ,  0.25944432,  0.14442757,
        0.13842465,  0.05134994,  0.49351746,  0.03848664,  0.0705325 ,
        0.04276279,  0.45855586,  0.22245537,  0.47182868,  0.47460549,
        0.06698449,  0.55116209,  0.5686984 ,  0.15588314,  0.20127518,
        0.0770878 ,  0.19051001, -0.00377298,  0.17967676,  0.61602286,
        0.18050297,  0.06028104,  0.0447116 ,  0.48287007,  0.13064786,
        0.08387671,  0.45610213,  0.54228697,  0.17124485,  0.09976616,
        0.03950023,  0.07028081,  0.12772953,  0.04874519,  0.13229562,
        0.09175793,  0.1653832 ,  0.47297483,  0.04437078,  0.16108797,
        0.14414371,  0.14902535,  0.02014774,  0.48426463,  0.13172362,
        0.2337337 ,  0.11691   ,  0.18103216,  0.00130779,  0.25281818,
        0.14612565,  0.14268127,  0.47671561,  0.38076578,  0.23

In [227]:
from sklearn.metrics import mean_squared_error, r2_score

# Evaluate on the held-out split.
# The target was min-max scaled together with the features, so the MSE is
# expressed in squared units of the scaled `charges`, not in the original units.
print('Mean squared error:', mean_squared_error(y_test, y_pred))
# R^2: share of the variance of the target explained by the model (1.0 is a perfect fit).
print('Coefficient of determination:', r2_score(y_test, y_pred))


Mean squared error: 0.009150574110332004
coeeficient of determination; 0.8045531086669286


### The coefficients and intercept

In [228]:
# Coefficients follow the column order of X_train: age, bmi, children, smoker_n.
# All inputs and the target are min-max scaled, so each coefficient is the change
# in scaled `charges` when that feature moves across its full 0-1 range.
# Gotcha: smoker_n was produced by pd.factorize, and the first row of the data is
# a smoker, so 'yes' is encoded as 0 and 'no' as 1 — a negative smoker_n
# coefficient therefore means non-smokers have lower predicted charges.
print('Coefficients:', model.coef_)
print('Intercept:', model.intercept_)

Coefficients: [ 0.1829699   0.18111814  0.04293572 -0.36780589]
Intercept: 0.3195827183308946
