<a href="https://colab.research.google.com/github/vjbarayuga/Week5-Introduction-To-Machine-Learning/blob/main/Pre_ProcessingExercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# Mount Google Drive at /content/drive (Colab only) so that files stored on
# Drive, such as the dataset loaded below, are reachable through a local path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt

In [15]:
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer


In [16]:
# Location of the insurance dataset on the mounted Drive. The literal is split
# across two lines for readability; Python joins adjacent string literals.
filename = (
    '/content/drive/MyDrive/Coding_DOJO-Data_Science_Bootcamp/'
    '03_Machine_Learning_Data_Science/Datasets/insurance.csv'
)
# Read the CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filename)
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [17]:
# Task: predict `charges`, a continuous float target, so this is a regression
# problem (not classification).
# Features are every column except the target; the target is `charges` itself.
X = df.drop(columns='charges')
y = df['charges']
# Hold out a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [18]:
# Confirm the container type of the target `y`
# (selecting a single column from a DataFrame yields a pandas Series).
type(y)

pandas.core.series.Series

In [19]:
# Confirm the container type of the feature matrix `X`
# (dropping a column from a DataFrame still yields a DataFrame).
type(X)

pandas.core.frame.DataFrame

In [20]:
# Peek at the first five target values
y.head(5)

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [21]:
# Preview the first rows of the feature matrix to confirm `charges` is no longer a column
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


In [22]:
# Row count of the training features
len(X_train)

1003

In [23]:
# Row count of the held-out test features
len(X_test)

335

In [24]:
# Row count of the training target; should equal len(X_train)
len(y_train)

1003

In [25]:
# Row count of the test target; should equal len(X_test)
len(y_test)

335

In [26]:
# Count NaN cells across the whole frame (per-column sums, then summed again)
n_missing = df.isna().sum().sum()
print(n_missing, 'missing values')

0 missing values


In [27]:
# Build dtype-based column selectors: one for numeric features and one for
# object-dtype (string/categorical) features
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

# Apply each selector to the training features to get the matching column names
num_columns, cat_columns = num_selector(X_train), cat_selector(X_train)

# Show both lists
print('numeric columns are', num_columns)
print('categorical columns are', cat_columns)

numeric columns are ['age', 'bmi', 'children']
categorical columns are ['sex', 'smoker', 'region']


In [28]:
# Numeric feature columns of the full dataset
df_num = df[num_columns]
# Boolean mask over columns: True where a column holds at least one NaN
has_nan = df_num.isna().any()
# Keep only the flagged columns; an empty selection means no numeric column has gaps
df_num.loc[:, has_nan]

0
1
2
3
4
...
1333
1334
1335
1336
1337


In [29]:
# Median imputer for the numeric features. Fitting on the training rows only
# means the fill values are the training-set column medians (`fit` returns
# the fitted imputer, so construction and fitting can be chained).
median_imputer = SimpleImputer(strategy='median').fit(X_train[num_columns])
# Fill the numeric columns of both splits with those training medians; the
# test set is transformed only, never used to compute the statistics.
for split in (X_train, X_test):
    split.loc[:, num_columns] = median_imputer.transform(split[num_columns])

In [30]:
# Most-frequent-value imputer for the categorical features, fitted on the
# training rows so each column's fill value is its training-set mode.
freq_imputer = SimpleImputer(strategy='most_frequent').fit(X_train[cat_columns])
# Apply the training modes to the categorical columns of both splits.
for split in (X_train, X_test):
    split.loc[:, cat_columns] = freq_imputer.transform(split[cat_columns])

In [31]:
# Sanity check: the training features should contain no NaN after imputation
remaining_nan = X_train.isna().sum().sum()
print(remaining_nan, 'missing values')
# Summarise column dtypes and non-null counts
X_train.info()

0 missing values
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1003 entries, 693 to 1126
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1003 non-null   float64
 1   sex       1003 non-null   object 
 2   bmi       1003 non-null   float64
 3   children  1003 non-null   float64
 4   smoker    1003 non-null   object 
 5   region    1003 non-null   object 
dtypes: float64(3), object(3)
memory usage: 54.9+ KB


In [32]:
# Re-create the categorical selector (object-dtype columns). This is the same
# construction as the `cat_selector` built earlier; no numeric selector is made here.
cat_selector = make_column_selector(dtype_include='object')


In [33]:
# Names of the object-dtype (categorical) columns in the training features
cat_names = cat_selector(X_train)
# Subset the training features to those columns and display the result
cat_data = X_train[cat_names]
cat_data

Unnamed: 0,sex,smoker,region
693,male,no,northwest
1297,female,no,southeast
634,male,no,southwest
1022,male,yes,southeast
178,female,no,southwest
...,...,...,...
1095,female,no,northeast
1130,female,no,southeast
1294,male,no,northeast
860,female,yes,southwest


In [34]:
from sklearn.preprocessing import OneHotEncoder
# Instantiate a one-hot encoder that returns a dense array and ignores
# categories not seen during fit (handle_unknown='ignore').
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old spelling, so try the new keyword first and fall back to the
# old one on releases that do not know it (those raise TypeError).
try:
    ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
# Learn the category levels from the categorical training data, then encode it
ohe_encoder.fit(cat_data)
cat_ohe = ohe_encoder.transform(cat_data)
cat_ohe

array([[0., 1., 1., ..., 1., 0., 0.],
       [1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 0., 1.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 1., 1., ..., 0., 0., 1.]])

In [35]:
# Convert the encoded array to a DataFrame whose column names are generated by
# the encoder, prefixed with the original column names (e.g. `sex_female`).
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out` is its replacement. Fall back to the old method only
# on releases that predate the new one.
if hasattr(ohe_encoder, 'get_feature_names_out'):
    ohe_columns = ohe_encoder.get_feature_names_out(cat_data.columns)
else:
    ohe_columns = ohe_encoder.get_feature_names(cat_data.columns)
# NOTE: the resulting frame gets a fresh 0..n-1 index rather than the index of
# `cat_data`; pass index=cat_data.index if it must align with X_train later.
pd.DataFrame(cat_ohe, columns=ohe_columns)



Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
998,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
999,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1000,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1001,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
