# Section 1  Gettign the Data Ready 

#### Three main things to perform 
####     1. Split the data into features and labels (usually 'X' and 'y')
####     2. Filling (imputing) or disregarding missing values
####     3. Converting non-numerical values to numerical values (feature encoding)

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline 

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt


plt.style.use('seaborn')

In [3]:
heart_disease = pd.read_csv('data/heart-disease.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


### Task 1 : splitting intop features and label

In [4]:
X = heart_disease.drop('target' , axis=1)
y = heart_disease['target']
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [5]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [6]:
# Splitting features and labels into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,
                                                    test_size=0.2
                                                   )
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

### 2 :  Converting non-numerical to numerical (feature encoding) 

In [7]:
car_sales = pd.read_csv('data/car-sales-extended.csv')
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [8]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [9]:
car_sales.describe()

Unnamed: 0,Odometer (KM),Doors,Price
count,1000.0,1000.0,1000.0
mean,131073.708,4.014,16045.665
std,68859.723885,0.379405,8630.794219
min,10148.0,3.0,2796.0
25%,71238.0,4.0,9481.5
50%,131202.0,4.0,14264.0
75%,192372.75,4.0,20738.75
max,249860.0,5.0,52458.0


In [10]:
X = car_sales.drop('Price', axis=1)
y = car_sales['Price']

# split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
from sklearn.ensemble import RandomForestRegressor

model =  RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

ValueError: could not convert string to float: 'Honda'

In [12]:
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [13]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

category_list = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', 
                                 one_hot, category_list), ],
                               remainder='passthrough')
transformed_X = transformer.fit_transform(X)
# checking transformation 
pd.DataFrame(transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [14]:
# building another model 
np.random.seed(42)
X_train, X_test, y_train, y_test =  train_test_split(transformed_X, y, test_size=0.2)

model = RandomForestRegressor(n_estimators=100).fit(X_train, y_train)
model.score(X_test, y_test)

0.3235867221569877

###  3 : Filling and Dealing with Missing values

In [15]:
car_sales_missing = pd.read_csv('data/car-sales-extended-missing-data.csv')
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [16]:
car_sales_missing.describe()

Unnamed: 0,Odometer (KM),Doors,Price
count,950.0,950.0,950.0
mean,131253.237895,4.011579,16042.814737
std,69094.857187,0.382539,8581.695036
min,10148.0,3.0,2796.0
25%,70391.25,4.0,9529.25
50%,131821.0,4.0,14297.0
75%,192668.5,4.0,20806.25
max,249860.0,5.0,52458.0


In [17]:
car_sales_missing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


In [18]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [19]:
# method one 
for column in car_sales_missing.columns:
    if car_sales_missing[column].dtype == 'object':
        car_sales_missing[column] = car_sales_missing[column].fillna('missing')
    if car_sales_missing[column].dtype == 'float':
        car_sales_missing[column] = car_sales_missing[column].fillna(car_sales_missing[column].mean())
    car_sales_missing.dropna(inplace=True)
car_sales_missing.isnull().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [20]:
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,missing,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [21]:
category_features = ['Make', 'Colour', "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', 
                                 one_hot, category_features)], 
                               remainder='passthrough')
transformed_X = transformer.fit_transform(car_sales_missing)
pd.DataFrame(transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0,15323.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0,19943.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0,28343.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0,13434.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0,14043.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0,32042.0
812,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0,5716.0
813,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0,31570.0
814,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0,4001.0


#  

### Section 1.2 : Filling missing values with scikit-learn

In [22]:
car_sales_missing = pd.read_csv('data/car-sales-extended-missing-data.csv')
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [23]:
# check for missing values
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [24]:
# drop rows with missing labels
car_sales_missing.dropna(subset=['Price'], inplace=True)
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [25]:
# split into X & y
X = car_sales_missing.drop('Price', axis=1)
y =  car_sales_missing['Price']

In [26]:
# Fill missing values with scikit-learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# fill categorical value with 'missing' and numerical with it's mean()
cat_fill = SimpleImputer(fill_value='missing', strategy='constant')
door_fill = SimpleImputer(fill_value=4, strategy='constant')
num_fill = SimpleImputer(strategy='mean')

# define columns
cat_columns = ['Make', 'Colour']
door_column = ['Doors']
num_column = ['Odometer (KM)']

# create an imputer (something that fills missing data)
imputers = ColumnTransformer([
    ('cat_features', cat_fill, cat_columns),
    ('door_features', door_fill, door_column),
    ('num_features', num_fill, num_column),
])

# transform the data
filled_X = imputers.fit_transform(X)
filled_X

array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [27]:
# make a new dataframe from the filled data
car_sales_filled = pd.DataFrame(filled_X, columns=['Make', 'Colour', 'Doors', 'Odometer (KM)'])
car_sales_filled.head(10)

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4,35431
1,BMW,Blue,5,192714
2,Honda,White,4,84714
3,Toyota,White,4,154365
4,Nissan,Blue,3,181577
5,Honda,Red,4,42652
6,Toyota,Blue,4,163453
7,Honda,White,4,130987
8,missing,White,4,130538
9,Honda,Blue,4,51029


In [28]:
# check for missing values in the new dataset
car_sales_filled.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

### Section 1. 3 :  Converting non-numerical to numerical (feature encoding) 

In [29]:
# converting non-numerical variables to numerical values

one_hot = OneHotEncoder() 

# columns to one_end encode
cat_columns = ['Make', "Colour", 'Doors']

# transform the columns 
transformer = ColumnTransformer([
    ('one_hot', one_hot, cat_columns),
], remainder='passthrough')

car_sales_transformed = transformer.fit_transform(car_sales_filled)
car_sales_transformed

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [30]:
# splitting into training and test sets 
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(car_sales_transformed, y,
                                                   test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((760, 15), (190, 15), (760,), (190,))

In [31]:
# training a model with five estimators
score = []
for estimators in range(1, 100, 25):
    model = RandomForestRegressor(n_estimators=estimators).fit(X_train, y_train)
    scores = model.score(X_test, y_test)
    score.append(scores)

print(score)

[-0.6388782608951278, 0.2019516493779656, 0.22586663323867517, 0.20765777221897586]


#  

# Task 2 : Choosing a estimator/algorithm for our problem
Check the sklearn-machine-learning map... https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

## Solving a regression problem using the boston dataset

In [32]:
from sklearn.datasets import load_boston
boston =  load_boston()

# turning into a dataset 
boston.keys()
boston_df = pd.DataFrame(boston['data'], columns=boston['feature_names'])
boston_df['target'] = pd.Series(boston['target']) # adding a target column

boston_df.head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.1,18.9


In [33]:
len(boston_df)

506

In [34]:
# Let's try the Rigde Regression model
from sklearn.linear_model import Ridge

# Setup random seed
np.random.seed(42)

# create data
X = boston_df.drop('target', axis=1)
y = boston_df['target']

# create train and test set 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=42)

# Instantiate and fit  the model on
model = Ridge().fit(X_train, y_train)

# Check the score 
model.score(X_test, y_test)


0.7200369663975205

Improve the model using another estimator ...https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
    

In [35]:
# Using the RandomForestRegressor model
from sklearn.ensemble import RandomForestRegressor
np.random.seed(42)

X = boston_df.drop('target', axis=1)
y = boston_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestRegressor().fit(X_train, y_train)
model.score(X_test, y_test)

0.873969014117403

# Choosing an estimator for a Classification Problem
using the sklearn estimator map ...https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [36]:
heart_disease = pd.read_csv('data/heart-disease.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [37]:
len(heart_disease)

303

In [38]:
# using linearsvc estimator class

from sklearn.svm import LinearSVC
np.random.seed(42)
X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = LinearSVC(max_iter=1000).fit(X_train, y_train)
clf.score(X_test, y_test)



0.8688524590163934

In [39]:
#using ensemble classifier
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)
X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf =  RandomForestClassifier().fit(X_train, y_train)
clf.score(X_test, y_test)

0.8524590163934426