### Machine Learning Approach

In [None]:
# machine learning approach
# a great tool to analyze data, find hidden patterns and relationships,
# and extract information to enable information-driven decisions and provide insights

# terminologies 
# 1. Observations - Records, Samples, Examples
# 2. Features - Inputs or Attributes that define a given dataset
# 3. Response - Label, Outcome, Target

# Problem to Solve or Dataset to analyze - Machine Learning
# 1. Understand the problem/dataset
# 2. Extract the features from the dataset
# 3. Identify problem type
# 4. Choose the appropriate model
# 5. Train and Test the model
# 6. Strive for accuracy - by fine-tuning the parameters

### Example

In [None]:
# Features(Attributes)    Education      ProfessionalTraining(Yes/No)      Hourly Rate
# Observations(records)   16             1                                 90          Response
#                         15             0                                 65          (label)

### Supervised and Unsupervised

In [None]:
# Supervised or Unsupervised - Problem Type
# 1. Supervised Learning
    # the dataset used to train should have features, observations and responses.
    # then the model is trained to predict the right response/outcome for a given set of data points
    # 'Generalize' from the provided dataset so that a 'general rule' can be applied to new data as well.
# 2. Unsupervised Learning
    # Response of data or outcome is unknown
    # Unsupervised learning models are used to identify and visualize patterns in data by grouping similar types of data.
    # The goal of this model is to represent data in a way that meaningful information can be extracted.

In [None]:
# Problem Types
# 1. Supervised learning -
# Data Type -       a. Continuous, measured information that has an infinite number of possible values
#                      example: Age, Salary, Price, Temperature.
#                   b. Categorical, grouped and labeled data
#                      example: Gender, Face Recognition, Spam-Detection
# Problem types -   a. Regression, b. Classifications
# 2. Unsupervised Learning -
# Data Type -       a. Continuous
#                   b. Categorical
# Problem types -   a. Dimensionality Reduction - reduce the number of features while retaining as much information as possible
#                   b. Clustering categorical data - group similar datapoints
#                   Used to identify data patterns

# Supervised - categories of news based on the topics
# unsupervised - grouping of similar stories on different news networks

# The regression algorithm belonging to the
# supervised learning model is best suited to analyze continuous data.

# The goal of unsupervised learning is to understand the structure of the data and represent it.
# There is no right or certain answer in unsupervised learning.

### Scikit Learn

In [None]:
# Scikit learn - problem solution approach
# 1. Model Selection, ML algo based on dataset type
# 2. Estimator Object, scikit learn model instantiation, The estimator instance or object is a model.
# 3. Model Training
# 4. Predictions
# 5. Model tuning through multiple iterations and result observations
# 6. Accuracy

# Follow these five steps
# 1. Create Separate objects for feature and response
# 2. ensure that features and response have only numeric values.
# 3. Features and response should be in the form of a NumPy ndarray
# 4. Features and response should have specific shapes and sizes
# 5. Features always mapped as x, and response always as y.

### Linear Regression

In [None]:
# Supervised learning model used to analyze continuous data.

#### Simple linear equation:     y = mx + c  
#### Linear regression equation:     y = B0 + B1x   
#### response = intercept + coefficient of x * input features
##### coefficient of x also called slope parameter; 
##### equal to the change in response variable, given a one unit change in predictor variable

![image.png](attachment:image.png)

![image-2.png](attachment:image-2.png)

#### estimator object instantiation
#### sklearn.linear_model.LinearRegression(fit_intercept=True,normalize=False, copy_X=True, n_jobs = 1)

![image.png](attachment:image.png)

In [1]:
# import necessary libraries
import numpy as np
import pandas as pd

In [2]:
# Load the Boston house-prices dataset that ships with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the installed version still provides it before re-running.
from sklearn.datasets import load_boston
boston_dataset = load_boston() # dataset object; later cells read DESCR, feature_names, data and target from it

In [7]:
# Show the description text bundled with the dataset
print(boston_dataset.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [9]:
# List the feature (column) names provided with the dataset
print(boston_dataset.feature_names)

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [10]:
# Build a DataFrame from the feature matrix, labelling the columns with the
# dataset's feature names at construction so this cell yields a fully
# labelled frame on its own and gives the same result every time it is re-run.
df_boston = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)

In [11]:
# Label the DataFrame columns with the dataset's feature names
df_boston.columns = boston_dataset['feature_names']

In [12]:
# Preview the first five observations
df_boston.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [14]:
# Dimensions of the feature table as (observations, features):
# 506 rows (observations/records) by 13 columns (features/attributes)
print((len(df_boston.index), len(df_boston.columns)))

(506, 13)


In [16]:
# Shape of the response (label) array
print(boston_dataset['target'].shape)

(506,)


In [17]:
# Print every target (response) value
# NOTE(review): this dumps all 506 values; consider slicing (e.g. [:10]) to keep the output short
print(boston_dataset.target)

[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4
 18.2 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8
 18.4 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4
 24.7 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9
 24.2 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9
 23.9 26.6 22.5 22.2 23.6 28.7 22.6 22.  22.9 25.  20.6 28.4 21.4 38.7
 43.8 33.2 27.5 26.5 18.6 19.3 20.1 19.5 19.5 20.4 19.8 19.4 21.7 22.8
 18.8 18.7 18.5 18.3 21.2 19.2 20.4 19.3 22.  20.3 20.5 17.3 18.8 21.4
 15.7 16.2 18.  14.3 19.2 19.6 23.  18.4 15.6 18.1 17.4 17.1 13.3 17.8
 14.  14.4 13.4 15.6 11.8 13.8 15.6 14.6 17.8 15.4 21.5 19.6 15.3 19.4
 17.  15.6 13.1 41.3 24.3 23.3 27.  50.  50.  50.  22.7 25.  50.  23.8
 23.8 22.3 17.4 19.1 23.1 23.6 22.6 29.4 23.2 24.6 29.9 37.2 39.8 36.2
 37.9 32.5 26.4 29.6 50.  32.  29.8 34.9 37.  30.5 36.4 31.1 29.1 50.
 33.3 3