<a href="https://colab.research.google.com/github/vishalgimhan/MachineLearningProjects/blob/main/Machine_Learning_Course_Kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#First, Data Exploration

In [2]:
#Pandas - to explore and manipulate data
import pandas as pd

In [3]:
#Data - Home Prices in Melbourne, Australia

# save filepath to variable for easier access
melbourne_file_path = "/content/drive/MyDrive/Datasets/melb_data.csv"

# read the data and store data in DataFrame titled melbourne_data
melbourne_data = pd.read_csv(melbourne_file_path)

# print a summary of the data in Melbourne data
melbourne_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [4]:
# count, shows how many rows have non-missing values.
# mean, which is the average
# std is the standard deviation, which measures how numerically spread out the values are.
# min, 25%, 50%, 75% and max values, imagine sorting each column from lowest to highest value.

# First Machine Learning Model

In [5]:
# Selecting Data for Modeling

#We'll start by picking a few variables using our intuition

melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [6]:
# The Melbourne data has some missing values
# We will take the simplest option for now, and drop houses from our data.
melbourne_data = melbourne_data.dropna(axis=0)

In [7]:
#Select the Prediction Target
y = melbourne_data.Price

In [8]:
# Choosing "Features"
# The columns that are inputted into our model (and later used to make predictions) are called "features."
# In our case, those would be the columns used to determine the home price.
# Sometimes, you will use all columns except the target as features.
# Other times you'll be better off with fewer features.

melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]

In [9]:
#Review the data we'll be using to predict
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,141.568645,1964.081988,-37.807904,144.990201
std,0.971079,0.711362,897.449881,90.834824,38.105673,0.07585,0.099165
min,1.0,1.0,0.0,0.0,1196.0,-38.16492,144.54237
25%,2.0,1.0,152.0,91.0,1940.0,-37.855438,144.926198
50%,3.0,1.0,373.0,124.0,1970.0,-37.80225,144.9958
75%,4.0,2.0,628.0,170.0,2000.0,-37.7582,145.0527
max,8.0,8.0,37000.0,3112.0,2018.0,-37.45709,145.52635


In [10]:
# Review the top few rows
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude
1,2,1.0,156.0,79.0,1900.0,-37.8079,144.9934
2,3,2.0,134.0,150.0,1900.0,-37.8093,144.9944
4,4,1.0,120.0,142.0,2014.0,-37.8072,144.9941
6,3,2.0,245.0,210.0,1910.0,-37.8024,144.9993
7,2,1.0,256.0,107.0,1890.0,-37.806,144.9954


#Building Your Model
The steps to building and using a model are:

**Define:** What type of model will it be? A decision tree? Some other type of model? Some other parameters of the model type are specified too.

**Fit**: Capture patterns from provided data. This is the heart of modeling.

**Predict**: Just what it sounds like

**Evaluate**: Determine how accurate the model's predictions are.

In [11]:
#Building the Model
from sklearn.tree import DecisionTreeRegressor

# Define model. Specify a number for random_state to ensure same results each run
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit model
melbourne_model.fit(X, y)

In [12]:
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
print(melbourne_model.predict(X.head()))

Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  BuildingArea  YearBuilt  Lattitude  Longtitude
1      2       1.0     156.0          79.0     1900.0   -37.8079    144.9934
2      3       2.0     134.0         150.0     1900.0   -37.8093    144.9944
4      4       1.0     120.0         142.0     2014.0   -37.8072    144.9941
6      3       2.0     245.0         210.0     1910.0   -37.8024    144.9993
7      2       1.0     256.0         107.0     1890.0   -37.8060    144.9954
The predictions are
[1035000. 1465000. 1600000. 1876000. 1636000.]


#Model Validation

In [13]:
# You've built a model. But how good is it?
# Measure the performance of your model, so you can test and compare alternatives.

# he relevant measure of model quality is predictive accuracy. In other words,
# will the model's predictions be close to what actually happens.

In [14]:
# There are many metrics for summarizing model quality, but we'll start with one called
# Mean Absolute Error (also called MAE).

# The prediction error for each house is:
# error = actual − predicted

#Once we have a model, here is how we calculate the mean absolute error:
from sklearn.metrics import mean_absolute_error

predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y, predicted_home_prices)

434.71594577146544

#The Problem with "In-Sample" Scores
Since models' practical value come from making predictions on new data, we measure performance on data that wasn't used to build the model.

In [21]:
#a single "sample" of houses for both building the model and evaluating it. this is bad.
from sklearn.model_selection import train_test_split

# split data into training and validation data, for both features and target
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
# Define model
melbourne_model = DecisionTreeRegressor(random_state = 1)
# Fit model
melbourne_model.fit(train_X, train_y)

# get predicted prices on validation data
val_predictions = melbourne_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

262494.3027759845


In [16]:
# Your mean absolute error for the in-sample data was about 500 dollars. Out-of-sample it is more than 250,000 dollars.

#Underfitting and Overfitting


In [17]:
#Experimenting With Different Models

#Overfitting,
#where a model matches the training data almost perfectly, but does poorly in validation and other new data.

#Underfitting
#When a model fails to capture important distinctions and patterns in the data, so it performs poorly even in training data.

#max_leaf_nodes argument provides a very sensible way to control overfitting vs underfitting.
#The more leaves we allow the model to make, the more we move from the underfitting area in the above graph to the overfitting area.

#We can use a utility function to help compare MAE scores from different values for max_leaf_nodes:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

In [18]:
# compare MAE with differing values of max_leaf_nodes
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

#choose one with lowest error

Max leaf nodes: 5  		 Mean Absolute Error:  324110
Max leaf nodes: 50  		 Mean Absolute Error:  252108
Max leaf nodes: 500  		 Mean Absolute Error:  239204
Max leaf nodes: 5000  		 Mean Absolute Error:  249358


Of the options listed, 500 is the optimal number of leaves.

In [19]:
# Store the best value of max_leaf_nodes (it will be either 5, 50, 500, 5000)
scores = {leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in [5, 50, 500, 5000]}
best_tree_size = min(scores, key=scores.get)

# Fill in argument to make optimal size
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)

# fit the final model
final_model.fit(X,y)

#Random Forest

Decision trees leave you with a difficult decision. A deep tree with lots of leaves will overfit because each prediction is coming from historical data from only the few houses at its leaf. But a shallow tree with few leaves will perform poorly because it fails to capture as many distinctions in the raw data.

The random forest uses many trees, and it makes a prediction by averaging the predictions of each component tree.

In [22]:
from sklearn.ensemble import RandomForestRegressor

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
melb_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, melb_preds))

191669.7536453626


There is likely room for further improvement, but this is a big improvement over the best decision tree error of 250,000.

#Exercise

In [23]:
#Data - House Prices in Iowa

#Step 1: Loading the data
import pandas as pd

# Path of the file to read
iowa_file_path = '/content/drive/MyDrive/Datasets/home-data-for-ml-course.csv'

# Read the file into a variable home_data
home_data = pd.read_csv(iowa_file_path)


In [24]:
# Step 2: Review the Data

# Print summary statistics in next line
home_data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [25]:
# Specify Prediction Target

# print the list of columns in the dataset to find the name of the prediction target
home_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [26]:
y = home_data.SalePrice

In [27]:
#Step 2: Create X
# Create the list of features below
feature_names = ['LotArea','YearBuilt','1stFlrSF','2ndFlrSF','FullBath','BedroomAbvGr','TotRmsAbvGrd']

# Select data corresponding to features in feature_names
X = home_data[feature_names]

# Review data
# print description or statistics from X
print(X.describe())

# print the top few lines
print(X.head())

             LotArea    YearBuilt     1stFlrSF     2ndFlrSF     FullBath  \
count    1460.000000  1460.000000  1460.000000  1460.000000  1460.000000   
mean    10516.828082  1971.267808  1162.626712   346.992466     1.565068   
std      9981.264932    30.202904   386.587738   436.528436     0.550916   
min      1300.000000  1872.000000   334.000000     0.000000     0.000000   
25%      7553.500000  1954.000000   882.000000     0.000000     1.000000   
50%      9478.500000  1973.000000  1087.000000     0.000000     2.000000   
75%     11601.500000  2000.000000  1391.250000   728.000000     2.000000   
max    215245.000000  2010.000000  4692.000000  2065.000000     3.000000   

       BedroomAbvGr  TotRmsAbvGrd  
count   1460.000000   1460.000000  
mean       2.866438      6.517808  
std        0.815778      1.625393  
min        0.000000      2.000000  
25%        2.000000      5.000000  
50%        3.000000      6.000000  
75%        3.000000      7.000000  
max        8.000000     14.

In [28]:
#Step 3: Specify and Fit Model
from sklearn.tree import DecisionTreeRegressor

#specify the model.
#For model reproducibility, set a numeric value for random_state when specifying the model
iowa_model = DecisionTreeRegressor(random_state=1)

# Fit the model
iowa_model.fit(X,y)

In [29]:
#Step 4: Make Predictions
predictions = iowa_model.predict(X)
print(predictions)

[208500. 181500. 223500. ... 266500. 142125. 147500.]


In [30]:
#Compare the top few predictions to the actual home values (in y) for those same homes.
print(y.head())
iowa_model.predict(X.head())

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64


array([208500., 181500., 223500., 140000., 250000.])

In [31]:
print("First in-sample predictions:", iowa_model.predict(X.head()))
print("Actual target values for those homes:", y.head().tolist())

First in-sample predictions: [208500. 181500. 223500. 140000. 250000.]
Actual target values for those homes: [208500, 181500, 223500, 140000, 250000]


In [32]:
#Step 1: Split Your Data
# Import the train_test_split function and uncomment
from sklearn.model_selection import train_test_split

# fill in and uncomment
train_X, val_X, train_y, val_y = train_test_split(X,y,random_state=1)

#Step 2: Specify and Fit the Model
# Specify the model
iowa_model = DecisionTreeRegressor(random_state=1)

# Fit iowa_model with the training data.
iowa_model.fit(train_X,train_y)

#Step 3: Make Predictions with Validation data
# Predict with all validation observations
val_predictions = iowa_model.predict(val_X)

# print the top few validation predictions
print(iowa_model.predict(val_X.head()))
# print the top few actual prices from validation data
print(y.head())

#What do you notice that is different from what you saw with in-sample predictions .

[186500. 184000. 130000.  92000. 164500.]
0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64


In [33]:
#Step 4: Calculate the Mean Absolute Error in Validation Data
from sklearn.metrics import mean_absolute_error
val_mae = mean_absolute_error(val_y,val_predictions)

print("Validation MAE: {:,.0f}".format(val_mae))

Validation MAE: 29,653


In [34]:
#Is that MAE good? There isn't a general rule for what values are good that applies across applications.
#But you'll see how to use (and improve) this number

In [35]:
#Finding the number of nodes with lowest error
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

In [36]:
#Step 1: Compare Different Tree Sizes

max_leaf_nodes = [5, 25, 50, 100, 250, 500]
# find the ideal tree size from candidate_max_leaf_nodes
for a in max_leaf_nodes:
    choosen_mae = get_mae(a, train_X, val_X, train_y, val_y)
    print("Max Leaf Node: %d \t\t Mean Absolute Error: %d" % (a, choosen_mae))

Max Leaf Node: 5 		 Mean Absolute Error: 35044
Max Leaf Node: 25 		 Mean Absolute Error: 29016
Max Leaf Node: 50 		 Mean Absolute Error: 27405
Max Leaf Node: 100 		 Mean Absolute Error: 27282
Max Leaf Node: 250 		 Mean Absolute Error: 27893
Max Leaf Node: 500 		 Mean Absolute Error: 29454


In [37]:
# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500)
scores = {leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in max_leaf_nodes}
best_tree_size = min(scores, key=scores.get)

print(best_tree_size)

100


In [38]:
#Step 2: Fit Model Using All Data
# Fill in argument to make optimal size and uncomment
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)

# fit the final model and uncomment the next two lines
final_model.fit(X,y)

In [39]:
#Random Forest

#Step 1: Use a Random Forest

from sklearn.ensemble import RandomForestRegressor

# Define the model. Set random_state to 1
rf_model = RandomForestRegressor(random_state=1)

# fit your model
rf_model.fit(train_X,train_y)

# Calculate the mean absolute error of your Random Forest model on the validation data
rf_val_mae = mean_absolute_error(val_y,rf_model.predict(val_X))

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))



Validation MAE for Random Forest Model: 21857.15912981083
