# Dragon Real Estate - Price Predictor

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from joblib import dump, load

In [2]:
# ---importing the dataset
housing=pd.read_csv('data.csv')

In [3]:
print(housing.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   

        B  LSTAT  MEDV  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
2  392.83   4.03  34.7  
3  394.63   2.94  33.4  
4  396.90   5.33  36.2  


In [4]:
# ----for giving the information we use info function
print(housing.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       501 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB
None


In [5]:
# ----for droping the columns we use drop function
# housing=housing.dropna(axis=1)
print(housing.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       501 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB
None


In [6]:
# ----Now checking the CHAS values in housing dataframe
print(housing['CHAS'].value_counts())

0    471
1     35
Name: CHAS, dtype: int64


In [7]:
# -----Now we chk all the description like min, max, per, mean, count
print(housing.describe())

             CRIM          ZN       INDUS        CHAS         NOX          RM  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  501.000000   
mean     3.613524   11.363636   11.136779    0.069170    0.554695    6.287711   
std      8.601545   23.322453    6.860353    0.253994    0.115878    0.705395   
min      0.006320    0.000000    0.460000    0.000000    0.385000    3.561000   
25%      0.082045    0.000000    5.190000    0.000000    0.449000    5.885000   
50%      0.256510    0.000000    9.690000    0.000000    0.538000    6.211000   
75%      3.677083   12.500000   18.100000    0.000000    0.624000    6.629000   
max     88.976200  100.000000   27.740000    1.000000    0.871000    8.780000   

              AGE         DIS         RAD         TAX     PTRATIO           B  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean    68.574901    3.795043    9.549407  408.237154   18.455534  356.674032   
std     28.148861    2.1057

In [8]:
# ----for plotting the graph
# %matplotlib inline
# housing.hist(bins=25, figsize=(25,25))
# plt.show() 

# Train-Test Splitting

In [9]:
# ----we creat a function which is splitting the train data(80%) and test data(20%) 
# ----this is just a learning purpose----we split the dataset using sklearn by simple one line
def split_train_test(data,test_ratio):
    np.random.seed(42)
    shuffled=np.random.permutation(len(data))
    test_set_size=int(len(data)* test_ratio)
    test_indices=shuffled[:test_set_size] 
    train_indices=shuffled[test_set_size:]
    return data.iloc[train_indices],data.iloc[test_indices]

In [10]:
#  -----Now splitting the datasets into train and test
train_set, test_set = split_train_test(housing,0.2)

In [11]:
# print(f"Rows in train dataset{len(train_set)}\nRows in test dataset {len(test_set)}")

In [12]:
#  ----Now we work splitting train and test data in sklearn
train_set, test_set =train_test_split(housing,test_size=0.2,random_state=42)
print(f"Train dataset size is {len(train_set)} \n Test dataset size is {len(test_set)}")

Train dataset size is 404 
 Test dataset size is 102


In [13]:
# -----if our one or more fetchers are categorical and it is shows in 0 or 1 form and 1 values are very short and then when we split test and train data may be possible that the 1 value to not gone in the test data or may be not gone train data so we splitting these fechers are equally destributed in train data or test data
# --So we use stratefy sampling function.
split=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index, test_index in split.split(housing,housing['CHAS']):
    strat_train_set = housing.loc[train_index]
    strat_test_set= housing.loc[test_index]

In [14]:
# print(strat_test_set)
# print(strat_test_set.describe())
# print(strat_test_set.info())
print(strat_test_set['CHAS'].value_counts())

0    95
1     7
Name: CHAS, dtype: int64


In [15]:
# print(strat_train_set)
# print(strat_train_set.describe())
# print(strat_train_set.info())
print(strat_train_set['CHAS'].value_counts())

0    376
1     28
Name: CHAS, dtype: int64


In [16]:
# -----After splitting the dataset into train and test we creat a copy of train dataset in housing
housing=strat_train_set.copy()

In [17]:
# -----Now we find out the correlation in dataset
# ----Correlation tells us that the price is increase by who many fetchers increase and price are decrese who many fetchers use it
corr_matrix=housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

MEDV       1.000000
RM         0.680351
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
LSTAT     -0.740494
Name: MEDV, dtype: float64

In [18]:
# -----Now Plotting the correlation graphs using pandas_plotting library
# attributes=['MEDV','RM', 'ZN', 'LSTAT']
# scatter_matrix(housing[attributes],figsize=(12,8))

In [19]:
# -----we plottiong the graph of RM and MEDV relation
# housing.plot(kind='scatter',x='RM',y='MEDV',alpha=0.8)

## Attribute Combinations

In [20]:
# ----Now a new attribute created as improve the ML model
# housing['TAXRM']=housing['TAX']/housing['RM']
# print(housing['TAXRM'])

In [21]:
corr_matrix=housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

MEDV       1.000000
RM         0.680351
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
LSTAT     -0.740494
Name: MEDV, dtype: float64

In [22]:
# ----Now plotting the scatter in TAXRM and MEDV fetchers
# housing.plot(kind='scatter',x='TAXRM',y='MEDV',alpha=0.8)

In [23]:
# ----Now we splits the features and lables
housing=strat_train_set.drop("MEDV",axis=1)
housing_labels = strat_train_set["MEDV"].copy()

# Missing Attributes Handles

In [24]:
# ----To take care of missing attributes, you have three options:
# ----1. Get rid of the missing data points
# ----2. Get rid of the whole attribute
# ----3. Set the value to some value(0, mean or median)

In [25]:
# -----option 1
# -----Note that the original housing dataframe will remain unchanged
a=housing.dropna(subset=["RM"])
print(a.shape)

(400, 13)


In [26]:
# -----option 2
# -----Note that the real housing dataframe will remain unchanged until we use inplace=True function
drp=housing.drop("RM",axis=1)
print(drp.shape)

(404, 12)


In [27]:
# ----compute median for  option 3
median=housing["RM"].median()
print(median)

6.2175


In [28]:
# ----fill the median values in RM column
# housing["RM"].fillna(median)

In [29]:
# ----before Imputing the values we chk that the describe of the housing dataset
housing.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,404.0,404.0,404.0,404.0,404.0,400.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0
mean,3.602814,10.836634,11.34495,0.069307,0.558064,6.282723,69.039851,3.74621,9.735149,412.341584,18.473267,353.392822,12.791609
std,8.099383,22.150636,6.877817,0.25429,0.116875,0.715947,28.258248,2.099057,8.731259,168.672623,2.129243,96.069235,7.23574
min,0.00632,0.0,0.74,0.0,0.389,3.561,2.9,1.1296,1.0,187.0,13.0,0.32,1.73
25%,0.086962,0.0,5.19,0.0,0.453,5.87875,44.85,2.035975,4.0,284.0,17.4,374.6175,6.8475
50%,0.286735,0.0,9.9,0.0,0.538,6.2175,78.2,3.1222,5.0,337.0,19.0,390.955,11.57
75%,3.731923,12.5,18.1,0.0,0.631,6.632,94.1,5.1004,24.0,666.0,20.2,395.63,17.1025
max,73.5341,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,36.98


In [30]:
# -----Now filll the nan values as median using sklearn library
imputer=SimpleImputer(strategy='median')
imputer.fit(housing)

SimpleImputer(strategy='median')

In [31]:
# ----Now check that the median values using statistics_ function
# print(imputer.statistics_)
print(imputer.statistics_.shape)

(13,)


In [32]:
# ----Now we transform the dataFrame in one variable
X=imputer.transform(housing)

In [33]:
# ----sklearn is returning the arrays form but we need a data in matrixs form
# ----So we use pandas library to convert the array to datarame
housing_tr=pd.DataFrame(X,columns=housing.columns)

In [34]:
# ----Now chk it the dataframe using describe function
housing_tr.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0
mean,3.602814,10.836634,11.34495,0.069307,0.558064,6.282077,69.039851,3.74621,9.735149,412.341584,18.473267,353.392822,12.791609
std,8.099383,22.150636,6.877817,0.25429,0.116875,0.712415,28.258248,2.099057,8.731259,168.672623,2.129243,96.069235,7.23574
min,0.00632,0.0,0.74,0.0,0.389,3.561,2.9,1.1296,1.0,187.0,13.0,0.32,1.73
25%,0.086962,0.0,5.19,0.0,0.453,5.87975,44.85,2.035975,4.0,284.0,17.4,374.6175,6.8475
50%,0.286735,0.0,9.9,0.0,0.538,6.2175,78.2,3.1222,5.0,337.0,19.0,390.955,11.57
75%,3.731923,12.5,18.1,0.0,0.631,6.63025,94.1,5.1004,24.0,666.0,20.2,395.63,17.1025
max,73.5341,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,36.98


# Scikit-learn Design

## Features Scaling

In [35]:
# ----------------primarily, two types of features scaling methods:
# --1. Min-max scaling (Normalization)
# --Formula---(value - min)/(max - min)
# --Sklearn provides a class called MinMaxScaler for this

# --2. Standardization
# --formula-- (value - mean)/std
# --Sklearn provides a class called standard Scaler for this

#  Creating a Pipeline

In [36]:
my_pipeline=Pipeline([
    ('imputer',SimpleImputer(strategy="median")),
#     -----add as many as you want in your pipeline
    ("std_Scaler",StandardScaler()),
])

In [37]:
# ----Now we fit and transform the data in other variable
housing_num_tr=my_pipeline.fit_transform(housing)

In [38]:
print(housing_num_tr)

[[-0.43942006  3.12628155 -1.12165014 ... -0.97491834  0.41164221
  -0.86091034]
 [-0.44352175  3.12628155 -1.35893781 ... -0.69277865  0.39131918
  -0.94116739]
 [ 0.15682292 -0.4898311   0.98336806 ...  0.81196637  0.44624347
   0.81480158]
 ...
 [-0.43525657 -0.4898311  -1.23083158 ... -0.22254583  0.41831233
  -1.27603303]
 [ 0.14210728 -0.4898311   0.98336806 ...  0.81196637 -3.15239177
   0.73869575]
 [-0.43974024 -0.4898311   0.37049623 ... -0.97491834  0.41070422
   0.09940681]]


In [39]:
# ----Now we impletes the Algorithm of LinearRegression
# ----we use decisionTreeRegressor because LinearRegression is not work accuratilly
# ---if we use DecisionTreeRegressor it converted the model into overfitting which is not a correct
# model=LinearRegression()
# model=DecisionTreeRegressor()
model=RandomForestRegressor()
model.fit(housing_num_tr, housing_labels)

RandomForestRegressor()

In [40]:
# ---Taking some values of houing dataset
some_data=housing.iloc[:5]

In [41]:
# ----taking some labels of housing dataset
some_labels=housing_labels.iloc[:5]

In [42]:
# ----Now we predicted the values using sklearn library and using function of predict()
prepared_data=my_pipeline.transform(some_data)
# ----Now we predict the values
print(model.predict(prepared_data))

[22.486 25.543 16.38  23.363 23.35 ]


In [43]:
# ----Now checking the lables of the data
print(list(some_labels))

[21.9, 24.5, 16.7, 23.1, 23.0]


# #Evaluating the model

In [44]:
# ---Now we check the mean square error
housing_predictions=model.predict(housing_num_tr)
lin_mse=mean_squared_error(housing_labels, housing_predictions)
# ----rmse means that square_root_mean_square_error
lin_rmse=np.sqrt(lin_mse)

In [45]:
print(lin_mse)
print(lin_rmse)

1.352368960396038
1.1629139952705179


# # Using better evaluation techniques -Cross Validation

In [46]:
# ----dividing groups 1 2 3 4 5 6 7 8 9 10
# ----cross validation works that the model is divided into many groups and than then we check the errors
scores=cross_val_score(model,housing_num_tr,housing_labels,scoring="neg_mean_squared_error",cv=10)
rmse_scores = np.sqrt(-scores)

In [47]:
rmse_scores

array([2.78834529, 2.70869688, 4.34386327, 2.57815198, 3.3362867 ,
       2.55759573, 4.81119069, 3.29031835, 3.34554392, 3.17499271])

In [48]:
# ----Now we just make a function which is printed that the scores, score.mean and score.std
def print_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ",scores.std())

In [49]:
print_scores(rmse_scores)

Scores:  [2.78834529 2.70869688 4.34386327 2.57815198 3.3362867  2.55759573
 4.81119069 3.29031835 3.34554392 3.17499271]
Mean:  3.2934985529202345
Standard deviation:  0.7124391392219006


# #Saving the model 

In [51]:
# ----Now we use joblib as to run the Dragon Real Estates 
dump(model, 'Dragon.joblib')

['Dragon.joblib']

In [52]:
# # Testing the model on test data
X_test = strat_test_set.drop("MEDV",axis=1)
Y_test=strat_test_set['MEDV'].copy()
X_test_prepared=my_pipeline.transform(X_test)
final_predictions= model.predict(X_test_prepared)
final_mse=mean_squared_error(Y_test, final_predictions)
final_rmse=np.sqrt(final_mse)

In [53]:
print(final_rmse)

2.953527721217459


In [54]:
# ----Now checking the values 
# print(final_predictions)
# print(list(Y_test))

[25.054 11.487 25.498 21.903 18.285 14.937 19.761 14.602 31.272 40.533
 20.096 11.615 23.993 28.979 19.441 10.667 31.697 14.519 23.601 18.779
 19.617 17.83  17.536 21.862 18.384 30.596 16.382 32.748  9.013 33.568
 23.908 21.01  22.904 10.88  20.974 11.273 42.484 24.393 23.173 41.559
 23.819 29.308 20.457 20.883 19.411 33.588 44.171 20.009 20.362 21.841
 21.213 14.662 21.189 15.073 24.816 32.836 42.251 28.16  20.142 20.827
 47.31   9.96  18.654 24.738 15.147 32.888 19.435 18.156 19.125 33.842
 27.243 22.919 21.134 22.41  34.964 12.742 15.902 20.01  20.701 21.413
 22.455 21.566 14.399 22.96  20.816 21.291 14.049 21.465 22.002 23.052
 18.555 27.171  7.334 26.251 18.806 29.883 19.539 31.064 14.666 26.703
 20.579 20.134]
[16.5, 10.2, 30.1, 23.0, 14.4, 15.6, 19.4, 14.1, 30.3, 35.2, 23.1, 13.8, 25.0, 27.9, 19.5, 12.3, 32.2, 13.5, 23.8, 21.7, 19.2, 19.5, 10.4, 23.2, 18.6, 28.5, 15.2, 32.0, 7.2, 34.6, 20.1, 20.6, 23.6, 13.1, 23.8, 12.7, 43.1, 24.7, 22.2, 44.0, 28.1, 31.0, 21.7, 23.4, 19.5, 33.1

In [57]:
prepared_data[0]

array([-0.43942006,  3.12628155, -1.12165014, -0.27288841, -1.42262747,
       -0.24465043, -1.31238772,  2.61111401, -1.0016859 , -0.5778192 ,
       -0.97491834,  0.41164221, -0.86091034])

# # Using the Model

In [None]:
# ----importing libraries
from joblib import dump, load
import numpy as np
model = load('Dragon.joblib')
features=np.array([[-0.43942006,  3.12628155, -1.12165014, -0.27288841, -1.42262747,
       -0.23979304, -1.31238772,  2.61111401, -1.0016859 , -0.5778192 ,
       -0.97491834,  0.41164221, -0.86091034]])
model.predict(features)