In [0]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from prettytable import PrettyTable
import statistics as st

In [0]:
boston = load_boston()
x = boston.data #independent variables
y = boston.target #target variable

# Task: 1

<font color='red'><b>Step 1 Creating samples: </b></font> Randomly create 30 samples from the whole boston data points.
<ol>
<li>Creating each sample: Consider any random 303(60% of 506) data points from whole data set and then replicate any 203 points from the sampled points</li>
<li>Ex: For better understanding of this procedure lets check this examples, assume we have 10 data points [1,2,3,4,5,6,7,8,9,10], first we take 6 data points randomly consider we have selected [4, 5, 7, 8, 9, 3] now we will replciate 4 points from [4, 5, 7, 8, 9, 3], consder they are [5, 8, 3,7] so our final sample will be [4, 5, 7, 8, 9, 3, 5, 8, 3,7]</li>
<li> we create 30 samples like this </li>
<li> Note that as a part of the Bagging when you are taking the random samples make sure each of the sample will have                different set of columns</li>
<li> Ex: assume we have 10 columns for the first sample we will select [3, 4, 5, 9, 1, 2] and for the second sample [7, 9, 1, 4, 5, 6, 2] and so on...</li>
<li> Make sure each sample will have atleast 3 feautres/columns/attributes</li>
</ol>

<font color='red'><b>Step 2 Building High Variance Models on each of the sample and finding train MSE value:</b></font> Build a DecisionTreeRegressor on each of the sample.
<ol><li>Build a regression trees on each of 30 samples.</li>
<li>computed the predicted values of each data point(506 data points) in your corpus.</li>
<li> predicted house price of $i^{th}$ data point $y^{i}_{pred} =  \frac{1}{30}\sum_{k=1}^{30}(\text{predicted value of } x^{i} \text{ with } k^{th} \text{ model})$.</li>
<li>Now calculate the $MSE =  \frac{1}{506}\sum_{i=1}^{506}(y^{i} - y^{i}_{pred})^{2}$.</li>
</ol>

<font color='red'><b>Step 3 Calculating the OOB score :</b></font>
<ol>
<li>Computed the predicted values of each data point(506 data points) in your corpus.</li>
<li>Predicted house price of $i^{th}$ data point $y^{i}_{pred} =  \frac{1}{k}\sum_{\text{k= model which was buit on samples not included } x^{i}}(\text{predicted value of } x^{i} \text{ with } k^{th} \text{ model})$.</li>
<li>Now calculate the $OOB Score =  \frac{1}{506}\sum_{i=1}^{506}(y^{i} - y^{i}_{pred})^{2}$.</li>
</ol>

In [0]:
def generated_indices(x, y): # Takes x, y & returns row indices and their y's
  sampled_rows_x = []
  y_for_x = []
  for i in range(30):
    a60 = list(np.random.choice(x.shape[0], 303)) #Generates a uniform random sample of size 303 in range 506
    a40 = list(np.random.choice(a60,203)) #Generates a uniform random sample of size 203 from data1
    final_sample = a60 + a40 #merging both lists
    final_y = y[final_sample] #Creating final y by using the row no.s in final_rows
    sampled_rows_x.append(final_sample)
    y_for_x.append(final_y)
  return sampled_rows_x, y_for_x #returns sample x's(303+203) row indices and it's relative y

In [0]:
def col_selector(x): #Returns selected columns
  selected_columns = []
  for i in range(30):
    n = np.random.randint(3, x.shape[1]) #Deciding no. of columns to be selected(has to minimum 3)
    columns = list(np.random.choice(x.shape[1], n, replace = False)) #Selecting n columns from range(13),sampling without replacement
    selected_columns.append(columns)
  return selected_columns

In [0]:
def sample_generator(rows, cols, x): # Generates 30 samples
  samples30 = []
  for i in range(30):
    sample = x[rows[i], :][:, cols[i]]
    samples30.append(sample)
  return samples30

In [0]:
def models30(samples, relative_y):
  models = []
  for index, i in enumerate(samples):
    clf = DecisionTreeRegressor(random_state=0)
    y_selected = relative_y[index]
    model = clf.fit(i, y_selected)
    models.append(model)
  return models

In [0]:
def scores(x, y, train_cols, models, X_rows):
  sum_y_pred = []
  sum_oob_y_pred = []

  for i in range(len(x)):
    y_pred = 0
    y_pred_oob = 0
    count = 0

    for idx, model in enumerate(models):
      y_pred += model.predict(x[i,train_cols[idx]].reshape(1,-1))

      if(i not in X_rows[idx]):
        count += 1
        y_pred_oob += model.predict(x[i,train_cols[idx]].reshape(1,-1))
    
    y_pred = y_pred/30
    sum_y_pred.append(y_pred)

    y_pred_oob = y_pred_oob/count
    sum_oob_y_pred.append(y_pred_oob)

  mse = round(mean_squared_error(y, sum_y_pred), 6)
  oob_score = round(mean_squared_error(y, sum_oob_y_pred), 6)
  return mse, oob_score, y_pred

In [0]:
def random_forrest(x, y):
  X_rows, y_for_x = generated_indices(x, y)
  selected_columns = col_selector(x)
  samples = sample_generator(X_rows, selected_columns, x)
  models = models30(samples, y_for_x)

  mse, oob_score, _ = scores(x, y, selected_columns, models, X_rows)
    
  return mse, oob_score

In [11]:
mse, oob_scores = random_forrest(x, y)
print("MSE:",mse,"\nOOB Score:",oob_scores)

MSE: 4.74143 
OOB Score: 14.597161


# Task: 2

<pre>
<font color='red'><b>Computing CI of OOB Score and Train MSE</b></font>
<ol>
<li> Repeat Task 1 for 35 times, and for each iteration store the Train MSE and OOB score </li>
<li> After this we will have 35 Train MSE values and 35 OOB scores </li>
<li> using these 35 values (assume like a sample) find the confidence intravels of MSE and OOB Score </li>
<li> you need to report CI of MSE and CI of OOB Score </li>
<li> Note: Refer the Central_Limit_theorem.ipynb to check how to find the confidence intravel</li>
</ol>
</pre>

In [0]:
mse35 = []
oob35 = []
for _ in range(35):
  mse, oob_scores = random_forrest(x, y)
  mse35.append(mse)
  oob35.append(oob_scores)

![alt text](https://i.imgur.com/U7rnj7d.png)

In [15]:
table = PrettyTable(["#samples", "Sample Name", "Sample Size", "Sample mean", "Pop Std","Left C.I 95%", "Right C.I 95%"])

for i in [mse35, oob35]:
  pop_std = round(np.asarray(i).std(), 3)
  pop_mean = round(np.asarray(i).mean(), 3)
  sample = np.asarray(i)
  sample_size = 30 #read CLT, check n, samples of size 'n'
  sample_mean = round(sample.mean(), 3)
  left_limit = np.round(sample_mean - 2*(pop_std/np.sqrt(sample_size)), 3)
  right_limit = np.round(sample_mean + 2*(pop_std/np.sqrt(sample_size)), 3)
  sample_name = 'MSE' if i == mse35 else 'OOB_score'
  row_no = 1 if i == mse35 else 2
  row = []
  row.append(row_no)
  row.append(sample_name)
  row.append(sample_size)
  row.append(sample_mean)
  row.append(pop_std)
  row.append(left_limit)
  row.append(right_limit)
  table.add_row(row)
print(table)

+----------+-------------+-------------+-------------+---------+--------------+---------------+
| #samples | Sample Name | Sample Size | Sample mean | Pop Std | Left C.I 95% | Right C.I 95% |
+----------+-------------+-------------+-------------+---------+--------------+---------------+
|    1     |     MSE     |      30     |    4.585    |   0.46  |    4.417     |     4.753     |
|    2     |  OOB_score  |      30     |    14.655   |  1.274  |    14.19     |     15.12     |
+----------+-------------+-------------+-------------+---------+--------------+---------------+


# Task: 3
<pre>
<font color='red'><b>Given a single query point predict the price of house.</b></font>

<li>Consider xq= [0.18,20.0,5.00,0.0,0.421,5.60,72.2,7.95,7.0,30.0,19.1,372.13,18.60] Predict the house price for this point as mentioned in the step 2 of Task 1. </li>
</pre>

In [0]:
xq = [0.18, 20.0, 5.00, 0.0, 0.421, 5.60, 72.2, 7.95, 7.0, 30.0, 19.1, 372.13, 18.60]

X_rows, y_for_x = generated_indices(x, y)
selected_columns = col_selector(x)
samples = sample_generator(X_rows, selected_columns, x)
models = models30(samples, y_for_x)

xqa = np.asarray(xq).reshape(1, -1)
agg_y_pred = []
for i in range(len(xqa)):
  y_pred=0
  for idx, model in enumerate(models):
    y_pred += model.predict(xqa[i,selected_columns[idx]].reshape(1,-1))
  y_pred=y_pred/30
  agg_y_pred.append(round(float(y_pred), 6))

In [17]:
print(agg_y_pred)

[20.477273]
