In [1]:
import pandas as pd
from pandas_datareader import data
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Load only the numeric columns used in this notebook (six predictors plus the price target).
numeric_cols = ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']
df = pd.read_csv('diamonds.csv', usecols=numeric_cols)

In [3]:
# Preview the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326,3.95,3.98,2.43
1,0.21,59.8,61.0,326,3.89,3.84,2.31
2,0.23,56.9,65.0,327,4.05,4.07,2.31
3,0.29,62.4,58.0,334,4.2,4.23,2.63
4,0.31,63.3,58.0,335,4.34,4.35,2.75


In [4]:
# Feature matrix: every loaded column except the target.
# Note: the variable `x` is the whole feature frame, not the dataset's 'x' column.
x = df.drop(columns='price')

In [5]:
# Target vector: the price column that the tree will learn to predict.
y = df['price']

In [6]:
# Hold out 20% of the rows for testing.
# A fixed random_state keeps the split, and every score derived from it,
# reproducible across kernel restarts (random_state=None reshuffles on each run).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [7]:
# Baseline regression tree with depth capped at 9.
# random_state is fixed because scikit-learn randomly permutes the features at
# each split; without a seed, ties between equally good splits may resolve
# differently from run to run.
rt = DecisionTreeRegressor(
    criterion='squared_error',
    max_depth=9,
    random_state=42,
)

In [8]:
# Train the baseline tree on the training split only.
rt.fit(x_train, y_train)

In [9]:
# Predict prices for the held-out test rows.
y_pred = rt.predict(x_test)

In [10]:
# R² of the baseline tree on the held-out test split.
r2_score(y_test, y_pred)

0.878408683461729

# Hyperparameter Tuning

In [11]:
# Search space for the grid search over DecisionTreeRegressor settings.
# Not searched for now (the criterion names below are the current scikit-learn
# spellings; 'mse' / 'mae' were renamed):
#     'criterion': ['squared_error', 'absolute_error']
param_grid = dict(
    max_depth=[5, 6, 7, 8, 9, 10, None],  # None lets the tree grow without a depth limit
    max_features=[2, 3, 4, 5],
    min_samples_split=[10, 20, 40, 45, 50, 55, 60, 70, 80],
)

In [12]:
# Exhaustive search over param_grid using GridSearchCV's defaults
# (5-fold cross-validation, scored with the estimator's own R² score).
# The grid tries max_features values below the number of features, so each tree
# samples candidate features at random; seeding the estimator makes best_params_
# and best_score_ reproducible. n_jobs=-1 runs the fits on all available cores
# and does not change the results.
regt = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid=param_grid,
    n_jobs=-1,
)

In [13]:
# Run the search on the training split; the test split stays untouched.
# With the default refit=True, the best combination is refit on all of x_train afterwards.
regt.fit(x_train, y_train)

In [14]:
# Mean cross-validated score of the best parameter combination.
# It is computed on folds of the training split, so it is not a test-set score
# and is not directly comparable to the r2_score on x_test computed earlier.
regt.best_score_

0.8810322011124052

In [15]:
# Parameter combination that produced best_score_.
regt.best_params_

{'max_depth': 7, 'max_features': 5, 'min_samples_split': 20}

# Feature Importance

In [16]:
# Rank the features of the baseline tree `rt` by importance, highest first.
# The (importance, name) tuples are sorted directly, so ties fall back to the name.
ranked_features = sorted(
    zip(rt.feature_importances_, x_train.columns),
    reverse=True,
)
for score, feature in ranked_features:
    print(feature, score)

carat 0.6868005350120094
y 0.2996182011745922
depth 0.004408071585882202
table 0.0041976837546341544
x 0.002717384190986826
z 0.002258124281895229


## Explanation

1) <strong>rt.feature_importances_ :</strong>
This is an attribute of the trained Decision Tree model (rt). Other tree-based models (Random Forests, XGBoost, etc.) expose the same attribute.
It contains an array of importance scores for each feature, where higher values mean the feature was more important in making predictions.

2) <strong>x_train.columns :</strong>
This contains the column names (features) of the training dataset (x_train).
It helps map feature importance scores to their corresponding feature names.

3) <strong>zip(rt.feature_importances_, x_train.columns) :</strong>
Combines the importance scores and feature names into pairs (tuples) like:
(0.45, 'age'), (0.3, 'income'), ...

4) <strong>sorted(..., reverse=True) :</strong>
Sorts the feature importance pairs in descending order (highest importance first).

5) <strong>for importance, name in ... :</strong>
Loops through each sorted (feature importance, feature name) pair.

6) <strong>importance = the importance score (float) :</strong>

7) <strong>name = the feature name (string) :</strong>

8) <strong>print(name, importance) :</strong>
Displays the feature name followed by its importance score.

<strong>What This Code Does :</strong>
Takes the feature importances from a trained model (rt). Pairs each importance score with its corresponding feature name. Sorts these pairs from most important to least important. Prints them in order.