In [7]:
import numpy as np
import pandas as pd

# More tools here
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import BaggingRegressor, BaggingClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('phone_price.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,0,842,0,2.2,0,1,0,7,0.6,188,...,20,756,2549,9,7,19,0,0,1,0
1,1,1021,1,0.5,1,0,1,53,0.7,136,...,905,1988,2631,17,3,7,1,1,0,1
2,2,563,1,0.5,1,2,1,41,0.9,145,...,1263,1716,2603,11,2,9,1,1,0,1
3,3,615,1,2.5,0,0,0,10,0.8,131,...,1216,1786,2769,16,8,11,1,0,0,1
4,4,1821,1,1.2,0,13,1,44,0.6,141,...,1208,1212,1411,8,2,15,1,1,0,0


### Question 11

In [None]:
train = data.drop(columns=['price_range'])
response = data['price_range']

X_train, X_test, y_train, y_test = train_test_split(train, response, test_size=0.2, stratify=response, random_state=1)

In [20]:
model = DecisionTreeClassifier(random_state=1)

grid = {
    'max_depth': range(2,11),
    "max_leaf_nodes": range(20, 30),
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

gscv = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
gscv.fit(X_train, y_train)
print("Best parameters found: ", gscv.best_params_)
print("Best score found: ", gscv.best_score_)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
Best parameters found:  {'max_depth': 9, 'max_leaf_nodes': 28}
Best score found:  0.9512499999999999


In [21]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

tuned_classifier = gscv.best_estimator_
y_pred_probs = cross_val_predict(tuned_classifier, X_train, y_train, cv=cv, method='predict_proba')[:, 1]

accuracies = []
thresholds = np.arange(0.01, 1.01, 0.01)

for threshold in thresholds:
    accuracy = accuracy_score(y_train, y_pred_probs > threshold)
    accuracies.append(accuracy)

best_threshold = thresholds[np.argmax(accuracies)]
print(best_threshold)
print(np.max(accuracies))
    

0.08
0.951875


In [22]:
y_test_pred = (tuned_classifier.predict_proba(X_test)[0:, 1] > best_threshold)
print(accuracy_score(y_test, y_test_pred))

0.945


### Question 16

In [25]:
# find feature importances of the tuned classifier
importances = tuned_classifier.feature_importances_
print(X_train.columns[importances.argmax()])

ram
