In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('Social_Network_Ads.csv')
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
data['Gender'].replace({'Male' : 0, 'Female' : 1},inplace = True)

In [5]:
X = data.iloc[:,1:4].values
y = data.iloc[:,-1].values

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [7]:
X = scaler.fit_transform(X)
X

array([[-1.02020406, -1.78179743, -1.49004624],
       [-1.02020406, -0.25358736, -1.46068138],
       [ 0.98019606, -1.11320552, -0.78528968],
       ...,
       [ 0.98019606,  1.17910958, -1.46068138],
       [-1.02020406, -0.15807423, -1.07893824],
       [ 0.98019606,  1.08359645, -0.99084367]])

fit: It computes the mean and standard deviation for each feature in the dataset X

transform: It then applies the standardization formula to each feature. For each value in a feature, it subtracts the mean of that feature and divides the result by the standard deviation. This operation scales each feature to have a mean of 0 and a standard deviation of 1.

Normalization: StandardScaler scales each feature to have a mean (average) of 0 and a standard deviation of 1. This process is also known as "standardization" or "z-score normalization."

Visualization: StandardScaler is useful for data visualization. Plotting standardized data can make it easier to visualize and understand the relationships between features and identify patterns or outliers.

In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 1)

In [9]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()

In [10]:
clf.fit(X_train,y_train)

DecisionTreeClassifier()

In [11]:
y_pred = clf.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.7875

In [13]:
param_dist = {
    "criterion" :["gini","entropy"],
    "max_depth" :[1,2,3,4,5,6,7,None]
}

In [15]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(clf,param_grid = param_dist,cv = 10, n_jobs = 1)

In [16]:
grid.fit(X_train,y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, None]})

In [17]:
grid.best_estimator_

DecisionTreeClassifier(max_depth=2)

In [18]:
grid.best_score_

0.91875

In [19]:
grid.best_index_

1

In [20]:
grid.best_params_

{'criterion': 'gini', 'max_depth': 2}

In [21]:
grid.cv

10

In [22]:
grid.classes_

array([0, 1], dtype=int64)