In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import  accuracy_score
from sklearn.tree import plot_tree


In [2]:
# Read shop_data.csv from the working directory into `df` and preview the
# first five rows.
DATA_PATH = "shop_data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [3]:
# Print the column names, non-null counts and dtypes of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [None]:

# Mean of the Revenue column for each Month value (lineplot aggregates
# repeated x values with the mean by default; error bars are disabled).
fig, ax = plt.subplots()
sns.lineplot(
    data=df,
    x="Month",
    y="Revenue",
    errorbar=None,
    ax=ax,
)
# Title and labels so the figure can be read on its own.
ax.set(title="Mean Revenue by Month", xlabel="Month", ylabel="Mean Revenue")
plt.show()


In [None]:
# Mean of the Revenue column for each Region value (lineplot aggregates
# repeated x values with the mean by default; error bars are disabled).
fig, ax = plt.subplots()
sns.lineplot(
    data=df,
    x="Region",
    y="Revenue",
    errorbar=None,
    ax=ax,
)
# Title and labels so the figure can be read on its own.
ax.set(title="Mean Revenue by Region", xlabel="Region", ylabel="Mean Revenue")
plt.show()


In [None]:
# Mean of the Revenue column for each Weekend value (lineplot aggregates
# repeated x values with the mean by default; error bars are disabled).
fig, ax = plt.subplots()
sns.lineplot(
    data=df,
    x="Weekend",
    y="Revenue",
    errorbar=None,
    ax=ax,
)
# Title and labels so the figure can be read on its own.
ax.set(title="Mean Revenue by Weekend", xlabel="Weekend", ylabel="Mean Revenue")
plt.show()


In [4]:
# Turn the two boolean columns into integer codes. The same encoder object is
# refit for each column, so after the loop it holds the Revenue classes only.
le = LabelEncoder()
for flag_col in ("Weekend", "Revenue"):
    df[flag_col] = le.fit_transform(df[flag_col])

# One-hot encode the listed columns, dropping the first level of each.
# NOTE: this overwrites `df`; re-running the cell without reloading the data
# fails because the listed columns no longer exist.
cols = ['VisitorType', 'Month','SpecialDay']
df = pd.get_dummies(data=df, columns=cols, drop_first=True)

In [5]:
# Target is the encoded Revenue column; every other column is a feature.
y = df["Revenue"]
X = df.drop(columns="Revenue")

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Baseline: an unconstrained decision tree.
# random_state is fixed (same seed as the split and the pruning cells) so the
# fitted tree, and therefore the reported accuracy, is reproducible.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy on the held-out test rows.
print(accuracy_score(y_test, y_pred))

In [None]:
from sklearn.metrics import r2_score,mean_squared_error

# Score the current `model` on both splits so train and test can be compared.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Same four lines as before, in the same order: MSE train/test, then r2 train/test.
for label, metric in (("MSE", mean_squared_error), ("r2", r2_score)):
    print(f"{label} train: ", metric(y_train, y_pred_train))
    print(f"{label} test: ", metric(y_test, y_pred_test))

In [None]:
# Draw the top three levels of the current `model`.
# (plot_tree is already imported in the first cell.)
fig, ax = plt.subplots(figsize=(18, 10))
plot_tree(
    model,
    # Pass a plain list: some scikit-learn releases reject a pandas Index here.
    feature_names=list(X.columns),
    filled=True,
    max_depth=3,
    ax=ax,
)

fig.tight_layout()
plt.show()


## Using pre-pruning

In [None]:
# Pre-pruning by depth: fit one tree per depth limit and report its test accuracy.
# Depths are listed in ascending order so the printed results read as a sweep;
# `model` and `y_pred` are left holding the last (deepest) tree after the loop.
max_depth = [2, 3, 4, 5, 6, 7, 9, 11]
for depth in max_depth:
    # Fixed seed so each depth gives the same tree on every run.
    model = DecisionTreeClassifier(max_depth=depth, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"accuracy score for {depth} is {accuracy_score(y_test,y_pred)}")


In [None]:
from sklearn.metrics import r2_score,mean_squared_error

# `model` is whatever the previous cell assigned last; when cells run in
# order that is the tree from the final iteration of the depth loop.
y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
print("MSE train: ", mse_train)
print("MSE test: ", mse_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
print("r2 train: ", r2_train)
print("r2 test: ", r2_test)

In [None]:
# Pre-pruning by node size: require at least 10 samples to split a node and
# at least 5 samples in every leaf. Seed fixed for reproducibility.
model = DecisionTreeClassifier(
    min_samples_leaf=5,
    min_samples_split=10,
    random_state=42,
)
model.fit(X_train, y_train)

# Mean accuracy on the test split; shown as the cell output.
acc = model.score(X_test, y_test)
acc
    

In [None]:
# Train/test comparison for the current `model`.
# mean_squared_error and r2_score are the names imported in an earlier cell.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

for label, metric in (("MSE", mean_squared_error), ("r2", r2_score)):
    print(f"{label} train: ", metric(y_train, y_pred_train))
    print(f"{label} test: ", metric(y_test, y_pred_test))

## Post-pruning

In [None]:
# Fit an unconstrained tree with a fixed seed; the next cell derives the
# cost-complexity pruning path from it.
full_tree = DecisionTreeClassifier(random_state=42)
full_tree.fit(X=X_train, y=y_train)


In [None]:
# Compute the cost-complexity pruning path on the training data and keep the
# candidate alpha values it returns.
path = full_tree.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path["ccp_alphas"]

print(ccp_alphas)

In [None]:
# Fit one pruned tree per candidate alpha and record (fitted model, accuracy)
# pairs in the same order as `ccp_alphas`.
# NOTE(review): accuracy here is measured on the test split, so the alpha
# chosen from these scores is tuned on test data — consider a validation split.
tree = []

for alpha in ccp_alphas:
    model = DecisionTreeClassifier(ccp_alpha=alpha, random_state=42).fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    tree.append((model, acc))
    

In [None]:
# Pick the alpha whose pruned tree scored highest.
# `tree` holds (model, accuracy) pairs in the same order as `ccp_alphas`, so
# the two are zipped to recover the alpha that belongs to each accuracy.
best_acc = 0
best_alpha = 0

for alpha, (model, curr_acc) in zip(ccp_alphas, tree):
    if curr_acc > best_acc:
        # Update both together so best_alpha always matches best_acc.
        best_acc = curr_acc
        best_alpha = alpha
    

In [None]:
# Display the highest accuracy found by the selection loop in the previous cell.
best_acc

In [None]:
# Refit a tree with the selected alpha. The seed matches the one used for the
# candidate trees so this fit reproduces the tree that was evaluated.
best_model = DecisionTreeClassifier(random_state=42, ccp_alpha=best_alpha)
best_model.fit(X_train, y_train)


In [None]:
# Predict on the held-out test features (the split defines `X_test`;
# lowercase `x_test` does not exist and raised a NameError).
y_pred = best_model.predict(X_test)