# Task 0 - Setup Environment

In [None]:
import pandas as pd
import plotly.express as px
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score


# Upload the dataset
url = "https://raw.githubusercontent.com/glgunderson/INFOB2DA-PA2/main/data/online_shoppers_intention.csv"
df = pd.read_csv(url)

print("Dataset loaded, shape:", df.shape) # Number of rows and columns


Dataset loaded, shape: (12330, 18)


The dataset consists of 12,330 records (sessions), including 18 key features for each record.

# Task 1 - Get Dataset on screen

## 1.1) Explore Dataset

In [None]:
df.head() # Show first five rows

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


#### Summary Statitics:

In [None]:
df.info() # Summary of dataset
df.isna().sum() # Check for missing values
df.describe() # Basic statitics for numeric columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,2.124006,2.357097,3.147364,4.069586
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,0.911325,1.717277,2.401591,4.025169
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,2.0,2.0,3.0,2.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157214,0.016813,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,13.0,9.0,20.0


#### Feature Types:

- **Numeric features**  
  `Administrative`, `Administrative_Duration`, `Informational`, `Informational_Duration`,  
  `ProductRelated`, `ProductRelated_Duration`, `BounceRates`, `ExitRates`, `PageValues`, `SpecialDay`

- **Categorical features**  
  `Month`, `OperatingSystems`, `Browser`, `Region`, `TrafficType`, `VisitorType`

- **Boolean features**  
  `Weekend`, `Revenue` (target)

In [None]:
df.describe(include='bool')    # summary counts for booleans

Unnamed: 0,Weekend,Revenue
count,12330,12330
unique,2,2
top,False,False
freq,9462,10422


In [None]:
categorical_features = ['Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType']

for col in categorical_features:
    df[col] = df[col].astype('category')

# Summary stats for categorical features
df.describe(include='category')

Unnamed: 0,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType
count,12330,12330,12330,12330,12330,12330
unique,10,8,13,9,20,3
top,May,2,2,1,2,Returning_Visitor
freq,3364,6601,7961,4780,3913,10551


#### Interpretation of Categorical and Boolean Summaries:

- **Month** - 10 unique months are represented, with **May** being the most common (3,364 sessions).  
- **OperatingSystems** - 8 categories, with **OS = 2** used in more than half of the sessions (6,601).  
- **Browser** - 13 browser categories, but **Browser = 2** dominates heavily (7,961 users).  
- **Region** - 9 regions, with **Region = 1** accounting for 4,780 sessions.  
- **TrafficType** - 20 traffic sources, with **TrafficType = 2** the most frequent (3,913).  
- **VisitorType** - 3 categories, with **Returning_Visitor** making up the vast majority (10,551 out of 12,330).  

- **Weekend (Boolean)** - Most sessions occur on weekdays (9,462 False vs. 2,384 True).  
- **Revenue (Boolean)** - Only a small proportion of sessions end in purchase (2,279 True vs. 10,051 False).  

## 1.2) 'Browser 13' vs. 'Other Browsers'

#### Browsers vs. PageValues

In [None]:
# Create dataframe that only consists of Browser and PageValues
df2 = df.loc[:,["Browser","PageValues"]]

# Ensure correct order of labels:
df2['Browser'] = df['Browser'].astype(str)
Browser_labels = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"]
df2['Browser'] = pd.Categorical(df2['Browser'], categories = Browser_labels)

# Illustrate Browser vs. PageValues
px.bar(
    data_frame = df2.groupby(['Browser'],observed=False).mean().reset_index(),
    x = 'Browser',
    y = 'PageValues',
    color='Browser',
    labels={'Browser':'Browsers','PageValues':'Average PageValue'},
    title="Average PagesValue per. Browser"
    )

#### BrowserGroup vs. PageValues

In [None]:
df2.loc[df2['Browser'] == "13", 'BrowserGroup'] = 'Browser 13'
df2.loc[df2['Browser'] != "13", 'BrowserGroup'] = 'Other Browsers'

df2 = df2.loc[:,["BrowserGroup","PageValues"]]

px.bar(
    data_frame = df2.groupby(['BrowserGroup'],observed=False).mean().reset_index(),
    x = 'BrowserGroup',
    y = 'PageValues',
    color='BrowserGroup',
    labels={'Browser':'Browsers','PageValues':'Average PageValue'},
    title="Average PagesValue per. Browser Group"
    )


#### Browsers vs.

# Task 2 - Preprocessing

In [None]:
df_Norm = df.copy(True)

# Log normalization
def log_normalization(column: pd.Series) -> pd.Series:
    min = column.min()
    max = column.max()

    c = 0000000000000.1

    normalized = (np.log(column + c) - np.log(min + c)) / (np.log(max + c) - np.log(min + c))

    return normalized

# Convert Page type counts to floats
df_Norm['Administrative'] = df_Norm['Administrative'].astype(float)
df_Norm['Informational'] = df_Norm['Informational'].astype(float)
df_Norm['ProductRelated'] = df_Norm['ProductRelated'].astype(float)

df_Norm = df_Norm.select_dtypes(include=['float'])

for c in df_Norm.columns:
  df_Norm[c] = log_normalization(df_Norm[c])

df_Norm['Revenue'] = df['Revenue']

df_Norm.describe()


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,0.333843,0.334955,0.117124,0.126915,0.573069,0.607312,0.137934,0.287252,0.13545,0.079586
std,0.328444,0.331471,0.230356,0.264832,0.142232,0.185979,0.253236,0.247471,0.262724,0.242791
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.481065,0.562414,0.0,0.121546,0.0,0.0
50%,0.428034,0.415069,0.0,0.0,0.586677,0.650612,0.027899,0.204252,0.0,0.0
75%,0.662887,0.655468,0.0,0.0,0.670675,0.717468,0.141452,0.36907,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### PCA

In [None]:
# Choose amount of principal components
pca = PCA(n_components=2)

# Define what attributes to consider and choose target:
target = df_Norm['Revenue']
attributes = df_Norm.drop(columns=['Revenue'])

# New dataframe with principal components:
pcaTransformed = pca.fit_transform(attributes)
df_PCA = pd.DataFrame(pcaTransformed)

df_PCA.columns = df_PCA.columns.astype(str)

px.scatter(
    df_PCA,
    x = '0',
    y = '1'
    )

# Task 3 - Clustering Algorithms

In [None]:
df_sample = df_PCA.sample(frac=0.15)

In [None]:
px.scatter(
    df_sample,
    x = '0',
    y = '1'
    )

### 3.1) Affinity Propagation Clustering

In [None]:
# Create copy of Sample
df_AFC = df_sample.copy()

clustering = AffinityPropagation(
    max_iter=1000,
    ).fit(df_AFC)

# Cluster Labels
labels = clustering.labels_

df_AFC['cluster'] = labels.astype(str)

# Amount of clusters
print(len(clustering.cluster_centers_indices_))

# Plot
px.scatter(
    data_frame=df_AFC,
    x = '0',
    y = '1',
    color=labels.astype(str)
    )

22



Affinity propagation did not converge, this model may return degenerate cluster centers and labels.



### 3.2) DBSCAN Clustering

In [None]:
df_dbscan = df_sample.copy()

db_clustering = DBSCAN(eps = 0.065, min_samples =19).fit(df_dbscan)
labels = db_clustering.labels_

# Cluster Labels
db_labels = db_clustering.labels_

df_dbscan["cluster"] = db_labels.astype(str)

px.scatter(
    data_frame=df_dbscan,
    x = '0',
    y = '1',
    color=db_labels.astype(str)
    )

### 3.3) Birch Clustering

In [None]:
df_brc = df_sample.copy()

brc = Birch(
    branching_factor=20,
    threshold=0.080,
    ).fit(df_brc)

# Cluster Labels
brc_labels = brc.labels_

df_brc['cluster'] = brc_labels

px.scatter(
    data_frame=df_brc,
    x = '0',
    y = '1',
    color=brc_labels.astype(str)
    )

# Task 4 - Evaluation of Clustering Methods

### 4.1) Silhouette Score (Algorithm)

##### Custom Euclidean Metric

In [None]:
def custom_euclidean(x, y):
    return np.sqrt((x[0]-y[0])**2 + (x[1]-y[1])**2)

##### Intra Cluster Mean Distance


In [None]:
def a(df: pd.DataFrame, i: int):
    feature_columns = ["0", "1"]

    # Find point 'i' attribute values:
    ref_point = df.loc[i, feature_columns].to_numpy()
    cluster_id = df.loc[i, "cluster"]

    # Filter only the same cluster
    same_cluster = df[df["cluster"] == cluster_id]

    # Remove the index point
    other_points_in_cluster = same_cluster.drop(index=i)

    # Compute distances
    distances = other_points_in_cluster[feature_columns].apply(
        lambda row: custom_euclidean(ref_point, row.to_numpy()), axis=1
    )

    return distances.mean()

##### Inter Cluster Minimum Mean Distance

In [None]:
def b(df: pd.DataFrame, i: int):
    feature_columns = ["0", "1"]

    # Find point 'i' attribute values:
    ref_point = df.loc[i, feature_columns].to_numpy()
    cluster_id = df.loc[i, "cluster"]

    # Filter only other clusters
    other_clusters = df[df["cluster"] != cluster_id]

    # Initiate list of mean distances to other clusters
    mean_distances = []

    # Loop over all other clusters one at a time
    for cluster in other_clusters["cluster"].unique():

        selected_cluster = other_clusters[other_clusters["cluster"] == cluster]

        # Compute euclidean distances
        distances = selected_cluster[feature_columns].apply(
        lambda row: custom_euclidean(ref_point, row.to_numpy()), axis=1)

        # Mean distance from 'i' to selected cluster
        mean_distances.append(distances.mean())


    return min(mean_distances)

##### Silhouette Score

In [None]:
def silhouette_score(df: pd.DataFrame, i: int):
    a_val = a(df, i)
    b_val = b(df, i)
    return (b_val - a_val) / max(a_val,b_val)

### Evaluating Silhouette Scores:

In [None]:
# Affinity Propagation:
df_AFC["Silhouette_score"] = df_AFC['Silhouette_score'] = [silhouette_score(df_AFC, i) for i in df_AFC.index]

df_AFC = df_AFC.sort_values(by=['cluster', 'Silhouette_score'], ascending=[True, False]).reset_index(drop=True)

px.bar(
    df_AFC,
    x='Silhouette_score',
    color=df_AFC['cluster'].astype(str),
    orientation='h'
).update_traces(width=15)


In [None]:
# DBSCAN:
df_dbscan["Silhouette_score"] = df_dbscan['Silhouette_score'] = [silhouette_score(df_dbscan, i) for i in df_dbscan.index]


In [None]:
df_dbscan = df_dbscan.sort_values(by=['cluster', 'Silhouette_score'], ascending=[True, False]).reset_index(drop=True)

px.bar(
    df_dbscan,
    x='Silhouette_score',
    color=df_dbscan['cluster'].astype(str),
    orientation='h'
).update_traces(width=15)


In [None]:
# Birch:
df_brc["Silhouette_score"] = df_brc['Silhouette_score'] = [silhouette_score(df_brc, i) for i in df_brc.index]

df_brc = df_brc.sort_values(by=['cluster', 'Silhouette_score'], ascending=[True, False]).reset_index(drop=True)

px.bar(
    df_brc,
    x='Silhouette_score',
    color=df_brc['cluster'].astype(str),
    orientation='h'
).update_traces(width=15)

### 4.2) David Bouldin Score

In [30]:
db_score_affinity = davies_bouldin_score(df_sample, clustering.labels_)
print(f"The Davies Bouldin Score for Affinity Propagation is {db_score_affinity}.")

db_score_dbscan = davies_bouldin_score(df_dbscan, db_clustering.labels_)
print(f"The Davies Bouldin Score for DBSCAN is {db_score_dbscan}.")

db_score_birch = davies_bouldin_score(df_PCA, brc_labels)
print(f"The Davies Bouldin Score for Birch is {db_score_birch}.")

The Davies Bouldin Score for Affinity Propagation is 0.5387596172703707.
The Davies Bouldin Score for DBSCAN is 0.3176848413261349.
The Davies Bouldin Score for Birch is 0.6400433419835238.


### 4.3) Calinski-Harabasz Index

In [31]:
ch_score_affinity = calinski_harabasz_score(df_sample, clustering.labels_)
print(f"The Calinski-Harabasz score for Affinity Propagation is {ch_score_affinity}.")

ch_score_dbscan = calinski_harabasz_score(df_dbscan, db_clustering.labels_)
print(f"The Calinski-Harabasz score for DBSCAN is {ch_score_dbscan}.")

ch_score_birch = calinski_harabasz_score(df_PCA, brc_labels)
print(f"The Calinski-Harabasz score for Birch is {ch_score_birch}.")

The Calinski-Harabasz score for Affinity Propagation is 314.42045563610515.
The Calinski-Harabasz score for DBSCAN is 60498.02950951246.
The Calinski-Harabasz score for Birch is 13616.352139217675.


# Task 5 - Distance Functions

### 5.1) Euclidean Distance Function

This function was implemented in the algorithm for calculation silhouette score

### 5.2) Manhatten Distance Function

In [None]:

def custom_manhattan(x, y):
    return abs(x[0]-y[0]) + abs(x[1]-y[1])

### 5.3) Cosine Similarity Function

In [None]:
def custom_cosine(x,y):
    dot = x[0] * y[0] + x[1] * y[1]
    norm_x = np.sqrt(x[0]**2 + x[1]**2)
    norm_y = np.sqrt(y[0]**2 + y[1]**2)
    similarity = dot / (norm_x * norm_y)

    return 1 - similarity

### 5.4) Evaluation

In [None]:
# Euclidean
db_euclidean = DBSCAN(eps = 0.045, min_samples =35,metric=custom_euclidean).fit(df_dbscan)
print(davies_bouldin_score(df_dbscan,db_euclidean.labels_))

df_dbscan["Cluster"] = db_euclidean.labels_.astype(str)
db_labels = db_euclidean.labels_.astype(str)

px.scatter(
    data_frame=df_dbscan,
    x = '0',
    y = '1',
    color=db_labels
    )

0.6801109740279863


In [None]:
# Manhattan
db_manhattan = DBSCAN(eps = 0.045, min_samples =35,metric=custom_manhattan).fit(df_dbscan)
print(davies_bouldin_score(df_dbscan,db_clustering.labels_))

df_dbscan["Cluster"] = db_manhattan.labels_.astype(str)
db_labels = db_manhattan.labels_.astype(str)

px.scatter(
    data_frame=df_dbscan,
    x = '0',
    y = '1',
    color=db_labels
    )

0.46654181224974145


In [None]:
# Cosine
db_cosine = DBSCAN(eps = 0.035, min_samples =35,metric=custom_cosine).fit(df_dbscan)
print(davies_bouldin_score(df_dbscan,db_cosine.labels_))

df_dbscan["Cluster"] = db_cosine.labels_.astype(str)
db_labels = db_cosine.labels_.astype(str)

px.scatter(
    data_frame=df_dbscan,
    x = '0',
    y = '1',
    color=db_labels
    )

ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)