In [1]:
  from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [3]:
dataset = pd.read_csv('/content/drive/MyDrive/ML Dataset/online_shoppers_intention.csv')


In [4]:
dataset.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [5]:
from sklearn.preprocessing import LabelEncoder

# Assuming 'dataset' is your pandas DataFrame
# Identify columns with object (string) dtype
categorical_cols = dataset.select_dtypes(include=['object']).columns

# Create a LabelEncoder object
labelencoder = LabelEncoder()

# Encode categorical features
for col in categorical_cols:
    dataset[col] = labelencoder.fit_transform(dataset[col])


In [6]:
dataset.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,1,1,1,1,2,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,2,2,2,1,2,2,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,4,1,9,3,2,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,2,3,2,2,4,2,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,2,3,3,1,4,2,True,False


In [7]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Rescale Data

In [9]:
from sklearn.preprocessing import MinMaxScaler
from numpy import set_printoptions
array = dataset.values
#separate array into input and utput componenets
X = array[:,:-1]
Y= array[:,-1]

scaler = MinMaxScaler(feature_range = (0,1))
rescaledX = scaler.fit_transform(X)

#summarize transformed data
set_printoptions(precision = 3)

print(rescaledX[:,:-1])

[[0.    0.    0.    ... 0.    0.    1.   ]
 [0.    0.    0.    ... 0.    0.053 1.   ]
 [0.    0.    0.    ... 1.    0.105 1.   ]
 ...
 [0.    0.    0.    ... 0.    0.632 1.   ]
 [0.148 0.022 0.    ... 0.25  0.526 1.   ]
 [0.    0.    0.    ... 0.    0.053 0.   ]]


# Standardize Data

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
rescaleX = scaler.transform(X)
#summarize transformed data
set_printoptions(precision = 3)
print(rescaleX[:,:-1])

[[-0.697 -0.457 -0.396 ... -0.894 -0.763  0.408]
 [-0.697 -0.457 -0.396 ... -0.894 -0.514  0.408]
 [-0.697 -0.457 -0.396 ...  2.437 -0.266  0.408]
 ...
 [-0.697 -0.457 -0.396 ... -0.894  2.219  0.408]
 [ 0.507 -0.033 -0.396 ... -0.061  1.722  0.408]
 [-0.697 -0.457 -0.396 ... -0.894 -0.514 -2.488]]


# Normalize Data

In [11]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer().fit(X)
rescaleX = scaler.transform(X)
#summarize transformed data
set_printoptions(precision = 3)
print(rescaleX[:,:-1])


[[0.    0.    0.    ... 0.277 0.277 0.553]
 [0.    0.    0.    ... 0.016 0.031 0.031]
 [0.    0.    0.    ... 0.835 0.278 0.186]
 ...
 [0.    0.    0.    ... 0.005 0.07  0.011]
 [0.011 0.211 0.    ... 0.008 0.031 0.006]
 [0.    0.    0.    ... 0.043 0.087 0.   ]]


# Binarize Data

In [12]:
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=0.0).fit(X)
binaryX = binarizer.transform(X)
#summarize transformed data
set_printoptions(precision = 3)
print(binaryX[:,:-1])

[[0. 0. 0. ... 1. 1. 1.]
 [0. 0. 0. ... 1. 1. 1.]
 [0. 0. 0. ... 1. 1. 1.]
 ...
 [0. 0. 0. ... 1. 1. 1.]
 [1. 1. 0. ... 1. 1. 1.]
 [0. 0. 0. ... 1. 1. 0.]]


#Univariate Selection


In [13]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder

# Creating a LabelEncoder object
labelencoder = LabelEncoder()


# Convert boolean values in 'Y' to 0 and 1
Y_encoded = labelencoder.fit_transform(Y)

# feature extraction
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y_encoded)  # Use the encoded target variable
# summarize scores
set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)
# summarize selected features
print(features[:,:-1])

[1.134e+03 4.175e+04 3.580e+02 3.506e+04 1.932e+04 8.774e+05 2.965e+01
 2.899e+01 1.751e+05 5.380e+01 8.616e+01 1.037e+00 8.873e+00 3.038e+00
 1.283e+00 3.755e+01 8.120e+00]
[[0.0 0.0 0.0]
 [0.0 0.0 64.0]
 [0.0 0.0 0.0]
 ...
 [0.0 0.0 184.25]
 [75.0 0.0 346.0]
 [0.0 0.0 21.25]]


In [14]:
#Recursive Feature Elimination
# from sklearn.feature_selection import RFE
# from sklearn.svm import SVC

# model = SVC(kernel="linear")
# rfe = RFE(model, n_features_to_select=2)
# X_new = rfe.fit_transform(X_train, y_train)

# print("Feature Ranking:", rfe.ranking_)

# print("Selected Features Shape:", X_new.shape)


In [15]:
 # PCAfrom sklearn.decomposition import PCA
# pca = PCA(n_components=2)
# X_new = pca.fit_transform(X_train)

# print("Explained Variance Ratio:", pca.explained_variance_ratio_)
# print("Selected Features Shape:", X_new.shape)


#Evaluate the Performance of Machine Learning Algorithms with Resampling

#K-fold cross validation

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold, LeaveOneOut, RepeatedStratifiedKFold
from sklearn.svm import SVC # Importing the SVC class

k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=0)
model = SVC(kernel='linear')

scores = cross_val_score(model, X, y, cv=kf)

print(f"K-Fold Cross Validation Scores: {scores}")
print(f"Mean Accuracy: {np.mean(scores):.4f}")

##Leave-One-Out Cross Validation

In [None]:
loo = LeaveOneOut()
scores = cross_val_score(model, X, y, cv=loo)

print(f"LOOCV Accuracy: {np.mean(scores):.4f}")


## Repeated Random Test-Train Splits

In [None]:
repeated_kf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)

scores = cross_val_score(model, X, y, cv=repeated_kf)

print(f"Repeated Holdout Accuracy Scores: {scores}")
print(f"Mean Accuracy: {np.mean(scores):.4f}")


# Machine Learning Algorithm Performance Metrics

### 1.Classification Metrics

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, confusion_matrix, classification_report

model = SVC(kernel='linear', probability=True)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)
logloss = log_loss(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob[:, 1])
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Classification Accuracy: {accuracy:.4f}")
print(f"Logarithmic Loss: {logloss:.4f}")
print(f"Area Under ROC Curve: {roc_auc:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

### 2.Regression Tasks

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R2 Score: {r2:.4f}")
