# Iris Flower Dataset Example

In [None]:
# Library imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn.datasets
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

import time

from pathlib import Path

### Load Dataset and Analyse it

In [None]:
# Load the Iris dataset bundled with scikit-learn; later cells read its DESCR, data, target and target_names attributes.
iris = sklearn.datasets.load_iris()

In [None]:
# Print the description text shipped with the dataset object.
print(iris.DESCR)

In [None]:
# Print the raw feature matrix (loaded into a four-column DataFrame further down).
print(iris.data)

In [None]:
# Print the class index of every sample; these indices are used below to look up names in iris.target_names.
print(iris.target)

In [None]:
# Print the class names that the integer targets index into.
print(iris.target_names)

#### Load dataset into Pandas DataFrame

In [None]:
# Build a DataFrame of the four measurements with human-readable column names.
iris_df = pd.DataFrame(iris.data, columns=['Sepal length', 'Sepal width', 'Petal length', 'Petal width'])
iris_df_labels = iris_df.copy() # Independent copy, so iris_df itself keeps only the feature columns
iris_df_labels['Label'] = iris.target_names[iris.target] # Map each integer class index to its class name
iris_df_labels.head() # head() shows the first 5 rows by default (pass n to change that)

In [None]:
iris_df_labels.tail() # tail() shows the last 5 rows by default (pass n to change that)

In [None]:
iris_df_labels.describe() # describe() summarises the numeric columns by default (count, mean, std, min, quartiles, max)

### Visualize Dataset using Scatter Plots

In [None]:
# One scatter series per class: sepal width on x, sepal length on y.
groups = iris_df_labels.groupby("Label")
sepal_fig, sepal_ax = plt.subplots()
for species, rows in groups:
    # Markers only (empty linestyle), so each class shows up as a point cloud.
    sepal_ax.plot(
        rows["Sepal width"],
        rows["Sepal length"],
        marker="o",
        linestyle="",
        label=species,
    )
sepal_ax.set_xlabel('Sepal width')
sepal_ax.set_ylabel('Sepal length')
sepal_ax.legend()
plt.show()

In [None]:
# Same per-class scatter as for the sepals, now petal width (x) against petal length (y).
# Reuses the `groups` groupby object created in the previous plotting cell.
petal_fig, petal_ax = plt.subplots()
for species, rows in groups:
    petal_ax.plot(
        rows["Petal width"],
        rows["Petal length"],
        marker="o",
        linestyle="",
        label=species,
    )
petal_ax.set_xlabel('Petal width')
petal_ax.set_ylabel('Petal length')
petal_ax.legend()
plt.show()

### Split Dataset into Training and Test

In [None]:
# Hold out 20% of the samples for testing. No random_state is given, so the
# split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    train_size=0.80,
)

In [None]:
# Sanity-check the split sizes: with train_size=0.80 the training arrays should hold 80% of the samples.
print(f'X_train: {X_train.shape}, y_train: {y_train.shape}')
print(f'X_test: {X_test.shape}, y_test: {y_test.shape}')

#### Standardization of Data

> Zero mean and Unit Variance

In [None]:
def scale_data(scaling):
    """Return the train/test feature matrices, optionally standardized.

    Reads the notebook-level ``X_train`` and ``X_test``. When ``scaling`` is
    truthy, a ``StandardScaler`` is fitted on ``X_train`` only and applied to
    both splits. Otherwise both splits are returned as-is (the very same
    objects, no copy).

    Args:
        scaling: Truthy to standardize, falsy to pass the data through.

    Returns:
        A ``(train, test)`` tuple of feature matrices.
    """
    if not scaling:
        print("------------------Not scaling the data------------------")
        return X_train, X_test

    print("------------------Scaling the data------------------")
    # Fit on the training split alone so no test-set statistics leak in.
    fitted_scaler = preprocessing.StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

# Baseline: keep the raw (unscaled) features for the first experiment.
Scaled_X_train, Scaled_X_test = scale_data(False)
# Index 0 so that the printed row really is the first training sample, as the label says.
print(f'\nThe first data sample Original Form:\n\n {Scaled_X_train[0]}')

### Training a Decision Tree Classifier on Original Data - Cross Validation

In [None]:
# 5-fold cross-validation of a default decision tree on Scaled_X_train
# (the unscaled data when the cells are run in order).
# The timer spans classifier construction, cross-validation and the accuracy print.
time_begin = time.time()
dtc = tree.DecisionTreeClassifier()
scores = cross_val_score(dtc, Scaled_X_train, y_train, cv=5)
# Report the mean fold score with a spread of two standard deviations.
print("Accuracy of Decision Tree: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))
time_end = time.time()
print(f"Time taken in seconds: {time_end - time_begin:.4f} s")

### Training a Decision Tree Classifier on Scaled Data (Standardized Data)

In [None]:
# Standardize both splits with a scaler fitted on the training split, then repeat the experiment.
Scaled_X_train, Scaled_X_test = scale_data(True)
# Index 0 so that the printed row really is the first training sample, as the label says.
print(f'\nThe first data sample of Original Normalized Data:\n {Scaled_X_train[0]}')

# 5-fold cross-validation of a fresh default decision tree on the standardized features.
# The timer spans classifier construction, cross-validation and the accuracy print.
time_begin = time.time()
dtc = tree.DecisionTreeClassifier()
scores = cross_val_score(dtc, Scaled_X_train, y_train, cv=5)
# Report the mean fold score with a spread of two standard deviations.
print("\nAccuracy of Decision Tree: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))
time_end = time.time()
print(f"Time taken in seconds: {time_end - time_begin:.4f} s")

#### Fitting the Decision Tree model on Scaled Data

In [None]:
# Fit the classifier on the whole training split held in Scaled_X_train
# (cross_val_score fits clones, so dtc itself is still unfitted at this point).
print("Training the model...")
time_begin = time.time()
dtc.fit(Scaled_X_train, y_train)
time_end = time.time()
print(f"Time taken to train in seconds: {time_end - time_begin:.4f} s")

### Evaluating test accuracy on trained model

In [None]:
# Predict on the held-out split. y_pred and y_true are not used within this cell;
# the reported number comes from dtc.score(), which predicts again internally.
y_pred = dtc.predict(Scaled_X_test)
y_true = y_test
print(f'The mean accuracy score for Decision Tree is: {dtc.score(Scaled_X_test, y_test):0.4f}')

### Visualizing the trained decision tree

In [None]:
# Large canvas so the node labels of the fitted tree stay legible.
fig = plt.figure(figsize=(25,20))
# iris.target_names is a NumPy array; pass a plain list because some scikit-learn
# releases validate class_names as a list and reject arrays.
# The result is bound to `_` so the notebook does not echo the returned value.
_ = tree.plot_tree(
    dtc,
    feature_names=iris.feature_names,
    class_names=iris.target_names.tolist(),
    filled=True,
)

## Self-Study Tasks

1. Watch Decision Tree videos of StatQuest
2. Look up and understand what Information Gain, Entropy and Gini Impurity are
3. Check out the documentation of all the Python packages, classes and methods used in this code
4. Study about Support Vector Machine, Naive Bayes and Random Forest Classifiers and apply them to this dataset.
5. Read about what class imbalance is and how to overcome it.
6. Read about Cross-Validation.
7. Watch StatQuest videos on Underfitting and Overfitting.
8. Read about F1_Score and ROC_AUC Score, and their importance in classification
9. Train Decision Tree on Online Shoppers Purchasing Intention Dataset
10. Read about PCA from Python Data Science Handbook - https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html
11. How to deal with categorical (non-numerical) data features? Read about Feature Encoding.

# Online Shoppers Purchasing Intent Dataset

**Rows / Datapoints**: 12330

**Columns / Features / Attributes + Label**: 17 + 1

**Classes**: 2

### Reading the dataset

In [None]:
# Load the dataset from a CSV file expected in the notebook's working directory.
osi_df = pd.read_csv('./online_shoppers_intention.csv') # df stands for dataframe
# shape is a (rows, columns) tuple
print(f'The rows and columns of our dataset are: {osi_df.shape}')

In [None]:
osi_df.head(10) # head(10) shows the first 10 rows of the dataset (the default without an argument is 5)

In [None]:
osi_df.describe() # describe() summarises the numeric columns by default (count, mean, std, min, quartiles, max)

### Visualizing the Dataset

In [None]:
# Output location for the saved scatter plots. SCATTER_PLOT_PATH stays a plain
# string because the plotting cells build file names with `+`.
RESULTS_PATH = "./results"
SCATTER_PLOT = "/scatter_plot/"
SCATTER_PLOT_PATH = RESULTS_PATH + SCATTER_PLOT
scatter_dir = Path(SCATTER_PLOT_PATH)
scatter_dir.mkdir(parents=True, exist_ok=True)  # no error if it already exists

# Per-row marker colour: green where Revenue == 1, red otherwise.
colors = np.where(osi_df["Revenue"] == 1, 'g', 'r')

# PageValues against ExitRates.
fig = osi_df.plot.scatter(
    x='ExitRates',
    y='PageValues',
    c=colors,
).get_figure()
fig.savefig(SCATTER_PLOT_PATH + 'PageValues-ExitRates.png', dpi=150)

In [None]:
# PageValues against BounceRates, coloured by the `colors` array built in the setup cell.
fig = osi_df.plot.scatter(
    x='BounceRates',
    y='PageValues',
    c=colors,
).get_figure()
fig.savefig(SCATTER_PLOT_PATH + 'PageValues-BounceRates.png', dpi=150)

In [None]:
# PageValues against SpecialDay, coloured by the `colors` array built in the setup cell.
fig = osi_df.plot.scatter(
    x='SpecialDay',
    y='PageValues',
    c=colors,
).get_figure()
fig.savefig(SCATTER_PLOT_PATH + 'PageValues-SpecialDay.png', dpi=150)

In [None]:
# PageValues against Month, coloured by the `colors` array built in the setup cell.
# NOTE(review): Month presumably holds month names (strings) - confirm that the
# installed pandas accepts a non-numeric x column for scatter plots.
fig = osi_df.plot.scatter(
    x='Month',
    y='PageValues',
    c=colors,
).get_figure()
fig.savefig(SCATTER_PLOT_PATH + 'PageValues-Month.png', dpi=150)

In [None]:
# Administrative against ProductRelated, coloured by the `colors` array built in the setup cell.
fig = osi_df.plot.scatter(
    x='ProductRelated',
    y='Administrative',
    c=colors,
).get_figure()
fig.savefig(SCATTER_PLOT_PATH + 'Administrative-ProductRelated.png', dpi=150)

In [None]:
# Informational against ProductRelated, coloured by the `colors` array built in the setup cell.
fig = osi_df.plot.scatter(
    x='ProductRelated',
    y='Informational',
    c=colors,
).get_figure()
fig.savefig(SCATTER_PLOT_PATH + 'Informational-ProductRelated.png', dpi=150)

In [None]:
# Administrative_Duration against ProductRelated_Duration, coloured by the `colors` array built in the setup cell.
fig = osi_df.plot.scatter(x='ProductRelated_Duration', y='Administrative_Duration', c=colors).get_figure()
# Spell out the .png extension like every other plot cell, so the output format
# does not depend on matplotlib's default savefig format.
fig.savefig(SCATTER_PLOT_PATH+'AdministrativeDuration-ProductRelatedDuration.png', dpi=150)

In [None]:
# Informational_Duration against ProductRelated_Duration, coloured by the `colors` array built in the setup cell.
fig = osi_df.plot.scatter(
    x='ProductRelated_Duration',
    y='Informational_Duration',
    c=colors,
).get_figure()
fig.savefig(SCATTER_PLOT_PATH + 'InformationalDuration-ProductRelatedDuration.png', dpi=150)

In [None]:
# Browser against OperatingSystems, coloured by the `colors` array built in the setup cell.
fig = osi_df.plot.scatter(
    x='OperatingSystems',
    y='Browser',
    c=colors,
).get_figure()
fig.savefig(SCATTER_PLOT_PATH + 'Browser-OperatingSystems.png', dpi=150)

In [None]:
# TrafficType against Region, coloured by the `colors` array built in the setup cell.
fig = osi_df.plot.scatter(
    x='Region',
    y='TrafficType',
    c=colors,
).get_figure()
fig.savefig(SCATTER_PLOT_PATH + 'TrafficType-Region.png', dpi=150)