## IMPORTING LIBRARIES

In [None]:
# local modules
import exploredata

# external libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.ensemble import RandomForestClassifier
                  
import pickle as pc
import gradio as gr

##### Function to load the dataset

In [None]:
def load_user_cookies_data(filename):
    """Load the user-cookies dataset from a CSV file.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        pandas.DataFrame: The parsed contents of ``filename``.

    Raises:
        FileNotFoundError: If ``filename`` is not an existing file.
    """
    # Fail loudly here: handing back an error string would only surface later
    # as a confusing AttributeError when DataFrame methods are called on it.
    if not os.path.isfile(filename):
        raise FileNotFoundError(
            "Invalid file name, make sure the filename is correct and is in "
            f"the same package: {filename!r}"
        )
    return pd.read_csv(filename)

In [None]:
# Read the dataset from shopping.csv through the loader defined above.
UserCookiesData = load_user_cookies_data("shopping.csv")

In [None]:
# Summary of the loaded frame (pandas DataFrame.info).
UserCookiesData.info()

In [None]:
# Preview the first rows.
UserCookiesData.head()

In [None]:
# Cast the Revenue label to int, then show how many rows fall in each class.
UserCookiesData['Revenue'] = UserCookiesData['Revenue'].astype(int)
UserCookiesData['Revenue'].value_counts()

## EXPLORATORY DATA ANALYSIS

* Here we will use the helper functions in the local `exploredata` module.

In [None]:
# checking the Distribution of customers on Revenue

plt.rcParams['figure.figsize'] = (13, 5)

plt.subplot(1, 2, 1)
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x = UserCookiesData['Revenue'], palette = 'coolwarm_r')
plt.title('Distribution of customers on Revenue', fontsize = 15)
plt.xlabel('Revenue or not', fontsize = 15)
plt.ylabel('count', fontsize = 15)
plt.show()

##### Plotting a pie chart for operating systems distribution

In [None]:
# Number of rows per operating-system value.
UserCookiesData['OperatingSystems'].value_counts()

In [None]:
#Prepare and split into train, validate, and test sets.
# NOTE(review): train/validate/test are not referenced again in the visible
# part of this notebook — confirm they are still needed.
train, validate, test = exploredata.process_unencoded_data(data = UserCookiesData )


In [None]:
# Names of the object-dtype columns.
UserCookiesData.select_dtypes('object').columns

In [None]:

# Group the column names by dtype; categorical_vars and quantitative_vars are
# passed to exploredata.univariate further down.
categorical_vars = UserCookiesData.select_dtypes('object').columns
quantitative_vars = UserCookiesData.select_dtypes('float').columns
int_vars = UserCookiesData.select_dtypes('int').columns

##### Exploring univariate variables

In [None]:
UserCookiesData['OperatingSystems'].value_counts()

In [None]:
# This code cell is plotting a pie chart of the different operating systems.
size = [6601, 2585, 2555, 589]
colors = ['violet', 'yellow', 'green', 'orange']
labels = "2", "1", "3", "others"
plt.rcParams['figure.figsize'] = (18, 7)
plt.subplot(1, 2, 2)
plt.pie(size, colors = colors, labels = labels, shadow = True, autopct = '%.2f%%', startangle=90)
plt.title('Different Operating Systems', fontsize = 30)
plt.axis('off')
plt.legend()
plt.show()

In [None]:
# This code cell is plotting a pie chart for the visitor types
# Slice sizes are taken from the data instead of hand-copied counts
# (previously [10551, 1694, 85]).
# NOTE(review): the labels assume that the VisitorType categories, ordered by
# descending count, are returning / new / other — the same order the
# hard-coded sizes used. Confirm against VisitorType.value_counts().
visitor_counts = UserCookiesData['VisitorType'].value_counts()
size = visitor_counts.tolist()
explode = [0, 0, 0.1]
labels = "Returning Visitor", "New Visitor", "Others"
colors = ['blue', 'lightblue', 'orange']
plt.rcParams['figure.figsize'] = (18, 7)
plt.subplot(1, 2, 1)
plt.pie(size, colors = colors, labels = labels, explode = explode, shadow = True, autopct = '%.2f%%')
plt.title('Different Visitors', fontsize = 30)
plt.axis('off')
plt.legend()
plt.show()

In [None]:
# Run the exploredata univariate helper on the object-dtype and float-dtype
# column groups selected above.
exploredata.univariate(UserCookiesData, categorical_vars, quantitative_vars)

##### Observations from univariate exploration

- Different user types with reference to region are not normally (Gaussian) distributed. This regional data has an exponential distribution, so we must take this type of distribution into account.
- The traffic types are not normally (Gaussian) distributed. This data has an exponential distribution.
- More than 85% of visitors are returning visitors, which is a very large share. This information can be useful for marketing purposes.
- 90% of people only used the top 3 browsers.
- 95% of the users in this session cookies data use the top 3 operating systems. The online shop will therefore need to concentrate on these browsers and operating systems when it embarks on specific operations to increase customer purchases.
- The distribution of Weekend and Revenue statistics is highly unbalanced.

##### Exploring Bivariate Analysis

In [None]:
# Show one row as a reminder of the available column names.
UserCookiesData.head(1)

In [None]:
# Revenue against the Weekend flag.
exploredata.bivariate_categorical(data=UserCookiesData, target="Revenue", categorical_variable ="Weekend" )

In [None]:
# Revenue against the operating system.
exploredata.bivariate_categorical(data = UserCookiesData, target= 'Revenue', categorical_variable = 'OperatingSystems')

In [None]:
# Revenue against PageValues.
exploredata.bivariate_quant(data = UserCookiesData, target = 'Revenue', quantitative_var = 'PageValues')

In [None]:
# Revenue against ExitRates.
exploredata.bivariate_quant(data = UserCookiesData, target = 'Revenue', quantitative_var = 'ExitRates')

In [None]:
# Revenue against BounceRates.
exploredata.bivariate_quant(data = UserCookiesData, target="Revenue", quantitative_var="BounceRates")

In [None]:
plt.figure(figsize=(15, 15))
# Correlate only numeric/boolean columns. The frame also holds object-dtype
# columns (see categorical_vars), and DataFrame.corr() raises on those in
# pandas >= 2.0, where numeric_only no longer defaults to dropping them.
numeric_data = UserCookiesData.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_data.corr(), cmap='coolwarm', center=0, annot=True)
plt.show()

In [None]:
# sns.pairplot is a figure-level function that builds its own figure, so a
# preceding plt.figure() call would only leave an empty figure behind.
sns.pairplot(UserCookiesData,x_vars=['BounceRates','ExitRates'],y_vars=['BounceRates','ExitRates'],hue='Revenue',diag_kind='kde')
plt.show()

In [None]:

# VisitorType against the Revenue target, via the exploredata helper.
exploredata.plot_cat_by_target(data =UserCookiesData, target_variable = "Revenue", categorical_var = "VisitorType")

##### *Brief observations from bivariate analysis*

## Statistical Analysis

#### Hypothesis Testing

##### Test Analysis for Categorical variables with label variable (Revenue)

In [None]:
# Significance level that each test's p-value is compared against below.
alpha = 0.05

  #### **Hypothesis 1**
  - Ho : Browser type is independent of the revenue (whether a purchase is made or not) of customers
  - Ha : Browser type is not independent of the revenue (whether a purchase is made or not) of customers


  Using that alpha = 0.05

In [None]:
# Browser counts, coloured by Revenue class.
sns.histplot(data=UserCookiesData, x='Browser', hue="Revenue" , palette =["yellow", "blue"] )

In [None]:
# Run the exploredata chi2 helper for Browser against Revenue and show the result.
test1 = exploredata.run_chi2(data= UserCookiesData, categorical_var = "Browser", target_variable="Revenue")
test1

In [None]:
# True when the p-value is below alpha, i.e. the null hypothesis is rejected.
test1[0]['p-value']< alpha

* ##### *TakeAways from test 1*
    - Since the p-value is greater than alpha (a significance value of 0.05), we fail to reject the null hypothesis that Browser type is independent of a user's decision to make a purchase.
    - We conclude that a user's decision to make a purchase from an online shop is not affected by the type of Browser they use.
    - This will help us in our next step, feature engineering and feature selection, to improve the effectiveness of some models like the Random Forest classifier.
    - We will not include Browser type in the features.

#### **Hypothesis 2**

  + Ho : VisitorType is independent of the purchase decision of the user
  + Ha : VisitorType is not independent of the purchase decision of the user

In [None]:
# VisitorType counts, coloured by Revenue class.
sns.histplot(data=UserCookiesData, x="VisitorType", hue="Revenue", palette=["lightgreen", "yellow"])

In [None]:
# Run the exploredata chi2 helper for VisitorType against Revenue and show the result.
test2 = exploredata.run_chi2(data=UserCookiesData, categorical_var = "VisitorType", target_variable = "Revenue")
test2

In [None]:
# True when the p-value is below alpha, i.e. the null hypothesis is rejected.
test2[0]["p-value"] < alpha

* ##### *TakeAways from test 2*
    - Since the p-value is less than alpha (a significance value of 0.05), we reject the null hypothesis that Visitor type is independent of a users decision to make purchase.
    - We conclude that, a user decision to either make purchase from an online shop is not independent of the Visitors Type (Either returning user or new user)
    - This will help us in our next step, feature engineering and feature selection, to improve the effectiveness of some models like the Random Forest classifier.
    - We will include VisitorType in the features for our Random Forest classifier training.

 #### **Hypothesis 3**
    * Ho: ProductRelated is independent of the purchase decision of the user
    * Ha: ProductRelated is not independent of the purchase decision of the user

In [None]:
plt.figure(figsize=[20,10])
sns.histplot(data = UserCookiesData,weights=3, x='ProductRelated', hue="Revenue", palette=["red", "green"])

In [None]:
test3 = exploredata.run_chi2(data=UserCookiesData, categorical_var="ProductRelated", target_variable="Revenue")

* ##### TakeAways from test 3

  #### **Hypothesis 4**
  - Ho: Operating Systems is independent of a user's buying decision
  - Ha: Operating Systems is not independent of a user's buying decision

In [None]:
sns.histplot(data = UserCookiesData, weights=20, x = "OperatingSystems", hue = "Revenue", palette=["Black", "Yellow"])

In [None]:
test4 = exploredata.run_chi2(data=UserCookiesData, categorical_var = "OperatingSystems", target_variable = "Revenue")
test4

In [None]:
test4[0]['p-value'] < alpha

* ##### *TakeAways from test 4*
    - Since the p-value is less than the significance value of 0.05, we reject the null hypothesis and infer that the Operating Systems type is not independent of the customers intention to make a purchase

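#### **Hypothesis 5**
  - Ho: Weekend is independent of a user's decision to make a purchase
  - Ha: Weekend is not independent of a user's decision to make a purchase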

In [None]:
# Run the exploredata chi2 helper for Weekend against Revenue and show the result.
test5 = exploredata.run_chi2(data = UserCookiesData, categorical_var = "Weekend", target_variable = "Revenue")
test5

In [None]:
# True when the p-value is below alpha, i.e. the null hypothesis is rejected.
test5[0]['p-value'] < alpha

* ##### *TakeAways from test 5*
    - Since the p-value is greater than alpha (a significance value of 0.05), we failed to reject the null hypothesis that Weekend is independent of a users decision to make purchase.
    - We conclude that a user's decision to make a purchase from an online shop is not affected by whether the user visits the website on a weekend or not.
    - This tells us in advance that Weekend will not be used as a feature in our feature engineering and feature selection, which aim to improve the effectiveness of some models like the Random Forest classifier.
    - We will not include Weekend in the features.

#### **Hypothesis test 6**

  * Ho: The Region location of a customer is independent of the purchasing intent of the customer
  * Ha: The Region location of a customer is not independent of the purchasing intent of the customer

In [None]:

sns.histplot(data=UserCookiesData, x = "Region", hue="Revenue", bins=10, weights=30,palette="coolwarm" )

In [None]:
test6 = exploredata.run_chi2(data=UserCookiesData, categorical_var="Region", target_variable = "Revenue")
print(test6)

In [None]:
test6[0]['p-value'] < alpha

* ##### *TakeAways from test 6*
   - Since the p-value is greater than the significance value of 0.05, we fail to reject the null hypothesis and conclude that the purchasing intent of a customer is independent of the Region location of the customer.

##### **Hypothesis Test Analysis for Quantitative variables with label variable (Revenue)**

In [None]:
# Quantitative columns to be tested against the Revenue label.
quant_vars = ['BounceRates', 'ExitRates', 'PageValues', 'Administrative_Duration','Informational_Duration','ProductRelated_Duration']

In [None]:
# Run the exploredata two_t_test helper on the listed columns with Revenue as target.
exploredata.two_t_test(data = UserCookiesData, quantitative_vars=quant_vars, target_variable = 'Revenue')

* ##### ***TakeAways from the Levene, Mann-Whitney U and Shapiro-Wilk tests for the numerical variables***

## DATA PREPROCESSING

##### *Encoding both weekend and the label variable (Revenue) into numeric for modelling*

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# One-hot encode the frame with pandas.get_dummies.
# NOTE(review): the takeaways above say Browser and Weekend will be left out of
# the features, but no column other than Revenue is dropped before modelling —
# confirm the intended feature set.
UserCookiesEncoded = pd.get_dummies(UserCookiesData)

In [None]:
# Inspect the columns produced by the encoding.
UserCookiesEncoded.info()

In [None]:
# A single LabelEncoder instance, re-fitted for each column below.
encoder = LabelEncoder()

In [None]:
# Replace Weekend with its integer label encoding.
UserCookiesEncoded["Weekend"] = encoder.fit_transform(UserCookiesEncoded["Weekend"])

In [None]:
# Replace Revenue with its integer label encoding.
# NOTE(review): Revenue was already cast to int after loading, so this step
# presumably leaves the values unchanged — verify.
UserCookiesEncoded["Revenue"] = encoder.fit_transform(UserCookiesEncoded["Revenue"])

In [None]:
# Class balance of the encoded label.
UserCookiesEncoded["Revenue"].value_counts()

##### *Data segmentation into training and testing data sets*

In [None]:
y_label = UserCookiesEncoded["Revenue"]
x_label = UserCookiesEncoded.drop(["Revenue"],axis=1)

In [None]:

x_train, x_test, y_train, y_test = train_test_split(x_label, y_label, test_size =  0.2, random_state = 0)

## MODEL TRAINING AND TESTING

* ##### ***Random Forest Modelling***

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
def compare_random_classifier_models(x_train, y_train, x_test, y_test,
                                     max_depths=range(2, 20),
                                     min_samples_leaves=range(1, 23)):
    """Grid-evaluate random forests over max_depth and min_samples_leaf.

    One ``RandomForestClassifier`` (50 trees, ``random_state=126``) is fitted
    per combination of the two hyperparameters and scored on both splits.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        max_depths: Iterable of ``max_depth`` values to try. Defaults to the
            original grid, 2 through 19.
        min_samples_leaves: Iterable of ``min_samples_leaf`` values to try.
            Defaults to the original grid, 1 through 22.

    Returns:
        pandas.DataFrame: One row per fitted model holding the two
        hyperparameters, the four confusion-matrix cells, precision, recall,
        specificity, and training/test accuracy.
    """
    models_cont = []
    for num in max_depths:
        for val in min_samples_leaves:
            classifier = RandomForestClassifier(n_estimators=50, random_state = 126, max_depth = num, min_samples_leaf = val)
            classifier.fit(x_train, y_train)
            train_score = classifier.score(x_train, y_train)
            test_score = classifier.score(x_test, y_test)
            predictions = classifier.predict(x_test)

            # Build the confusion matrix once per model instead of once per
            # cell; rows index the true label, columns the predicted label.
            matrix = confusion_matrix(y_test, predictions)
            tp = matrix[1][1]
            fp = matrix[0][1]
            tn = matrix[0][0]
            fn = matrix[1][0]

            # NOTE(review): tp + fp is zero when a model predicts no positive
            # at all; confirm how that division should be reported.
            eval_params = {
                'max_depth':num,
                'min_samples_leaf': val,
                'True Positves': tp,
                'False Positives': fp,
                'True Negatives': tn,
                'False Negatvies': fn,
                'Precision': tp / (tp + fp),
                'Recall': tp / (tp + fn),
                'Specificity': round(tn / (tn + fp),2),
                'Training Accuracy': round(train_score, 2),
                'Test Accuracy': round(test_score,2)
            }
            models_cont.append(eval_params)
    return pd.DataFrame(models_cont)


In [None]:
# Display the comparison table for every hyperparameter combination.
compare_random_classifier_models(x_train, y_train, x_test, y_test)

In [None]:
def test_best_random_forest_classifier(x_train, y_train, x_test, y_test,
                                       max_depth=395, min_samples_leaf=19):
    """Fit the selected random forest and report its evaluation metrics.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        max_depth: ``max_depth`` passed to the classifier. Defaults to the
            original value, 395.
        min_samples_leaf: ``min_samples_leaf`` passed to the classifier.
            Defaults to the original value, 19.

    Returns:
        tuple: ``(classifier, test_df)`` — the fitted
        ``RandomForestClassifier`` and a one-row ``pandas.DataFrame`` holding
        the hyperparameters, confusion-matrix cells, precision, recall,
        specificity, and training/test accuracy.
    """
    # NOTE(review): the default max_depth of 395 lies outside the 2-19 range
    # explored by compare_random_classifier_models — confirm it is intended.
    classifier = RandomForestClassifier(random_state = 123, max_depth = max_depth, min_samples_leaf = min_samples_leaf)
    classifier.fit(x_train, y_train)
    train_score = classifier.score(x_train, y_train)
    test_score = classifier.score(x_test, y_test)
    predictions = classifier.predict(x_test)

    # Build the confusion matrix once; rows index the true label, columns the
    # predicted label.
    matrix = confusion_matrix(y_test, predictions)
    tp = matrix[1][1]
    fp = matrix[0][1]
    tn = matrix[0][0]
    fn = matrix[1][0]

    # The reported hyperparameters are the ones actually given to the
    # classifier, so the table cannot disagree with the fitted model.
    eval_params = {
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
        'True Positves': tp,
        'False Positives': fp,
        'True Negatives': tn,
        'False Negatvies': fn,
        'Precision': tp / (tp + fp),
        'Recall': tp / (tp + fn),
        'Specificity': round(tn / (tn + fp),2),
        'Training Accuracy': round(train_score, 2),
        'Test Accuracy': round(test_score,2)
    }
    test_df = pd.DataFrame([eval_params])
    return classifier, test_df

In [None]:
# Fit the chosen model and display only its metrics table (element 1 of the returned tuple).
test_best_random_forest_classifier(x_train, y_train,x_test, y_test)[1]

### Feature Engineering

* ##### ***KNN Modelling***

## MODEL EVALUATION

### DEPLOYMENT OF BEST MODEL WITH GRADIO