## IMPORTING LIBRARIES

In [None]:
# local modules
import exploredata

# external libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
                  

##### Function to load the dataset

In [None]:
def load_user_cookies_data(filename):
    """Load the user-cookies dataset from a CSV file.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A pandas DataFrame holding the parsed contents of the file.

    Raises:
        FileNotFoundError: If ``filename`` does not point to an existing file.
    """
    # Fail loudly instead of handing back an error string: callers use
    # DataFrame methods on the result right away, so a returned string would
    # only surface later as an unrelated AttributeError.
    if not os.path.isfile(filename):
        raise FileNotFoundError(
            "Invalid file name, make sure the filename is correct and is in "
            f"the same package: {filename!r}"
        )
    return pd.read_csv(filename)

In [None]:
# Load the dataset from shopping.csv (the path is relative to the working directory).
UserCookiesData = load_user_cookies_data("shopping.csv")

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
UserCookiesData.info()

In [None]:
# Preview the first rows of the dataset.
UserCookiesData.head()

In [None]:
# Cast the Revenue label to integers
# (presumably boolean -> 0/1; verify against the CSV).
UserCookiesData['Revenue'] = UserCookiesData['Revenue'].astype(int)
# Show how many rows fall into each Revenue class.
UserCookiesData['Revenue'].value_counts()

## EXPLORATORY DATA ANALYSIS

* Here we will use the helper functions in the local `exploredata` module

In [None]:
# checking the Distribution of customers on Revenue

plt.rcParams['figure.figsize'] = (13, 5)

plt.subplot(1, 2, 1)
# Name the column through `x=` and pass the frame as `data=`: seaborn's first
# positional parameter is `data`, so a positionally passed Series is not
# interpreted as the variable to count.
sns.countplot(x='Revenue', data=UserCookiesData, palette='coolwarm_r')
plt.title('Distribution of customers on Revenue', fontsize=15)
plt.xlabel('Revenue or not', fontsize=15)
plt.ylabel('count', fontsize=15)
plt.show()

##### Plotting a pie chart for operating systems distribution

In [None]:
# Count the rows per OperatingSystems code, most frequent first.
UserCookiesData['OperatingSystems'].value_counts()

In [None]:
# plotting a pie chart for Operating Systems

plt.rcParams['figure.figsize'] = (18, 7)

# Derive the slices from the data instead of hard-coding counts copied from a
# previous value_counts() output: the three most frequent operating-system
# codes get their own slice and every remaining code is lumped into "others".
os_counts = UserCookiesData['OperatingSystems'].value_counts()
top_os = os_counts.iloc[:3]
size = top_os.tolist() + [int(os_counts.iloc[3:].sum())]
labels = [str(code) for code in top_os.index] + ["others"]
colors = ['violet', 'yellow', 'green', 'orange']

plt.subplot(1, 2, 2)
plt.pie(size, colors=colors, labels=labels, shadow=True, autopct='%.2f%%', startangle=90)
plt.title('Different Operating Systems', fontsize=30)
plt.axis('off')
plt.legend()
plt.show()

In [None]:
# Prepare the data and split it into train, validate, and test sets.
# NOTE(review): the preparation and split logic lives in
# exploredata.process_unencoded_data, which is not visible here.
train, validate, test = exploredata.process_unencoded_data(data = UserCookiesData )


In [None]:
# List the columns stored with the object dtype.
UserCookiesData.select_dtypes('object').columns

In [None]:

# Group the column names by dtype for the exploration helpers.
# Object-dtype columns are treated as the categorical variables.
categorical_vars = UserCookiesData.select_dtypes('object').columns
# Float columns are treated as the quantitative variables.
quantitative_vars = UserCookiesData.select_dtypes('float').columns
# Integer columns. NOTE(review): 'int' presumably selects only the platform's
# default integer width - confirm no other integer dtypes are present.
int_vars = UserCookiesData.select_dtypes('int').columns

##### Exploring univariate variables

In [None]:
# Run the local univariate helper over the object-dtype and float columns.
exploredata.univariate(UserCookiesData, categorical_vars, quantitative_vars)

##### Exploring Bivariate variables

In [None]:
# Explore Weekend against the Revenue target with the local bivariate helper.
# NOTE(review): categorical_vars receives a single column name here rather than
# a collection of names - confirm the helper accepts a bare string.
exploredata.bivariate_categorical(data=UserCookiesData, target="Revenue", categorical_vars ="Weekend" )

In [None]:
plt.figure(figsize=(15, 15))
# Correlate only numeric and boolean columns: since pandas 2.0, corr() no
# longer drops non-numeric columns silently and raises on object-dtype data.
sns.heatmap(
    UserCookiesData.select_dtypes(include=['number', 'bool']).corr(),
    cmap='coolwarm',
    center=0,
    annot=True,
)
plt.show()

In [None]:
# pairplot is a figure-level function that builds its own figure, so no
# plt.figure() call precedes it: that call only produced an empty extra figure
# and its figsize never applied to the grid.
sns.pairplot(data=UserCookiesData, hue="Revenue")
plt.show()

In [None]:

# Plot VisitorType against the Revenue target using the local helper.
exploredata.plot_cat_by_target(data =UserCookiesData, target_variable = "Revenue", categorical_var = "VisitorType")

## Statistical Analysis

#### Hypothesis Testing

##### Test Analysis for Categorical variables with label variable (Revenue)

In [None]:
# Significance level that the p-values of the hypothesis tests are compared against.
alpha = 0.05

  #### **Hypothesis 1**
  - Ho : Browser type is independent of the revenue (whether or not a purchase was made) of customers
  - Ha : Browser type is not independent of the revenue (whether or not a purchase was made) of customers


  Using that alpha = 0.05

In [None]:
# Histogram of the Browser codes, split by the Revenue label.
sns.histplot(data=UserCookiesData, x='Browser', hue="Revenue" , palette =["yellow", "blue"] )

In [None]:
# Run the local chi2 helper for Browser against Revenue.
test1 = exploredata.run_chi2(data= UserCookiesData, categorical_var = "Browser", target_variable="Revenue")
# The bare name displays the result in the notebook.
test1

In [None]:
# True when the p-value of test 1 is below alpha, i.e. the null hypothesis is rejected.
test1[0]['p-value']< alpha

* ##### *TakeAways from test 1*
    - Since the p-value is greater than alpha (a significance level of 0.05), we fail to reject the null hypothesis that Browser type is independent of a user's decision to make a purchase.
    - We find no evidence that a user's decision to make a purchase from an online shop is affected by the type of browser they use.
    - This will inform our later feature engineering and feature selection, to improve the effectiveness of models such as the Random Forest classifier.
    - We will not include Browser type in the features.

#### **Hypothesis 2**

  + Ho : VisitorType is independent of the purchase decision of the user
  + Ha : VisitorType is not independent of the purchase decision of the user

In [None]:
# Histogram of the VisitorType categories, split by the Revenue label.
sns.histplot(data=UserCookiesData, x="VisitorType", hue="Revenue", palette=["lightgreen", "yellow"])

In [None]:
# Run the local chi2 helper for VisitorType against Revenue.
test2 = exploredata.run_chi2(data=UserCookiesData, categorical_var = "VisitorType", target_variable = "Revenue")
# The bare name displays the result in the notebook.
test2

In [None]:
# True when the p-value of test 2 is below alpha, i.e. the null hypothesis is rejected.
test2[0]["p-value"] < alpha

* ##### *TakeAways from test 2*
    - Since the p-value is less than alpha (a significance level of 0.05), we reject the null hypothesis that VisitorType is independent of a user's decision to make a purchase.
    - We conclude that a user's decision to make a purchase from an online shop is not independent of the visitor type (returning user or new user).
    - This will inform our later feature engineering and feature selection, to improve the effectiveness of models such as the Random Forest classifier.
    - We will include VisitorType in the features for our Random Forest classifier training.

 #### **Hypothesis 3**
    * Ho: ProductRelated is independent of the purchase decision of the user
    * Ha: ProductRelated is not independent of the purchase decision of the user

In [None]:
plt.figure(figsize=[20,10])
# Histogram of ProductRelated split by the Revenue label. No `weights` is
# passed: it expects a per-row vector or a column name, and a constant scalar
# at best inflates every bar by the same factor, so the y axis would no longer
# show real row counts.
sns.histplot(data=UserCookiesData, x='ProductRelated', hue="Revenue", palette=["red", "green"])

In [None]:
# Run the local chi2 helper for ProductRelated against Revenue.
# NOTE(review): unlike the other tests, the result is neither displayed nor
# compared against alpha in the visible cells.
test3 = exploredata.run_chi2(data=UserCookiesData, categorical_var="ProductRelated", target_variable="Revenue")

* ##### TakeAways from test 3

  #### **Hypothesis 4**
  - Ho: Operating Systems is independent of a user's buying decision
  - Ha: Operating Systems is not independent of a user's buying decision

In [None]:
# Histogram of the OperatingSystems codes split by the Revenue label. No
# `weights` is passed: it expects a per-row vector or a column name, and a
# constant scalar at best inflates every bar by the same factor, so the y axis
# would no longer show real row counts.
sns.histplot(data=UserCookiesData, x="OperatingSystems", hue="Revenue", palette=["Black", "Yellow"])

In [None]:
# Run the local chi2 helper for OperatingSystems against Revenue.
test4 = exploredata.run_chi2(data=UserCookiesData, categorical_var = "OperatingSystems", target_variable = "Revenue")
# The bare name displays the result in the notebook.
test4

In [None]:
# True when the p-value of test 4 is below alpha, i.e. the null hypothesis is rejected.
test4[0]['p-value'] < alpha

* ##### *TakeAways from test 4*
    - Since the p-value is less than the significance level of 0.05, we reject the null hypothesis and infer that the operating system type is not independent of the customer's intention to make a purchase.

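 #### **Hypothesis 5**

  - Ho: Weekend is independent of a user's decision to make a purchase
  - Ha: Weekend is not independent of a user's decision to make a purchase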

In [None]:
# Run the local chi2 helper for Weekend against Revenue.
test5 = exploredata.run_chi2(data = UserCookiesData, categorical_var = "Weekend", target_variable = "Revenue")
# The bare name displays the result in the notebook.
test5

In [None]:
# True when the p-value of test 5 is below alpha, i.e. the null hypothesis is rejected.
test5[0]['p-value'] < alpha

* ##### *TakeAways from test 5*
    - Since the p-value is greater than alpha (a significance level of 0.05), we fail to reject the null hypothesis that Weekend is independent of a user's decision to make a purchase.
    - We find no evidence that a user's decision to make a purchase from an online shop is affected by whether the user visits the website on a weekend or not.
    - This tells us in advance that Weekend will not be used as a feature in our feature engineering and feature selection, which aim to improve the effectiveness of models such as the Random Forest classifier.
    - We will not include Weekend in the features.

#### **Hypothesis test 6**

  * Ho: The Region location of a customer is independent of the purchasing intent of the customer
  * Ha: The Region location of a customer is not independent of the purchasing intent of the customer

In [None]:

# Histogram of Region split by the Revenue label, for hypothesis test 6.
# `x` must name the variable to plot; the original call omitted it.
# NOTE(review): assumes the column is named "Region" - confirm against the CSV.
sns.histplot(data=UserCookiesData, x="Region", hue="Revenue")

## DATA PREPROCESSING

In [None]:
# TODO: implement convert_T_F_to_ints(data) (stub left unfinished)
  

## FEATURE ENGINEERING

## MODEL TRAINING AND TESTING

## MODEL EVALUATION