## IMPORTING LIBRARIES

In [None]:
# local modules
import exploredata


# external libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

# scikit-learn modelling algorithms
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier    

# deployment libraries
import pickle as pc


import warnings
warnings.filterwarnings('ignore')

##### Function to load the dataset

In [None]:
def load_user_cookies_data(filename):
    """
    Load the user-cookies dataset from a CSV file into a DataFrame.

    :param filename: path of the CSV file to read.
    :return: the DataFrame produced by ``pd.read_csv``.
    :raises FileNotFoundError: if ``filename`` is not an existing file.
    """
    # Fail loudly here: returning the message as a string only postponed the
    # failure to the first DataFrame method called on the result.
    if not os.path.isfile(filename):
        raise FileNotFoundError(
            "Invalid file name, make sure the filename is correct and is in the same package"
        )
    return pd.read_csv(filename)

In [None]:
UserCookiesData = load_user_cookies_data("shopping.csv")

In [None]:
UserCookiesData.info()

In [None]:
UserCookiesData.head()

In [None]:
UserCookiesData['Revenue'] = UserCookiesData['Revenue'].astype(int)
UserCookiesData['Revenue'].value_counts()

##### Helper functions for exploratory data analysis

In [None]:
def binary_to_ints(value):
    """
    Convert a boolean-like flag to an integer.
    :param value: the value to convert
    :return: 1 when ``value`` compares equal to True, otherwise 0
    """
    # Equality (not identity/truthiness) is kept on purpose so that 1 and 1.0
    # map to 1 while other truthy values such as 2 or "yes" map to 0.
    return 1 if value == True else 0  # noqa: E712

In [None]:
def train_validate_test_split(data, target, seed = 126, test_size = 0.20, validate_size = 0.30):
    """
    It splits the data into train, validate and test sets, stratified on the target column.
    :param data: the dataframe to split
    :param target: name of the column used for stratification
    :param seed: random_state passed to both splits so the result is reproducible
    :param test_size: fraction of ``data`` held out as the test set
    :param validate_size: fraction of the remaining (non-test) rows held out as the validate set
    :return: three dataframes: train, validate, and test.
    """
    # First carve out the test set, then split what is left into train/validate.
    train_validate, test = train_test_split(
        data, test_size=test_size, random_state=seed, stratify=data[target]
    )
    train, validate = train_test_split(
        train_validate, test_size=validate_size, random_state=seed, stratify=train_validate[target]
    )
    return train, validate, test


In [None]:
def process_unencoded_data(data):
    """
    It takes in a dataframe, drops duplicate rows, strips surrounding whitespace from every
    object-dtype column, and splits the result on the 'Revenue' column.
    Note: ``data`` itself is modified in place by both cleaning steps.
    :param data: the dataframe to clean and split
    :return: three dataframes: train, validate, and test.
    """
    data.drop_duplicates(inplace=True)

    # Only object (string-like) columns can carry stray whitespace.
    for name in data.select_dtypes('object').columns:
        data[name] = data[name].str.strip()

    return train_validate_test_split(data, 'Revenue')


univariate data exploratory analysis helper functions

In [None]:

def freq_table(train, cat_var):
    """
    It takes a dataframe and a categorical variable as input, and returns a frequency table as output
    :param train: the dataframe holding the variable
    :param cat_var: name of the categorical column to tabulate
    :return: A dataframe with the unique values of the categorical variable, the count of each unique
    value, and the percentage of each unique value. Rows are ordered by descending count and indexed
    by the category value; missing values are not counted.
    """
    counts = train[cat_var].value_counts(normalize=False)
    percents = (counts / counts.sum() * 100).round(2)
    # Take the labels from the counts themselves: labels from unique() follow
    # order of appearance, not frequency, and so would be paired with the
    # wrong counts (and would differ in length when NaN is present).
    return pd.DataFrame(
        {cat_var: list(counts.index),
         'Count': counts,
         'Percent': percents,
         })




In [None]:
def univariate_quant(data,quantitative_variables):
    """
    It draws a histogram and a box plot side by side for one quantitative variable and computes
    its descriptive statistics.
    :param data: the dataframe
    :param quantitative_variables: the column to describe and plot
    :return: the object returned by the last plt.title call, and the output of describe()
    """
    values = data[quantitative_variables]
    summary = values.describe()

    plt.figure(figsize=(8,2))

    # left panel: histogram
    plt.subplot(1, 2, 1)
    plt.hist(values, color='yellow')
    plt.title(quantitative_variables)

    # right panel: box plot
    plt.subplot(1, 2, 2)
    plt.boxplot(values)
    last_title = plt.title(quantitative_variables)

    return last_title, summary


In [None]:
def univariate_categorical(data, categorical_vars):
    """
    It shows a bar chart of how often each category of a categorical variable occurs, then prints
    the underlying frequency table.
    :param data: the dataframe
    :param categorical_vars: The categorical variable you want to plot
    """
    table = freq_table(data, categorical_vars)

    plt.figure(figsize=(6,3))
    sns.barplot(data=table, x=categorical_vars, y='Count', color='lightblue')
    plt.title(categorical_vars)
    plt.xticks(rotation = 90)
    plt.show()

    print(table)

In [None]:
def univariate(data, categorical_vars, quantitative_vars):
    """
    This function takes in a dataframe, a list of categorical variables, and a list of quantitative
    variables. It then calls the univariate_categorical function for each categorical variable and the
    univariate_quant function for each quantitative variable.
    :param data: the dataframe
    :param categorical_vars: iterable of categorical column names
    :param quantitative_vars: iterable of quantitative column names
    """
    for var in categorical_vars:
        univariate_categorical(data, var)

    for column in quantitative_vars:
        # univariate_quant already sizes and draws the figure; only the
        # statistics are needed here.
        _, descriptive_statistics = univariate_quant(data, column)
        # plt.gca() takes no figsize and plt.show() takes no figure argument,
        # so both invalid calls were replaced by a plain show().
        plt.show()
        print(descriptive_statistics)

bivariate data analysis helper functions

In [None]:
def plot_cat_by_target(data, target_variable, categorical_var):
    """
    It takes a dataframe, a target variable, and a categorical variable, and plots the mean of the
    target variable for each category of the categorical variable, with a dashed line at the
    overall mean of the target.
    :param data: the dataframe
    :param target_variable: name of the target column (plotted on the y axis)
    :param categorical_var: name of the categorical column (plotted on the x axis)
    :return: the object returned by plt.axhline for the overall-mean line
    """
    plt.figure(figsize=(10,2))
    # x and y must be passed by keyword: recent seaborn releases no longer
    # accept them positionally.
    sns.barplot(x=categorical_var, y=target_variable, data=data, alpha=.8, color='lightseagreen')
    overall_rate = data[target_variable].mean()
    return plt.axhline(overall_rate, ls='--', color='gray')

In [None]:
def compare_means(data, target_variable, quantitative_vars, alt_hyp='two-sided'):
    """
    It runs a Mann-Whitney U test comparing a quantitative variable between the rows where the
    target equals 0 and the rows where the target equals 1.
    :param data: the dataframe
    :param target_variable: name of the 0/1 target column
    :param quantitative_vars: the column (or columns) to compare
    :param alt_hyp: the alternative hypothesis passed to scipy
    :return: the result of stats.mannwhitneyu
    """
    target = data[target_variable]
    group_zero = data.loc[target == 0, quantitative_vars]
    group_one = data.loc[target == 1, quantitative_vars]
    return stats.mannwhitneyu(
        group_zero,
        group_one,
        use_continuity=True,
        alternative=alt_hyp,
    )

In [None]:
def plot_boxen(data, target_variable, quantitative_var):
    """
    It draws a boxen plot of the quantitative variable for each value of the target variable and
    adds a dashed line at the overall mean of the quantitative variable.
    :return: the object returned by plt.axhline
    """
    overall_mean = data[quantitative_var].mean()

    sns.boxenplot(x=target_variable, y=quantitative_var, data=data, color='orange')
    plt.title(quantitative_var)

    return plt.axhline(overall_mean, ls='--', color='black')


In [None]:
def plot_swarm(data, target_variable, quantitative_var):
    """
    It draws a swarm plot of the quantitative variable for each value of the target variable and
    adds a dashed line at the overall mean of the quantitative variable.
    :return: the object returned by plt.axhline
    """
    overall_mean = data[quantitative_var].mean()

    sns.swarmplot(x=target_variable, y=quantitative_var, data=data, color='lightgray')
    plt.title(quantitative_var)

    return plt.axhline(overall_mean, ls='--', color='black')

In [None]:
def bivariate_quant(data, target, quantitative_var):
    """
    It prints the name of a quantitative variable, shows a boxen plot of it grouped by the target
    variable, and then prints its descriptive statistics per target group.
    :param data: the dataframe
    :param target: name of the target column used for grouping
    :param quantitative_var: name of the quantitative column
    """
    print(quantitative_var, "\n____________________\n")

    per_group_stats = data.groupby(target)[quantitative_var].describe()

    plt.figure(figsize=(4,4))
    plot_boxen(data, target, quantitative_var)
    plt.show()

    print(per_group_stats, "\n")


In [None]:
def bivariate_categorical(data, target, categorical_variable):
    """
    It takes a dataframe, a target variable, and a categorical variable, prints a crosstab (with
    margins) of the two variables and shows a bar chart of the target mean per category.
    :param data: the dataframe
    :param target: name of the target column
    :param categorical_variable: name of the categorical column
    """
    ct = pd.crosstab(data[categorical_variable], data[target], margins=True)
    plot_cat_by_target(data, target, categorical_variable)
    print("\nobserved:\n", ct)
    # plt.show() renders the current figure; it does not accept a plot object
    # as a positional argument.
    plt.show()

## EXPLORATORY DATA ANALYSIS

* Here we will use the helper functions in the local `exploredata` module

In [None]:
# checking the Distribution of customers on Revenue

plt.rcParams['figure.figsize'] = (13, 5)
plt.subplot(1, 2, 1)
# Name the column via x= and pass the frame via data=: recent seaborn releases
# treat the first positional argument as `data`, not as the x variable.
sns.countplot(x='Revenue', data=UserCookiesData, palette = 'coolwarm_r')
plt.title('Distribution of customers on Revenue', fontsize = 15)
plt.xlabel('Revenue or not', fontsize = 15)
plt.ylabel('count', fontsize = 15)
plt.show()

##### Plotting a pie chart for operating systems distribution

In [None]:
UserCookiesData['OperatingSystems'].value_counts()

In [None]:
#Prepare and split into train, validate, and test sets.
train, validate, test = exploredata.process_unencoded_data(data = UserCookiesData )


In [None]:
UserCookiesData.select_dtypes('object').columns

In [None]:

categorical_vars = UserCookiesData.select_dtypes('object').columns
quantitative_vars = UserCookiesData.select_dtypes('float').columns
int_vars = UserCookiesData.select_dtypes('int').columns

##### Exploring univariate variables

In [None]:
UserCookiesData['OperatingSystems'].value_counts()

In [None]:
# This code cell is plotting a pie chart of the different operating systems.
# Slice sizes are computed from the data (three most frequent systems plus
# everything else) instead of being hard-coded, so the chart stays correct
# when rows are added or removed (e.g. after dropping duplicates).
os_counts = UserCookiesData['OperatingSystems'].value_counts()
size = os_counts.iloc[:3].tolist() + [int(os_counts.iloc[3:].sum())]
colors = ['violet', 'yellow', 'green', 'orange']
labels = [str(code) for code in os_counts.index[:3]] + ["others"]
plt.rcParams['figure.figsize'] = (18, 7)
plt.subplot(1, 2, 2)
plt.pie(size, colors = colors, labels = labels, shadow = True, autopct = '%.2f%%', startangle=90)
plt.title('Different Operating Systems', fontsize = 30)
plt.axis('off')
plt.legend()
plt.show()

In [None]:
# This code cell is plotting a pie chart for the visitor types
# Slice sizes are read from the data rather than hard-coded; a category that
# is absent from the data contributes a zero-sized slice.
visitor_counts = UserCookiesData['VisitorType'].value_counts()
size = [int(visitor_counts.get(kind, 0)) for kind in ("Returning_Visitor", "New_Visitor", "Other")]
explode = [0, 0, 0.1]
labels = "Returning Visitor", "New Visitor", "Others"
colors = ['blue', 'lightblue', 'orange']
plt.rcParams['figure.figsize'] = (18, 7)
plt.subplot(1, 2, 1)
plt.pie(size, colors = colors, labels = labels, explode = explode, shadow = True, autopct = '%.2f%%')
plt.title('Different Visitors', fontsize = 30)
plt.axis('off')
plt.legend()
plt.show()

In [None]:
exploredata.univariate(UserCookiesData, categorical_vars, quantitative_vars)

##### Observations from univariate exploration

- Different user types with reference to region are not normally (Gaussian) distributed. This regional data has an exponential distribution. Therefore, we must be concerned with this type distribution.
- Multiple types of traffic are not normally(Gaussian) distributed. This data has an exponential distribution.
- More than 85% of visitors are repeat customers, which is enormous. For marketing purposes, this information can be useful.
- 90% of people only used the top 3 browsers.
- 95% of the users in this session cookies data use the top 3 Operating Systems. The online shop will therefore need to concentrate on these browsers and operating systems when it embarks on specific operations to increase customer purchases.
- The distribution of Weekend and Revenue statistics is highly unbalanced.

##### Exploring Bivariate Analysis

In [None]:
UserCookiesData.head(1)

In [None]:
exploredata.bivariate_categorical(data=UserCookiesData, target="Revenue", categorical_variable ="Weekend" )

In [None]:
exploredata.bivariate_categorical(data = UserCookiesData, target= 'Revenue', categorical_variable = 'OperatingSystems')

In [None]:
exploredata.bivariate_quant(data = UserCookiesData, target = 'Revenue', quantitative_var = 'PageValues')

In [None]:
exploredata.bivariate_quant(data = UserCookiesData, target = 'Revenue', quantitative_var = 'ExitRates')

In [None]:
exploredata.bivariate_quant(data = UserCookiesData, target="Revenue", quantitative_var="BounceRates")

In [None]:
plt.figure(figsize=(12, 12))
# Correlation is only defined for numeric/boolean columns; recent pandas
# raises on object columns (e.g. Month, VisitorType) instead of silently
# dropping them, so select the usable columns explicitly.
numeric_data = UserCookiesData.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_data.corr(), cmap='coolwarm', center=0, annot=True)
plt.show()

In [None]:
# pairplot is a figure-level function that creates its own figure, so a
# preceding plt.figure() would only leave an extra empty figure behind.
sns.pairplot(UserCookiesData,x_vars=['BounceRates','ExitRates'],y_vars=['BounceRates','ExitRates'],hue='Revenue',diag_kind='kde')
plt.show()

In [None]:

exploredata.plot_cat_by_target(data =UserCookiesData, target_variable = "Revenue", categorical_var = "VisitorType")

##### *Brief observations from bivariate analysis*

## Statistical Analysis

#### Hypothesis Testing

##### Test Analysis for Categorical variables with label variable (Revenue)

In [None]:
alpha = 0.05

  #### **Hypothesis 1**
  - Ho : Browser type is independent of the revenue(either purchased made or not) of customers 
  - Ha : Browser type is not independent of the revenue(either purchased made or not) of customers


  Using that alpha = 0.05

In [None]:
sns.histplot(data=UserCookiesData, x='Browser', hue="Revenue" , palette =["yellow", "blue"] )

In [None]:
test1 = exploredata.run_chi2(data= UserCookiesData, categorical_var = "Browser", target_variable="Revenue")
test1

In [None]:
test1[0]['p-value']< alpha

* ##### *TakeAways from test 1*
    - Since the p-value is greater than alpha (a significance value of 0.05), we failed to reject the null hypothesis that Browser type is independent of a users decision to make purchase.
    - We conclude that a user's decision to make a purchase from an online shop is not affected by the type of browser they use
    - This will help us in our next method about feature engineering and feature selection to improve the effectives of some models like the Random forest classifier
    - We will not include Browser type in the features.

#### **Hypothesis 2**

  + Ho : VisitorType is independent of the purchase decision of the user
  + Ha : VisitorType is not independent of the purchase decision of the user

In [None]:
sns.histplot(data=UserCookiesData, x="VisitorType", hue="Revenue", palette=["lightgreen", "yellow"])

In [None]:
test2 = exploredata.run_chi2(data=UserCookiesData, categorical_var = "VisitorType", target_variable = "Revenue")
test2

In [None]:
test2[0]["p-value"] < alpha

* ##### *TakeAways from test 2*
    - Since the p-value is less than alpha (a significance value of 0.05), we reject the null hypothesis that Visitor type is independent of a users decision to make purchase.
    - We conclude that, a user decision to either make purchase from an online shop is not independent of the Visitors Type (Either returning user or new user)
    - This will help us in our next method about feature engineering and feature selection to improve the effectives of some models like the Random forest classifier
    - We will include VisitorType in the features for our Random Forest classifer training

 #### **Hypothesis 3**
    * Ho: ProductRelated is independent of the purchase decision of the user
    * Ha: ProductRelated is not independent of the purchase decision of the user

In [None]:
plt.figure(figsize=[20,10])
# No `weights`: a scalar weight is applied to every observation, which
# multiplies all bar heights so the axis no longer shows real counts.
sns.histplot(data = UserCookiesData, x='ProductRelated', hue="Revenue", palette=["red", "green"])

In [None]:
test3 = exploredata.run_chi2(data=UserCookiesData, categorical_var="ProductRelated", target_variable="Revenue")

In [None]:
test3[0]['p-value'] < alpha

* ##### TakeAways from test 3

  #### **Hypothesis 4**
  - Ho:Operating Systems is independent of a users buying decision
  - Ha: Operating Systems is not independent of a users buying decision

In [None]:
# No `weights`: a scalar weight is applied to every observation, which
# multiplies all bar heights so the axis no longer shows real counts.
sns.histplot(data = UserCookiesData, x = "OperatingSystems", hue = "Revenue", palette=["Black", "Yellow"])

In [None]:
test4 = exploredata.run_chi2(data=UserCookiesData, categorical_var = "OperatingSystems", target_variable = "Revenue")
test4

In [None]:
test4[0]['p-value'] < alpha

* ##### *TakeAways from test 4*
    - Since the p-value is less than the significance value of 0.05, we reject the null hypothesis and infer that the Operating Systems type is not independent of the customers intention to make a purchase

 #### **Hypothesis 5**

In [None]:
test5 = exploredata.run_chi2(data = UserCookiesData, categorical_var = "Weekend", target_variable = "Revenue")
test5

In [None]:
test5[0]['p-value'] < alpha

* ##### *TakeAways from test 5*
    - Since the p-value is greater than alpha (a significance value of 0.05), we failed to reject the null hypothesis that Weekend is independent of a users decision to make purchase.
    - We conclude that, a user decision to either make purchase from an online shop is not affected by the whether the user visits the website on weekends or not
    - This pre-informs us that Weekend as a feature will not be used in our feature engineering and feature selection to improve the effectives of some models like the Random forest classifier
    - We will not include Weekend in the features.

#### **Hypothesis test 6**

  * Ho: The Region location of a customer is independent of the purchasing intent of the customer
* Ha: The Region location of a customer is not independent of the purchasing intent of the customer

In [None]:

# No `weights`: a scalar weight is applied to every observation, which
# multiplies all bar heights so the axis no longer shows real counts.
sns.histplot(data=UserCookiesData, x = "Region", hue="Revenue", bins=10, palette="coolwarm" )

In [None]:
test6 = exploredata.run_chi2(data=UserCookiesData, categorical_var="Region", target_variable = "Revenue")
print(test6)

In [None]:
test6[0]['p-value'] < alpha

* ##### *TakeAways from test 6*
   - Since the p-value is greater than the significance value of 0.05, we fail to reject the null hypothesis and conclude that the purchasing intent of a customer is independent of the Region location of the customer.

##### **Hypothesis Test Analysis for Quantitative variables with label variable (Revenue)**

In [None]:
quant_vars = ['BounceRates', 'ExitRates', 'PageValues', 'Administrative_Duration','Informational_Duration','ProductRelated_Duration']

In [None]:
exploredata.two_t_test(data = UserCookiesData, quantitative_vars=quant_vars, target_variable = 'Revenue')

* ##### ***TakeAways from the Levene, Mann-Whitney U and Shapiro-Wilk tests for the numerical variables***

## DATA PREPROCESSING

##### *Encoding both weekend and the label variable (Revenue) into numeric for modelling*

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
UserCookiesEncoded = pd.get_dummies(UserCookiesData)

In [None]:
UserCookiesEncoded.head()

In [None]:
UserCookiesEncoded.info()

In [None]:
encoder = LabelEncoder()

In [None]:
UserCookiesEncoded["Weekend"] = encoder.fit_transform(UserCookiesEncoded["Weekend"])

In [None]:
UserCookiesEncoded["Revenue"] = encoder.fit_transform(UserCookiesEncoded["Revenue"])

In [None]:
UserCookiesEncoded["Revenue"].value_counts()

##### *Data segmentation into training and testing data sets*

In [None]:
y_label = UserCookiesEncoded["Revenue"]
x_label = UserCookiesEncoded.drop(["Revenue"],axis=1)

In [None]:
x_label

In [None]:
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
# fit_transform returns a bare array; rebuild the frame with the original
# column names and index so features stay identifiable downstream.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics leak into the features. Consider
# fitting on the training rows only.
x_label=pd.DataFrame(ss.fit_transform(x_label), columns=x_label.columns, index=x_label.index)
x_label

In [None]:

x_train, x_test, y_train, y_test = train_test_split(x_label, y_label, test_size =  0.2, random_state = 0)

In [None]:
x_train.info()

## MODEL TRAINING AND TESTING

#### ***Random Forest Modelling***

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:


def compare_random_classifier_models(x_train, y_train, x_test, y_test):
    """
    It trains one random forest (50 trees, random_state 126) for every combination of
    max_depth in 2..19 and min_samples_leaf in 1..22 and collects evaluation metrics for each.
    :param x_train: training features
    :param y_train: training labels
    :param x_test: features used to evaluate every model
    :param y_test: labels used to evaluate every model
    :return: a dataframe with one row per model holding its hyperparameters, confusion-matrix
    counts, precision, recall, specificity and train/test accuracy
    """
    models_cont = []
    for num in range(2, 20):
        for val in range(1, 23):
            classifier = RandomForestClassifier(n_estimators=50, random_state = 126, max_depth = num, min_samples_leaf = val)
            classifier.fit(x_train, y_train)
            train_score = classifier.score(x_train, y_train)
            test_score = classifier.score(x_test, y_test)
            predictions = classifier.predict(x_test)

            # Build the confusion matrix once per model and read all four
            # cells from it (rows = actual class, columns = predicted class).
            matrix = confusion_matrix(y_test, predictions)
            tp = matrix[1][1]
            fp = matrix[0][1]
            tn = matrix[0][0]
            fn = matrix[1][0]

            # The dictionary keys become the column names of the result and
            # are kept exactly as callers already know them.
            eval_params = {
                'max_depth':num,
                'min_samples_leaf': val,
                'True Positves': tp,
                'False Positives': fp,
                'True Negatives': tn,
                'False Negatvies': fn,
                'Precision': tp / (tp + fp),
                'Recall': tp / (tp + fn),
                'Specificity': round(tn / (tn + fp),2),
                'Training Accuracy': round(train_score, 2),
                'Test Accuracy': round(test_score,2)
            }
            models_cont.append(eval_params)
    return pd.DataFrame(models_cont)


In [None]:
compare_random_classifier_models(x_train, y_train, x_test, y_test)

* #### Feature Engineering for random forest model

In [None]:
UserCookiesData_ = UserCookiesData.copy()

In [None]:
UserCookiesData_.drop(['Browser', 'Weekend'], axis=1,inplace=True)

In [None]:

UserCookiesData_["VisitorType"].value_counts()


In [None]:
UserCookiesData["Month"]

In [None]:
# Assign the result back instead of calling replace(inplace=True) on the
# selected column: under pandas Copy-on-Write the chained in-place call
# modifies a temporary and leaves the dataframe unchanged.
UserCookiesData_["VisitorType"] = UserCookiesData_["VisitorType"].replace(["Returning_Visitor","New_Visitor", "Other" ], [0,1,2])

In [None]:
# Assign the result back instead of calling replace(inplace=True) on the
# selected column: under pandas Copy-on-Write the chained in-place call
# modifies a temporary and leaves the dataframe unchanged.
UserCookiesData_["Month"] = UserCookiesData_["Month"].replace(["Jan","Feb", "Mar", "Apr", "May","June", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" ], [0,1,2, 3,4,5,6,7,8,9,10,11])

In [None]:
y_label_ = UserCookiesData_["Revenue"]
x_label_ = UserCookiesData_.drop(["Revenue"],axis=1)

In [None]:
x_train_, x_test_, y_train_, y_test_ = train_test_split(x_label_, y_label_, test_size =  0.2, random_state = 0)

In [None]:
x_train_.head()

In [None]:
def test_best_random_forest_classifier(x_train, y_train, x_test, y_test, max_depth = 394, min_samples_leaf = 19):
    """
    It trains a single random forest (random_state 123) with the chosen hyperparameters and
    evaluates it on the given test data.
    :param x_train: training features
    :param y_train: training labels
    :param x_test: evaluation features
    :param y_test: evaluation labels
    :param max_depth: max_depth given to the classifier and reported in the result
    :param min_samples_leaf: min_samples_leaf given to the classifier and reported in the result
    :return: the fitted classifier and a one-row dataframe with its hyperparameters,
    confusion-matrix counts, precision, recall, specificity and train/test accuracy
    """
    classifier = RandomForestClassifier(random_state = 123, max_depth = max_depth, min_samples_leaf = min_samples_leaf)
    classifier.fit(x_train, y_train)
    train_score = classifier.score(x_train, y_train)
    test_score = classifier.score(x_test, y_test)
    predictions = classifier.predict(x_test)
    print(predictions)

    # One confusion matrix (rows = actual class, columns = predicted class).
    matrix = confusion_matrix(y_test, predictions)
    tp = matrix[1][1]
    fp = matrix[0][1]
    tn = matrix[0][0]
    fn = matrix[1][0]

    # Report the hyperparameters actually used by the model; the report
    # previously hard-coded a max_depth that differed from the fitted one.
    eval_params = {
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
        'True Positves': tp,
        'False Positives': fp,
        'True Negatives': tn,
        'False Negatvies': fn,
        'Precision': tp / (tp + fp),
        'Recall': tp / (tp + fn),
        'Specificity': round(tn / (tn + fp),3),
        'Training Accuracy': round(train_score, 3),
        'Test Accuracy': round(test_score,3)
    }
    test_df = pd.DataFrame([eval_params])
    return classifier, test_df

In [None]:
rf_model = test_best_random_forest_classifier(x_train_, y_train_, x_test_, y_test_)

In [None]:
rf_model[1]

In [None]:
from sklearn.metrics import RocCurveDisplay
ax = plt.gca()
# from_estimator already draws the curve on `ax`; calling .plot() again on
# the same axes would draw it a second time and duplicate the legend entry.
rfc_disp = RocCurveDisplay.from_estimator(rf_model[0], x_test_, y_test_, ax=ax, alpha=0.8)
plt.show()

In [None]:
predictions = rf_model[0].predict(x_test_)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
cm_1 = confusion_matrix(y_test_, predictions, labels=rf_model[0].classes_)
# Named cm_display so the notebook's built-in `display` helper is not shadowed.
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm_1,display_labels=rf_model[0].classes_)
# Draw on an explicit axes: without ax=, plot() creates its own figure and a
# separately created plt.figure() is left behind empty.
_, ax = plt.subplots(figsize=(5, 4))
cm_display.plot(ax=ax)
plt.show()

In [None]:
with open("model.pkl", "wb") as f:
  pc.dump(rf_model[0], f)  

* #### Hyperparameter Tuning for random forest model

In [None]:
from pprint import pprint
tuned_rf = RandomForestClassifier(random_state = 123)
print('Parameters currently in use:\n')
pprint(tuned_rf.get_params())

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# 'auto' is no longer accepted by recent scikit-learn releases (for
# classifiers it meant the same as 'sqrt'), so 'log2' is offered as the
# second option instead.
max_features = ['sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)  # None lets the trees grow without a depth limit
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
rf = RandomForestClassifier()
tuned_rnf= RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
tuned_rnf.fit(x_train, y_train)

In [None]:
def evaluate(model, x_test, y_test):
    """
    It prints the average absolute prediction error and the accuracy of a fitted classifier.
    :param model: fitted estimator exposing predict(x) and score(x, y)
    :param x_test: evaluation features
    :param y_test: evaluation labels (numeric, e.g. 0/1)
    :return: the accuracy reported by model.score, as a fraction between 0 and 1
    """
    predictions = np.asarray(model.predict(x_test))
    # Flatten so a single-column dataframe of labels compares element-wise.
    actual = np.asarray(y_test).ravel()
    errors = np.abs(predictions - actual)
    accuracy = model.score(x_test, y_test)
    # No percentage-error term: dividing by the labels is undefined for class 0.
    print('Model Performance')
    print()
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    # model.score returns a fraction; scale it to match the '%' in the message.
    print('Accuracy = {:0.2f}%.'.format(100 * accuracy))
    return accuracy

In [None]:
best_random = tuned_rnf.best_estimator_
random_accuracy = evaluate(best_random, x_test, y_test)

In [None]:
# Open the file in a context manager so the handle is closed after loading.
# NOTE: only unpickle files you created yourself; pickle can execute
# arbitrary code when loading untrusted data.
with open("./model.pkl", "rb") as model_file:
    intent_rf_pipeline = pc.load(model_file)

intent_rf_pipeline.predict(x_test_)[1]

#### ***KNN Modelling***

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(x_train, y_train)

In [None]:
knn_classifier.predict(x_test)

In [None]:
knn_classifier.score(x_test, y_test)

- * With a random choice for k=3, we had an accuracy of 85%, which is a good start for this model

* ##### K-Fold cross validation for KNN model

In [None]:
knn_cross_valid = KNeighborsClassifier(n_neighbors=3)
cv_scores = cross_val_score(knn_cross_valid, x_train, y_train, cv=5)
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores, keepdims = True)))

- * With 5-fold cross validation, the KNN model (k=3) reached a mean accuracy of 86%

* ##### GridSearchCv hypertuning for KNN model

In [None]:

knn2 = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1, 25)}
knn_gscv = GridSearchCV(knn2, param_grid, cv=5)
knn_gscv.fit(x_train, y_train)

In [None]:
knn_gscv.best_params_

In [None]:
knn_gscv.best_score_

- * With GridSearchCv hypertuning for the KNN model there was a slight improvement in the models accuracy to 87%

### TRAINING WITH RANDOM OVERSAMPLING FOR BALANCING

In [None]:
# Balance the classes by resampling the scaled features and labels.
# NOTE(review): despite the variable name, this is RandomOverSampler, not
# SMOTE.
from imblearn.over_sampling import RandomOverSampler
smote=RandomOverSampler(random_state=42)
# NOTE(review): resampling is applied to the whole dataset, before the
# train/test split done later; rows repeated by the resampler could then land
# in both the training and the test set and inflate the test score - confirm
# and consider resampling the training split only.
X_new,Y_new=smote.fit_resample(x_label,y_label)
X_new=pd.DataFrame(X_new,columns=x_train.columns)
Y_new=pd.DataFrame(Y_new,columns=['Revenue'])
X_new.head()

In [None]:
x_train,x_test,y_train,y_test=train_test_split(X_new,Y_new,test_size=0.2,random_state=1)

In [None]:
# Fixed random_state so the reported scores are reproducible between runs.
rf_with_smote=RandomForestClassifier(n_estimators=50,max_depth=16,random_state=42)
# np.ravel hands the estimator a 1-D target whether y_train is a Series or a
# single-column dataframe.
rf_with_smote.fit(x_train,np.ravel(y_train))
print('Train score:',rf_with_smote.score(x_train,y_train))
print('Test score:',rf_with_smote.score(x_test,y_test))