# Importing the relevant libraries

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

from statsmodels.stats.outliers_influence import variance_inflation_factor

import matplotlib.pyplot as plt

import seaborn as sns

import numpy as np

# Data Preprocessing

### Importing the Dataset

In [None]:
# Load the contents of the CSV file into a DataFrame
raw_data = pd.read_csv(filepath_or_buffer='ml_datasource.csv')

# Preview the first five rows of the loaded data
raw_data.head(n=5)

In [None]:
# Work on a deep copy so that later steps never modify raw_data itself
data = raw_data.copy(deep=True)

### Removing Outliers

In [None]:
# Undo any earlier seaborn styling so the figure starts from the original settings
sns.reset_orig()

# Enlarge the fonts used in seaborn plots
sns.set(font_scale=1.5)

# Columns to plot, listed in the order they fill the grid (left to right, top to bottom)
kde_columns = (
    'days_on_platform',
    'minutes_watched',
    'courses_started',
    'practice_exams_started',
    'practice_exams_passed',
    'minutes_spent_on_exams',
)

# Create a 3x2 grid of axes on a large figure
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(20, 20))

# Draw one kernel density plot per column; axes.flat walks the grid row by row
for kde_column, kde_axis in zip(kde_columns, axes.flat):
    sns.kdeplot(data=data[kde_column], ax=kde_axis)

plt.show()

In [None]:
# Remove outliers: keep only the rows that stay within the chosen upper bounds
# for 'minutes_watched', 'courses_started', 'practice_exams_started',
# and 'minutes_spent_on_exams'
within_bounds = (
    data['minutes_watched'].le(1000)
    & data['courses_started'].le(10)
    & data['practice_exams_started'].le(10)
    & data['minutes_spent_on_exams'].le(40)
)
data_no_outliers = data.loc[within_bounds]

In [None]:
# Undo any earlier seaborn styling so the figure starts from the original settings
sns.reset_orig()

# Enlarge the fonts used in seaborn plots
sns.set(font_scale=1.5)

# Columns to plot, listed in the order they fill the grid (left to right, top to bottom)
kde_columns = (
    'days_on_platform',
    'minutes_watched',
    'courses_started',
    'practice_exams_started',
    'practice_exams_passed',
    'minutes_spent_on_exams',
)

# Create a 3x2 grid of axes on a large figure
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(20, 20))

# Draw one kernel density plot per column of the outlier-free data;
# axes.flat walks the grid row by row
for kde_column, kde_axis in zip(kde_columns, axes.flat):
    sns.kdeplot(data=data_no_outliers[kde_column], ax=kde_axis)

plt.show()

### Checking for Multicollinearity

In [None]:
# Show the column names of the outlier-free dataset as an array (for reference)
np.asarray(data_no_outliers.columns)

In [None]:
# Numerical features included in the Variance Inflation Factor (VIF) check
vif_columns = [
    'days_on_platform',
    'minutes_watched',
    'courses_started',
    'practice_exams_started',
    'practice_exams_passed',
    'minutes_spent_on_exams',
]
variables = data_no_outliers[vif_columns]

# Convert to a NumPy matrix once; the VIF routine receives the matrix and a column position
# NOTE(review): no intercept column is added to this matrix - confirm that is intended
design_matrix = variables.to_numpy()

# Build a table holding the VIF of every feature in a column called 'VIF'
# and the matching feature names in a column called 'features'
vif = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(design_matrix, position)
        for position in range(design_matrix.shape[1])
    ],
    'features': variables.columns,
})

# Display the table
vif

In [None]:
# Remove the 'practice_exams_started' column to prevent multicollinearity
data_no_mult = data_no_outliers.drop(columns=['practice_exams_started'])

# Preview the first five rows of the reduced dataset
data_no_mult.head(n=5)

In [None]:
# Re-run the Variance Inflation Factor (VIF) check on the remaining features.
# The columns are read from data_no_mult - the frame from which
# 'practice_exams_started' was dropped - so the check covers the data that
# the following steps actually use.
variables = data_no_mult[['days_on_platform',
                          'minutes_watched',
                          'courses_started',
                          'practice_exams_passed',
                          'minutes_spent_on_exams']]

# Convert to a NumPy matrix once instead of once per feature
feature_matrix = variables.to_numpy()

# Compute the new VIF value for each selected feature and pair it with its name
vif = pd.DataFrame()
vif["VIF"] = [variance_inflation_factor(feature_matrix, i) for i in range(feature_matrix.shape[1])]
vif["features"] = variables.columns
vif

### Dealing with NaN Values

In [None]:
# Count the missing values in every column
data_no_mult.isna().sum()

In [None]:
# Show the rows of 'data_no_mult' whose 'student_country' value is missing (NaN)
country_missing = data_no_mult['student_country'].isna()
data_no_mult[country_missing]

In [None]:
# Replace missing 'student_country' values with the string 'NAM'.
# The fill is limited to that column through a dict, so a numeric column can
# never receive a string. No `axis=1` is passed: with a scalar fill value it
# does not change which cells are filled, but pandas handles it by transposing
# the frame, which leaves every column of a mixed-type frame with object dtype.
data_no_nulls = data_no_mult.fillna({'student_country': 'NAM'})

In [None]:
# Show the 'student_country' entries that are equal to 'NAM'
is_placeholder = data_no_nulls['student_country'].eq('NAM')
data_no_nulls.loc[is_placeholder, 'student_country']

In [None]:
# Count the missing values per column again, now that the replacement is done
data_no_nulls.isna().sum()

### Splitting the Data

In [None]:
# The outcome variable is the 'purchased' column
target = data_no_nulls['purchased']

# Every other column is used as an input (feature variable)
inputs = data_no_nulls.drop(columns='purchased')

In [None]:
# Hold out 20% of the rows as a test set. Stratifying on the target keeps the
# class balance in both subsets, and the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    inputs,
    target,
    stratify=target,
    test_size=0.2,
    random_state=365,
)

In [None]:
# Preview the first five rows of the training inputs
x_train.head(n=5)

### Encoding the Data

In [None]:
# Ordinal encoder for the categorical variable; categories that were not seen
# while fitting are encoded with the fixed value 170
# NOTE(review): 170 is presumably larger than the number of categories seen in training - confirm
enc = OrdinalEncoder(unknown_value=170, handle_unknown='use_encoded_value')

In [None]:
# Fit the encoder on the training countries, then apply the fitted mapping
# to the test countries (the encoder expects a single-column 2-D array)
train_country_codes = enc.fit_transform(x_train['student_country'].to_numpy().reshape(-1, 1))
test_country_codes = enc.transform(x_test['student_country'].to_numpy().reshape(-1, 1))

# Replace the original 'student_country' column with the encoded values,
# stored in a new column called 'student_country_enc'
x_train = x_train.drop(columns='student_country').assign(
    student_country_enc=train_country_codes.ravel()
)
x_test = x_test.drop(columns='student_country').assign(
    student_country_enc=test_country_codes.ravel()
)

# Preview the first five rows of the encoded training dataset
x_train.head(n=5)

In [None]:
# Convert the splits to NumPy arrays with explicit data types:
# the inputs become floating-point arrays and the targets become integer arrays.
x_train_array, x_test_array = (
    np.asarray(features, dtype='float') for features in (x_train, x_test)
)
y_train_array, y_test_array = (
    np.asarray(labels, dtype='int') for labels in (y_train, y_test)
)