In [2]:
pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.1 MB)
     |████████████████████████████████| 11.1 MB 76 kB/s            
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Collecting scipy>=1.5.0
  Downloading scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
     |████████████████████████████████| 34.5 MB 19 kB/s             
Collecting joblib>=1.1.1
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
     |████████████████████████████████| 301 kB 100 kB/s            
[?25hInstalling collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.3.2 scipy-1.10.1 threadpoolctl-3.5.0
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the

In [3]:
#Generate a similar dataset with missing values
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Number of records to generate
num_records = 200

# Generate data for each column
products = np.random.choice(['Tablet', 'Smartphone', 'Laptop'], size=num_records, p=[0.3, 0.4, 0.3])
ages = np.random.randint(18, 65, size=num_records).astype(float)  # Convert to float to allow NaN
genders = np.random.choice(['Male', 'Female'], size=num_records, p=[0.5, 0.5])
education_years = np.random.randint(10, 21, size=num_records).astype(float)  # Convert to float
marital_statuses = np.random.choice(['Single', 'Partnered'], size=num_records, p=[0.6, 0.4])
usage_per_week = np.random.randint(1, 8, size=num_records).astype(float)  # Convert to float
tech_skills = np.random.randint(1, 6, size=num_records).astype(float)  # Convert to float
incomes = np.random.randint(20000, 120001, size=num_records).astype(float)  # Convert to float
hours_spent = np.random.randint(1, 21, size=num_records).astype(float)  # Convert to float

# Introduce missing values randomly
# Define the percentage of missing values
missing_percentage = 0.1  # 10% missing values

# Function to randomly assign NaN values
def introduce_missing_values(array):
    # Create a mask for missing values
    mask = np.random.rand(len(array)) < missing_percentage
    array[mask] = np.nan  # Set the selected indices to NaN
    return array

# Create a DataFrame
data = pd.DataFrame({
    'Product': introduce_missing_values(products.copy()),
    'Age': introduce_missing_values(ages.copy()),
    'Gender': introduce_missing_values(genders.copy()),
    'Education': introduce_missing_values(education_years.copy()),
    'MaritalStatus': introduce_missing_values(marital_statuses.copy()),
    'Usage': introduce_missing_values(usage_per_week.copy()),
    'TechSkill': introduce_missing_values(tech_skills.copy()),
    'Income': introduce_missing_values(incomes.copy()),
    'HoursSpent': introduce_missing_values(hours_spent.copy())
})

# Save the dataset to a CSV file
data.to_csv('TechGadgetSales_with_missing_values.csv', index=False)

# Display the first few rows of the dataset
print("Dataset After Imputation with Missing Values:")
print(data.head())

# Check how many missing values are present in each column
print("\nMissing values count in each column:")
print(data.isnull().sum())

Dataset After Imputation with Missing Values:
      Product   Age  Gender  Education MaritalStatus  Usage  TechSkill  \
0  Smartphone  41.0    Male       19.0        Single    6.0        4.0   
1      Laptop  28.0     nan        NaN        Single    6.0        3.0   
2      Laptop  25.0  Female       11.0        Single    3.0        5.0   
3  Smartphone  53.0  Female       15.0        Single    4.0        1.0   
4      Tablet   NaN  Female       15.0        Single    4.0        5.0   

    Income  HoursSpent  
0  68874.0         1.0  
1  69708.0         2.0  
2      NaN         1.0  
3      NaN         1.0  
4      NaN         9.0  

Missing values count in each column:
Product           0
Age              18
Gender            0
Education        23
MaritalStatus     0
Usage            19
TechSkill        16
Income           22
HoursSpent       18
dtype: int64


In [14]:
#Method 1: handle the missing values by using mean and the most frequent category
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
data = pd.read_csv("TechGadgetSales_with_missing_values.csv")
print("\nMissing values count in each column:")
print(data.isnull().sum())

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
data = pd.read_csv('TechGadgetSales_with_missing_values.csv')

# Display the original dataset with missing values
print("Original Dataset with Missing Values:")
print(data.head())
print("\nMissing values count in each column:")
print(data.isnull().sum())

# Handle missing values for numeric columns by using the mean imputation
numeric_cols = ['Age', 'Education', 'Usage', 'TechSkill', 'Income', 'HoursSpent']
numeric_imputer = SimpleImputer(strategy='mean')
data[numeric_cols] = numeric_imputer.fit_transform(data[numeric_cols])

#Handle missing values for categorical columns by using the mode imputation
categorical_cols = ['Product', 'Gender', 'MaritalStatus']
categorical_imputer = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = categorical_imputer.fit_transform(data[categorical_cols])

# Display the dataset after imputation
print("\nDataset After Imputation:")
print(data.head())

# Check whether there are missing values after imputation
print("\nMissing values count in each column after imputation:")
print(data.isnull().sum())

#Save the cleaned dataset to a new CSV file
data.to_csv('TechGadgetSales_cleaned.csv', index=False)
print("\nCleaned dataset saved as 'TechGadgetSales_cleaned.csv'.")



Missing values count in each column:
Product          14
Age              18
Gender           21
Education        23
MaritalStatus    24
Usage            19
TechSkill        16
Income           22
HoursSpent       18
dtype: int64
Original Dataset with Missing Values:
      Product   Age  Gender  Education MaritalStatus  Usage  TechSkill  \
0  Smartphone  41.0    Male       19.0        Single    6.0        4.0   
1      Laptop  28.0     NaN        NaN        Single    6.0        3.0   
2      Laptop  25.0  Female       11.0        Single    3.0        5.0   
3  Smartphone  53.0  Female       15.0        Single    4.0        1.0   
4      Tablet   NaN  Female       15.0        Single    4.0        5.0   

    Income  HoursSpent  
0  68874.0         1.0  
1  69708.0         2.0  
2      NaN         1.0  
3      NaN         1.0  
4      NaN         9.0  

Missing values count in each column:
Product          14
Age              18
Gender           21
Education        23
MaritalStatus    2

In [6]:
#Method 2: handle the missing values by using KNN model
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
data = pd.read_csv('TechGadgetSales_with_missing_values.csv')

# Display the original dataset with missing values
print("Original Dataset:")
print(data.head())
print("\nMissing values count in each column:")
print(data.isnull().sum())

#Separate numeric and categorical columns
numeric_cols = ['Age', 'Education', 'Usage', 'TechSkill', 'Income', 'HoursSpent']
categorical_cols = ['Product', 'Gender', 'MaritalStatus']

#Impute numeric columns by using KNN
knn_imputer = KNNImputer(n_neighbors=5)
data[numeric_cols] = knn_imputer.fit_transform(data[numeric_cols])

# Handle categorical columns
encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' avoids dummy variable trap
encoded_categorical = encoder.fit_transform(data[categorical_cols])
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out(categorical_cols))
data_cleaned = pd.concat([data[numeric_cols], encoded_categorical_df], axis=1)
knn_imputer_categorical = KNNImputer(n_neighbors=5)
data_cleaned = pd.DataFrame(knn_imputer_categorical.fit_transform(data_cleaned), columns=data_cleaned.columns)
for col in encoder.get_feature_names_out(categorical_cols):
    data_cleaned[col] = np.where(data_cleaned[col] > 0.5, 1, 0)  # Convert back to binary indicators

# Display the cleaned dataset after handling missing values
print("\nDataset After KNN Imputation:")
print(data_cleaned.head())

# Check whether there are missing values after imputation
print("\nMissing values count in each column after imputation:")
print(data_cleaned.isnull().sum())

#Save the cleaned dataset to a new CSV file
data_cleaned.to_csv('TechGadgetSales_cleaned_knn.csv', index=False)
print("\nCleaned dataset saved as 'TechGadgetSales_cleaned_knn.csv'.")


Original Dataset:
      Product   Age  Gender  Education MaritalStatus  Usage  TechSkill  \
0  Smartphone  41.0    Male       19.0        Single    6.0        4.0   
1      Laptop  28.0     NaN        NaN        Single    6.0        3.0   
2      Laptop  25.0  Female       11.0        Single    3.0        5.0   
3  Smartphone  53.0  Female       15.0        Single    4.0        1.0   
4      Tablet   NaN  Female       15.0        Single    4.0        5.0   

    Income  HoursSpent  
0  68874.0         1.0  
1  69708.0         2.0  
2      NaN         1.0  
3      NaN         1.0  
4      NaN         9.0  

Missing values count in each column:
Product          14
Age              18
Gender           21
Education        23
MaritalStatus    24
Usage            19
TechSkill        16
Income           22
HoursSpent       18
dtype: int64

Dataset After KNN Imputation:
    Age  Education  Usage  TechSkill   Income  HoursSpent  Product_Smartphone  \
0  41.0       19.0    6.0        4.0  68874.