1. Setting up the Python environment

In [2]:
  # Run this in a terminal (not in a notebook cell) to create a virtual environment named "env":
  #  python -m venv env

2. Installing necessary libraries
    - numpy
    - seaborn
    - matplotlib
    - scikit-learn
    - pandas

3. Learn how to use Jupyter in VS Code

4. Finding Dataset
    - Kaggle: https://www.kaggle.com/datasets
    - UCI Machine Learning Repository: https://archive.ics.uci.edu/datasets
    - Google Dataset Search: https://datasetsearch.research.google.com/
    - US government open data: [data.gov](https://data.gov/)
    - Papers with Code: https://paperswithcode.com/datasets
    - Harvard Dataverse: https://dataverse.harvard.edu/dataverse/harvard/

5. Loading Dataset

In [3]:
# Load the e-commerce customers CSV into a DataFrame with pandas.
import pandas as pd

# Raw string: in a normal string literal "\G", "\d" and "\E" are invalid escape
# sequences, which Python 3.12+ reports as a SyntaxWarning. The path value itself
# is unchanged.
# NOTE(review): this is an absolute, machine-specific Windows path; a path relative
# to the notebook would make the cell runnable on other machines.
dataset_dir = r"D:\Github\dataset\Ecommerce Customers.csv"
df = pd.read_csv(dataset_dir)
df.head()  # preview the first rows

Unnamed: 0,Email,Address,Avatar,Avg. Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
0,mstephenson@fernandez.com,"835 Frank Tunnel\nWrightmouth, MI 82180-9605",Violet,34.497268,12.655651,39.577668,4.082621,587.951054
1,hduke@hotmail.com,"4547 Archer Common\nDiazchester, CA 06566-8576",DarkGreen,31.926272,11.109461,37.268959,2.664034,392.204933
2,pallen@yahoo.com,"24645 Valerie Unions Suite 582\nCobbborough, D...",Bisque,33.000915,11.330278,37.110597,4.104543,487.547505
3,riverarebecca@gmail.com,"1414 David Throughway\nPort Jason, OH 22070-1220",SaddleBrown,34.305557,13.717514,36.721283,3.120179,581.852344
4,mstephens@davidson-herman.com,"14023 Rodriguez Passage\nPort Jacobville, PR 3...",MediumAquaMarine,33.330673,12.795189,37.536653,4.446308,599.406092


In [6]:
# Load every .jpg/.png image below folder_path with Pillow.
# Each sub-folder is treated as one class: its name is stored as the label of
# every image found inside it.
import os
from PIL import Image

folder_path = '../../dataset/Chess Dataset'
images = []
labels = []

# sorted() makes the load order reproducible; os.listdir() returns entries in
# arbitrary order.
for subfolder_name in sorted(os.listdir(folder_path)):
    subfolder_path = os.path.join(folder_path, subfolder_name)

    # Ignore plain files that sit next to the class folders.
    if not os.path.isdir(subfolder_path):
        continue

    for filename in sorted(os.listdir(subfolder_path)):
        if not filename.endswith(('.jpg', '.png')):
            continue

        img_path = os.path.join(subfolder_path, filename)
        # Image.open() is lazy and keeps the underlying file open. Copying the
        # pixel data inside a context manager closes each file immediately, so
        # hundreds of file handles do not stay open at once.
        with Image.open(img_path) as img:
            images.append(img.copy())
        labels.append(subfolder_name)

print("Loaded Images: ", len(images))
print("Labels: ", labels[:10])


Loaded Images:  534
Labels:  ['Bishop', 'Bishop', 'Bishop', 'Bishop', 'Bishop', 'Bishop', 'Bishop', 'Bishop', 'Bishop', 'Bishop']


In [None]:
# Load the chess images with OpenCV; each sub-folder name is used as the label.
import cv2
import os

folder_path = '../dataset/Chess Dataset'
images, labels = [], []

for class_name in os.listdir(folder_path):
    class_dir = os.path.join(folder_path, class_name)
    if not os.path.isdir(class_dir):
        # Only directories are treated as classes.
        continue

    for entry in os.listdir(class_dir):
        if not entry.endswith(('.jpg', '.png')):
            continue

        pixels = cv2.imread(os.path.join(class_dir, entry))
        if pixels is None:
            # Nothing was read for this file, so it is skipped together with its label.
            continue

        images.append(pixels)
        labels.append(class_name)

print("Loaded Images: ", len(images))
print("Labels: ", labels[:10])

In [None]:
# Load the chess images as a PyTorch dataset using torchvision.
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Resize every image to 128x128 and convert it to a tensor.
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor()
])

# ImageFolder takes the class of each image from the sub-folder it lives in.
dataset = datasets.ImageFolder('../dataset/Chess Dataset', transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

iter_image = 100
# Index the dataset once: every dataset[i] access loads and transforms the image
# again, so reuse the returned label instead of indexing twice.
_, label_idx = dataset[iter_image]
print(f"Image {iter_image+1}: Label {label_idx}, Class Name: {dataset.classes[label_idx]}")

6. Inspect dataset

In [None]:
# Explore the loaded CSV: container type, number of rows and (rows, columns) shape.
print("type of df: ", type(df))
print("length df: ", len(df))
print("shape df: ", df.shape)

# df.info()  # info() prints its report itself; wrapping it in print() would also print "None"

In [None]:
# Update a single cell: set 'Time on App' of the row labelled 0 to None.
# This modifies df in place, so every cell executed afterwards sees the change,
# and a later cell's result depends on whether this cell has been run.
# NOTE(review): presumably done to create a missing value for the
# "Handling missing data" section; confirm.
df.at[0, 'Time on App'] = None

In [7]:
# Show basic summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Avg. Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
count,500.0,500.0,500.0,500.0,500.0
mean,33.053194,12.052488,37.060445,3.533462,499.314038
std,0.992563,0.994216,1.010489,0.999278,79.314782
min,29.532429,8.508152,33.913847,0.269901,256.670582
25%,32.341822,11.388153,36.349257,2.93045,445.038277
50%,33.082008,11.983231,37.069367,3.533975,498.887875
75%,33.711985,12.75385,37.716432,4.126502,549.313828
max,36.139662,15.126994,40.005182,6.922689,765.518462


In [None]:
# Count the missing values in the 'Address' column
df['Address'].isna().sum() # isna() returns a boolean mask that is True where a value is NA; sum() counts the True entries

In [None]:
# Number of rows whose 'Avatar' value equals 'DarkGreen'
len(df.loc[df['Avatar'] == 'DarkGreen'])

7. Handling missing data

In [None]:
# Remove every record (row) that contains a missing value and preview the result.
# The result is not assigned, so df itself is left unchanged by this cell.
df.dropna().head()

In [None]:
# Filling missing data (imputation): replace missing values with the mean of their column.
# The means are computed with numeric_only=True, so only numeric columns have a fill value.
# The result is only previewed, not assigned, so df itself is left unchanged.
df.fillna(df.mean(numeric_only=True)).head()

8. Normalization

In [None]:
# Apply Min-Max scaling to the numeric columns and show the result as a DataFrame.
from sklearn.preprocessing import MinMaxScaler

# Listed once so the selected input columns and the output header cannot drift apart.
scale_columns = [
    'Avg. Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
    'Yearly Amount Spent',
]

scaler = MinMaxScaler()
pd.DataFrame(
    scaler.fit_transform(df[scale_columns]),
    columns=scale_columns,
    index=df.index,  # keep the original row labels so the scaled rows still line up with df
)

9. Splitting dataset

In [90]:
# Split the data into training and test sets with scikit-learn.
from sklearn.model_selection import train_test_split

# X holds the four feature columns; y holds the target as a single-column DataFrame.
X = df.loc[:, ['Avg. Session Length', 'Time on App', 'Time on Website', 'Length of Membership']]
y = df.loc[:, ['Yearly Amount Spent']]

# test_size=0.3 holds out 30 % of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)