1. Setting up the Python environment

In [25]:
  # Create a virtual environment named "env" by running this in a terminal
  # (it is a shell command, not Python code):
  #  python -m venv env

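2. Installing necessary libraries
    - numpy
    - seaborn
    - matplotlib
    - scikit-learn
    - pandas
    - pillow, opencv-python, torch and torchvision (needed only for the image-loading examples in section 5)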

3. Learn how to use Jupyter in VS Code

4. Finding Dataset
    - Kaggle: https://www.kaggle.com/datasets
    - UCI Machine Learning Repository: https://archive.ics.uci.edu/datasets
    - Google Dataset Search: https://datasetsearch.research.google.com/
    - US government open data: [data.gov](https://data.gov/)
    - Papers with Code: https://paperswithcode.com/datasets
    - Harvard Dataverse: https://dataverse.harvard.edu/dataverse/harvard/

5. Loading Dataset

In [None]:
# Read the e-commerce customers CSV into a DataFrame with pandas.
import pandas as pd

# The path is relative to the directory the notebook kernel runs in.
dataset_dir = "../dataset/Ecommerce Customers.csv"
df = pd.read_csv(filepath_or_buffer=dataset_dir)

# Preview the first five rows.
df.head(n=5)

In [None]:
# loading image with pillow
# Walks folder_path, loads every image found in its sub-folders and labels
# each image with the name of the sub-folder it came from.
import os
from PIL import Image

folder_path = '../dataset/Chess Dataset'
image_extensions = ('.jpg', '.jpeg', '.png')
images = []
labels = []

# os.listdir() returns entries in arbitrary order; sorting makes the
# image/label order identical on every run.
for subfolder_name in sorted(os.listdir(folder_path)):
    subfolder_path = os.path.join(folder_path, subfolder_name)

    # Only directories are treated as label folders; skip stray files.
    if not os.path.isdir(subfolder_path):
        continue

    for filename in sorted(os.listdir(subfolder_path)):
        # Case-insensitive match so files such as "pawn.JPG" are not skipped.
        if not filename.lower().endswith(image_extensions):
            continue

        img_path = os.path.join(subfolder_path, filename)
        # Image.open() is lazy: it keeps the file handle open until the pixel
        # data is read. Load the data and release the handle immediately,
        # otherwise a large dataset can exhaust the open-file limit.
        with Image.open(img_path) as img:
            img.load()
        images.append(img)
        labels.append(subfolder_name)

print("Loaded Images: ", len(images))
print("Labels: ", labels[:10])


In [None]:
# loading image using OpenCV
# Walks folder_path, reads every image found in its sub-folders with
# cv2.imread() and labels each image with the name of its sub-folder.
import cv2
import os

folder_path = '../dataset/Chess Dataset'
image_extensions = ('.jpg', '.jpeg', '.png')
images = []
labels = []
unreadable_paths = []  # files cv2 could not decode, kept for reporting

# os.listdir() returns entries in arbitrary order; sorting makes the
# image/label order identical on every run.
for subfolder_name in sorted(os.listdir(folder_path)):
    subfolder_path = os.path.join(folder_path, subfolder_name)

    # Only directories are treated as label folders; skip stray files.
    if not os.path.isdir(subfolder_path):
        continue

    for filename in sorted(os.listdir(subfolder_path)):
        # Case-insensitive match so files such as "pawn.JPG" are not skipped.
        if not filename.lower().endswith(image_extensions):
            continue

        img_path = os.path.join(subfolder_path, filename)
        img = cv2.imread(img_path)
        # A None result means the file could not be read; record it instead
        # of dropping it silently so the count mismatch is explainable.
        if img is None:
            unreadable_paths.append(img_path)
            continue

        images.append(img)
        labels.append(subfolder_name)

print("Loaded Images: ", len(images))
print("Labels: ", labels[:10])
if unreadable_paths:
    print("Skipped unreadable files: ", len(unreadable_paths))

In [None]:
# loading image using pytorch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Resize every image to 128x128 and convert it to a tensor.
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor()
])

# ImageFolder takes the class names from the sub-folder names.
dataset = datasets.ImageFolder('../dataset/Chess Dataset', transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Index of the sample to inspect; raises IndexError if the dataset holds
# fewer than iter_image + 1 images.
iter_image = 100
# Indexing the dataset loads the image and applies the transform, so fetch
# the sample once rather than once per use inside the f-string.
_sample_image, sample_label = dataset[iter_image]
print(f"Image {iter_image+1}: Label {sample_label}, Class Name: {dataset.classes[sample_label]}")

6. Inspect dataset

In [None]:
# explore csv file
# Print the object type, the number of rows and the (rows, columns) shape of df.
print("type of df: ", type(df))
print("length df: ", len(df))
print("shape df: ", df.shape)

# df.info()  # info() prints its summary itself and returns None, so it needs no print()

In [None]:
# update cell in dataframe
# Overwrites the 'Time on App' value of the row labelled 0 with None.
# This mutates df in place, so every cell run afterwards sees the change.
# NOTE(review): presumably done to create a missing value for the
# "Handling missing data" section — confirm.
df.at[0, 'Time on App'] = None

In [None]:
# showing basic statistics
# describe() returns a summary table for df; as the last expression of the
# cell, that table is what the notebook displays.
df.describe()

In [None]:
# count missing values in the 'Address' column
# isna() returns a boolean Series that is True where the value is missing;
# sum() then counts those True entries.
df['Address'].isna().sum()

In [None]:
# Number of rows whose 'Avatar' value equals 'DarkGreen':
# filter df with a boolean mask, then count the rows that remain.
len(df[df['Avatar'].eq('DarkGreen')])

7. Handling missing data

In [None]:
# Removing records that contain missing values
# dropna() returns a new frame without any row that has a missing value.
# The result is only previewed here; df itself is left unchanged.
df.dropna(axis=0, how='any').head(5)

In [None]:
# Filling Missing Data (Imputation)
# Replace missing values with the mean of their column. The means are computed
# for numeric columns only, so missing values in other columns are not filled.
# The result is only previewed here; df itself is left unchanged.
df.fillna(value=df.mean(numeric_only=True)).head(5)

8. Normalization

In [None]:
# Apply Min-Max Scaling
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale. Listed once so the selection passed to the scaler and
# the labels of the resulting frame cannot drift apart.
scaled_columns = [
    'Avg. Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
    'Yearly Amount Spent',
]

scaler = MinMaxScaler()
# fit_transform() returns a plain array, so rebuild a DataFrame around it,
# restoring the column names and df's row index (otherwise the rows would be
# renumbered from 0 and could no longer be aligned with df).
pd.DataFrame(
    scaler.fit_transform(df[scaled_columns]),
    columns=scaled_columns,
    index=df.index,
)

9. Splitting dataset

In [90]:
# Split the data into training and test sets with scikit-learn.

from sklearn.model_selection import train_test_split

# Feature matrix: the four numeric predictor columns.
X = df.loc[:, ['Avg. Session Length', 'Time on App', 'Time on Website', 'Length of Membership']]
# Target, kept as a single-column DataFrame.
y = df.loc[:, ['Yearly Amount Spent']]

# NOTE(review): df still holds the None written to row 0 of 'Time on App'
# earlier in the notebook, because the dropna()/fillna() results above are
# never assigned — confirm whether the split should use cleaned data.

# 30% of the rows go to the test set; the fixed random_state makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)