In [1]:
import pandas as pd
import numpy as np
import logging

# Notebook-only logging setup: lower the root logger's threshold to DEBUG so
# the INFO records emitted by later cells are not filtered out.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Read the raw dataset (path is relative to the notebook's working directory)
# and preview its first rows.
filename = "../data/data.csv"
df = pd.read_csv(filename)
df.head()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


In [2]:
# Missing-value count per column first, then the grand total over all columns.
missing_per_column = df.isna().sum()
missing = sum(missing_per_column)
logging.info(f"Missing elements: {missing}")

INFO:root:Missing elements: 0


In [3]:
# Print the frame summary: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             3900 non-null   int64  
 1   Age                     3900 non-null   int64  
 2   Gender                  3900 non-null   object 
 3   Item Purchased          3900 non-null   object 
 4   Category                3900 non-null   object 
 5   Purchase Amount (USD)   3900 non-null   int64  
 6   Location                3900 non-null   object 
 7   Size                    3900 non-null   object 
 8   Color                   3900 non-null   object 
 9   Season                  3900 non-null   object 
 10  Review Rating           3900 non-null   float64
 11  Subscription Status     3900 non-null   object 
 12  Shipping Type           3900 non-null   object 
 13  Discount Applied        3900 non-null   object 
 14  Promo Code Used         3900 non-null   

In [4]:
# Gather the names of all columns whose dtype is the generic "object" dtype;
# the next cell encodes exactly these columns.
cols = df.columns
object_cols = [name for name in cols if df[name].dtype == "object"]

logging.info(f"The columns which contain objects are: {', '.join(object_cols)}")
logging.info("These columns will be encoded")

INFO:root:The columns which contain objects are: Gender, Item Purchased, Category, Location, Size, Color, Season, Subscription Status, Shipping Type, Discount Applied, Promo Code Used, Payment Method, Frequency of Purchases
INFO:root:These columns will be encoded


In [5]:
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per column, keyed by column name, so every mapping stays
# available afterwards (encoders[col].classes_, encoders[col].inverse_transform).
# A single shared encoder would be re-fitted on each iteration and only
# remember the labels of the last column it saw.
encoders = {}
logging.info("Encoding.......")
for col in object_cols:
    # `le` is rebound on every pass, so after the loop it still refers to the
    # encoder fitted on the last column of object_cols.
    le = LabelEncoder()
    # NOTE: this overwrites df in place. Re-running the cell on the already
    # encoded frame would fit the encoders on integers and lose the original
    # string labels — reload the data first if the cell must be re-run.
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
logging.info("Encoding complete")

INFO:root:Encoding.......
INFO:root:Encoding complete


In [7]:
from sklearn.model_selection import train_test_split
import os

logging.info("Splitting the processed data into train and test sets")

# NOTE(review): no visible cell writes this file (the execution count jumps
# from In [5] to In [7]) — confirm it exists before a Restart & Run All.
processed_data = pd.read_csv("../data/processed_data.csv")
target_column = 'Frequency of Purchases'

# Features are every column except the target. The target is selected with a
# list so that y stays a one-column DataFrame, as before. Both selections raise
# KeyError if the target column is absent instead of silently yielding an
# empty y.
X = processed_data.drop(columns=[target_column])
y = processed_data[[target_column]]
directory = '../data/splits'

# Record whether the directory was there only to choose the message;
# exist_ok=True makes the creation itself safe if it appears in between.
directory_existed = os.path.exists(directory)
os.makedirs(directory, exist_ok=True)
if directory_existed:
    print(f"Directory '{directory}' already exists.")
else:
    print(f"Directory '{directory}' created successfully!")

# Stratified 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=47
)

# Write every split under `directory` so the output location is defined in
# exactly one place. np.save stores only the values; column names are dropped.
splits = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
}
for split_name, split_data in splits.items():
    np.save(os.path.join(directory, f"{split_name}.npy"), split_data)

Directory '../data/splits' created successfully!
