### Importing Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

### Reading The Data

In [2]:
# Load the raw Zara sales CSV into a DataFrame.
raw_csv_path = '../data/raw/Zara_Sales_Data.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Summarize column names, non-null counts and dtypes to spot missing values
# and columns stored with an unexpected type.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Product_ID        252 non-null    int64 
 1   Product_Position  252 non-null    object
 2   Promotion         252 non-null    object
 3   Product_Category  252 non-null    object
 4   Seasonal          252 non-null    object
 5   Sales_Volume      252 non-null    int64 
 6   brand             252 non-null    object
 7   url               252 non-null    object
 8   sku               252 non-null    object
 9   name              251 non-null    object
 10  description       250 non-null    object
 11  price             252 non-null    object
 12  currency          252 non-null    object
 13  scraped_at        252 non-null    object
 14  terms             252 non-null    object
 15  section           252 non-null    object
dtypes: int64(2), object(14)
memory usage: 31.6+ KB


#### Filter rows with nulls

In [4]:
# Show the identifying columns of every row that has at least one missing value.
df.loc[df.isna().any(axis=1), ['Product_ID', 'name', 'description']]

Unnamed: 0,Product_ID,name,description
60,151925,VINTAGE EFFECT LEATHER BOMBER JACKET,
72,173576,,


In [5]:
# List the distinct descriptions of products whose `terms` value is 'jackets'
# (presumably as reference wording for filling in the missing description).
df.loc[df['terms'].eq('jackets'), 'description'].unique()

array(['Puffer jacket made of tear-resistant ripstop fabric. High collar and adjustable long sleeves with adhesive straps. Welt pockets at hip. Adjustable hem with side elastics. Front zip closure.',
       'Straight fit blazer. Pointed lapel collar and long sleeves with buttoned cuffs. Welt pockets at hip and interior pocket. Central back vent at hem. Front button closure.',
       'Slim fit jacket. Notched lapel collar. Long sleeves with buttoned cuffs. Welt pocket at chest and flap pockets at hip. Interior pocket. Back vents. Front button closure.',
       'Slim fit jacket made of viscose blend fabric. Notched lapel collar. Long sleeves with buttoned cuffs. Welt pocket at chest and flap pockets at hip. Interior pocket. Back vents. Front button closure.',
       'Jacket made of faux leather faux shearling with fleece interior. Tabbed lapel collar. Long sleeves. Zip pockets at hip. Front zip closure.',
       'Relaxed fit jacket. Contrasting lapel collar and long sleeves with buttoned

### Fill the description of index 60

In [6]:
# Fill the missing description of the "VINTAGE EFFECT LEATHER BOMBER JACKET"
# (Product_ID 151925, the row shown at index 60 in the null listing).
# The row is selected by Product_ID and restricted to null descriptions instead
# of using the hard-coded row label 60, so the fix still hits the right product
# if the row order changes and never overwrites an existing description.
# NOTE(review): the description text is hand-written; confirm its source.
bomber_missing_description = df['Product_ID'].eq(151925) & df['description'].isna()
df.loc[bomber_missing_description, 'description'] = 'Jacket made of faux leather fabric with washed effect. Rib elastic collar and long sleeves. Welt pockets at hip and interior pocket. Elastic hem. Front zip closure.'

In [7]:
# Count the remaining missing values per column after the manual fill.
df.isna().sum()

Product_ID          0
Product_Position    0
Promotion           0
Product_Category    0
Seasonal            0
Sales_Volume        0
brand               0
url                 0
sku                 0
name                1
description         1
price               0
currency            0
scraped_at          0
terms               0
section             0
dtype: int64

In [8]:
# Display the full rows that still lack a name or a description.
df[df[['name', 'description']].isna().any(axis=1)]

Unnamed: 0,Product_ID,Product_Position,Promotion,Product_Category,Seasonal,Sales_Volume,brand,url,sku,name,description,price,currency,scraped_at,terms,section
72,173576,End-cap,Yes,Clothing,No,1838,Zara,https://www.zara.com/us/en/-p04310461.html,336378923-700-2,,,£129.00,USD,2024-02-19 08:50:54,jackets,MAN


### Drop this row

In [9]:
# Drop every row that still contains a missing value and renumber the index
# from 0. Reassigning instead of using `inplace=True` follows pandas guidance,
# and `reset_index(drop=True)` replaces the `ignore_index` keyword, which
# `dropna` only accepts on pandas 2.0 and later.
df = df.dropna().reset_index(drop=True)

In [10]:
# Check the row and column counts after dropping rows with missing values.
df.shape

(251, 16)

### Generate Total Sales Column

In [11]:
# Peek at one randomly chosen row to see what the columns look like.
df.sample(n=1)

Unnamed: 0,Product_ID,Product_Position,Promotion,Product_Category,Seasonal,Sales_Volume,brand,url,sku,name,description,price,currency,scraped_at,terms,section
156,147449,End-cap,No,Clothing,No,1191,Zara,https://www.zara.com/us/en/suede-running-sneak...,316608794-802-39,SUEDE RUNNING SNEAKERS,Running shoes. Made of suede leather. Upper in...,£69.90,USD,2024-02-19 09:00:49,shoes,MAN


In [12]:
# List the distinct raw price values to see how they are formatted before conversion.
df['price'].unique()

array(['£19.99', '£169.00', '£129.00', '£139.00', '£79.90', '£69.99',
       '£159.00', '£439.00', '£99.90', '£69.90', '£109.00', '£89.90',
       '£299.00', '£49.90', '£59.90', '£349.00', '£29.99', '£189.00',
       '£12.99', '£45.90', '£9.99', '£39.90', '£7.99', '£35.90', '£47.90',
       '£27.90', '£19.90', '£29.90'], dtype=object)

In [13]:
# Convert the price strings (e.g. '£19.99') into floats.
# - astype(str) keeps the cell re-runnable: once the column is already float,
#   calling `.str` on it directly would raise an AttributeError.
# - regex=False treats '£' and ',' as literal text regardless of the pandas
#   version's default for `regex`.
# - removing ',' tolerates thousands separators such as '£1,299.00'.
# NOTE(review): the `currency` column reads 'USD' while the prices carry a '£'
# prefix — confirm which currency these amounts are actually in.
df['price'] = (
    df['price']
    .astype(str)
    .str.replace('£', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
    .astype(float)
)

In [14]:
# Re-check the distinct prices to confirm the column now holds numeric values.
df['price'].unique()

array([ 19.99, 169.  , 129.  , 139.  ,  79.9 ,  69.99, 159.  , 439.  ,
        99.9 ,  69.9 , 109.  ,  89.9 , 299.  ,  49.9 ,  59.9 , 349.  ,
        29.99, 189.  ,  12.99,  45.9 ,   9.99,  39.9 ,   7.99,  35.9 ,
        47.9 ,  27.9 ,  19.9 ,  29.9 ])

In [15]:
# Total sales per product: unit price multiplied by the sales volume.
df['Total_Sales'] = df['price'].mul(df['Sales_Volume'])

In [16]:
# Make sure the output folder for the cleaned data exists before writing to it.
cleaned_dir = '../data/cleaned data'
os.makedirs(cleaned_dir, exist_ok=True)

In [17]:
# Write the cleaned frame to disk, leaving out the positional index column.
df.to_csv(path_or_buf='../data/cleaned data/cleaned_data.csv', index=False)