# Importing the dependencies

In [49]:
import numpy as np 
import pandas as pd 
import os
import json
import ast

# Data Loading

In [21]:
# Directory that holds one JSON document per product style, and the list that
# accumulates the parsed records
directory_path = '/kaggle/input/fashion-product-images-dataset/fashion-dataset/styles'
data_list = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order of the
# DataFrame (and of the saved CSV) reproducible between runs
file_list = sorted(os.listdir(directory_path))

# Keep only the JSON files
json_files = [file_name for file_name in file_list if file_name.endswith('.json')]

# Parse every file and keep the record stored under its top-level 'data' key
for file_name in json_files:
    file_path = os.path.join(directory_path, file_name)
    # Read as UTF-8 explicitly instead of relying on the platform default encoding
    with open(file_path, 'r', encoding='utf-8') as json_file:
        json_data = json.load(json_file)
    data_list.append(json_data['data'])

# Creating a DataFrame from the list of dictionaries and saving it for later sessions.
# The index is written as well; the column-dropping cell further down removes the
# resulting 'Unnamed: 0' column after the file is read back.
df = pd.DataFrame(data_list)
df.to_csv('/kaggle/working/data.csv')

In [None]:
# Loading the data from the previously saved CSV file.
# NOTE(review): the loading cell writes to '/kaggle/working/data.csv' while this reads
# '../Dataset/data.csv' — presumably the file was copied there manually; confirm.
df = pd.read_csv('../Dataset/data.csv')

### Defining utility functions

In [None]:
# Function to convert string representation to dictionary
def parse_master_category(string_repr):
    """Turn the textual form of a Python literal (typically a dict) into an object.

    Parameters
    ----------
    string_repr : str or dict or any
        Usually the ``str()`` form of a dict as stored in a CSV cell. A value that
        is already a dict is returned unchanged; any other non-string value
        (for example NaN from a missing cell) yields ``None``.

    Returns
    -------
    object or None
        The evaluated literal, or ``None`` when the text cannot be parsed.
    """
    if isinstance(string_repr, dict):
        # Already parsed (e.g. the frame was built in memory, not read from CSV)
        return string_repr
    if not isinstance(string_repr, str):
        # Missing cells come back from read_csv as float NaN and have no .replace()
        return None

    # Try the text as-is first: a Python repr is already a valid literal, and blindly
    # swapping quote characters corrupts values that contain an apostrophe.
    # The quote-swapped variant is kept only as a fallback for inputs that the
    # previous implementation accepted.
    for candidate in (string_repr, string_repr.replace("'", "\"")):
        try:
            return ast.literal_eval(candidate)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError for malformed text and ValueError
            # for well-formed text that is not a plain literal
            continue
    return None

# Understanding the data (Overview)

In [8]:
# Peek at three randomly chosen rows to get a feel for the columns and their values
df.sample(n=3)

Unnamed: 0.1,Unnamed: 0,id,price,discountedPrice,styleType,productTypeId,articleNumber,visualTag,productDisplayName,variantName,...,otherFlags,articleDisplayAttr,productDescriptors,styleOptions,colours,discountData,productSpecificationEntry,relatedStyleOptions,associatedImageProducts,styleVideoAlbumList
23533,23533,21532,750.0,750.0,P,598,KIARA0077 Grey,,Kiara Women Sporty Look Grey Handbag,Sporty Look,...,"[{'dataType': 'BOOLEAN', 'name': 'isFragile', ...","{'id': 46, 'core': {'order': '0', 'display': '...",{'materials_care_desc': {'descriptorType': 'ma...,"[{'id': 93359, 'name': 'Size', 'value': 'Onesi...",{'colors': {'21533': {'dre_landing_page_url': ...,,,,,
35635,35635,48439,1699.0,1699.0,P,289,56IJ6-White,,French Connection Men White Printed T-shirt,IND EAGLE DEAN SS CREW,...,"[{'dataType': 'BOOLEAN', 'name': 'isFragile', ...","{'id': 90, 'core': {'order': '0', 'display': '...",{'description': {'descriptorType': 'descriptio...,"[{'id': 188652, 'name': 'Size', 'value': 'S', ...",{'colors': {'48438': {'dre_landing_page_url': ...,,,,,
16373,16373,44138,2800.0,2800.0,P,399,3139420624019,,Dunhill Men Desire Blue Perfume,DUNHILL DESIRE BLUE EDT,...,"[{'dataType': 'BOOLEAN', 'name': 'isFragile', ...","{'id': 162, 'core': {'order': '0', 'display': ...",{'materials_care_desc': {'descriptorType': 'ma...,"[{'id': 175432, 'name': 'Size', 'value': '50ML...",,,,,,


In [11]:
# Defining all the columns that are removed before the analysis.
# 'Unnamed: 0' and 'Unnamed: 0.1' are index columns left over from saving the frame
# to CSV with its index and reading it back.
exclude_attributes = ['landingPageUrl', 'crossLinks','brandUserProfile','codEnabled','styleImages','lookGoodAlbum','style360Images','subCategory'
        ,'articleType','otherFlags','articleDisplayAttr','productDescriptors','styleOptions','Unnamed: 0','Unnamed: 0.1','id','colours', 'discountData', 'productSpecificationEntry',
       'relatedStyleOptions', 'associatedImageProducts','styleVideoAlbumList','productTypeId','articleNumber','colour1','colour2','visualTag',
        'catalogAddDate','navigationId','articleAttributes']

# Dropping unnecessary columns.
# errors='ignore' skips names that are not present, so the cell can be re-run (or run
# against a frame that lacks some of these columns) without raising KeyError.
df = df.drop(columns=exclude_attributes, errors='ignore')

In [None]:
# Replace the textual masterCategory cells with the objects they describe
df = df.assign(masterCategory=df['masterCategory'].apply(parse_master_category))

In [34]:
# Information about the attributes: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43483 entries, 0 to 44445
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   price               43483 non-null  float64
 1   discountedPrice     43483 non-null  float64
 2   styleType           43483 non-null  object 
 3   productDisplayName  43476 non-null  object 
 4   variantName         43476 non-null  object 
 5   myntraRating        43483 non-null  int64  
 6   brandName           43483 non-null  object 
 7   ageGroup            43482 non-null  object 
 8   gender              43483 non-null  object 
 9   baseColour          43468 non-null  object 
 10  fashionType         43480 non-null  object 
 11  season              43462 non-null  object 
 12  year                43482 non-null  float64
 13  usage               43166 non-null  object 
 14  vat                 43483 non-null  float64
 15  displayCategories   37817 non-null  object 
 16  weig

In [35]:
# Flag exact duplicate rows.
# When the notebook is run top to bottom, masterCategory already holds dict objects here;
# dicts are unhashable, so DataFrame.duplicated() on the raw frame raises TypeError.
# Comparing a string-typed view of the frame avoids that.
duplicate_mask = df.astype(str).duplicated()
duplicate_vals = duplicate_mask.sum()
if duplicate_vals != 0:
    print("Data have total :",duplicate_vals,"duplicate values")
    # Keep the first occurrence of every row, as drop_duplicates() does by default
    df = df.loc[~duplicate_mask]

# Renumber the rows 0..n-1 so later positional and label access agree
df = df.reset_index(drop=True)

Data have total : 749 duplicate values


In [52]:
# Share of missing entries in every column, expressed as a percentage
df.isna().mean().mul(100)

price                  0.000000
discountedPrice        0.000000
styleType              0.000000
productDisplayName     0.016380
variantName            0.016380
myntraRating           0.000000
brandName              0.000000
ageGroup               0.002340
gender                 0.000000
baseColour             0.035101
fashionType            0.007020
season                 0.049141
year                   0.002340
usage                  0.741798
vat                    0.000000
displayCategories     13.013058
weight                 0.000000
navigationId           0.000000
articleAttributes      0.000000
masterCategory         0.000000
isEMIEnabled           0.000000
dtype: float64

In [59]:
# Pull the 'isReturnable' / 'isExchangeable' entries of each masterCategory dict into
# columns of their own, then discard the nested column.
isReturnable_lst = []
isExchangeable_lst = []

# Iterate over the column itself: df.iloc[i] would build a complete row Series
# twice per row, which is needlessly slow on tens of thousands of rows.
for master_category in df['masterCategory']:
    if isinstance(master_category, dict):
        isReturnable_lst.append(master_category.get('isReturnable'))
        isExchangeable_lst.append(master_category.get('isExchangeable'))
    else:
        # parse_master_category returns None for text it cannot parse; record the
        # flags as missing instead of aborting the cell with a TypeError
        isReturnable_lst.append(None)
        isExchangeable_lst.append(None)

# Lists are assigned positionally, one value per row
df['isReturnable'] = isReturnable_lst
df['isExchangeable'] = isExchangeable_lst
df = df.drop(columns=['masterCategory'])

In [75]:
# Saving the cleaned dataset.
# NOTE(review): to_csv() writes the index as an unnamed first column by default, so reading
# this file back with pd.read_csv() yields an extra 'Unnamed: 0' column — consider
# index=False if no downstream reader depends on that column.
df.to_csv('../Dataset/clean_data_V1.csv')

# Exploratory data analysis

# Feature engineering

# Train test split

# Model training

# Model evaluation