In [None]:
import numpy as np
import pandas as pd
import os
import yellowbrick
import pickle
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
import plotly.express as px

from matplotlib.collections import PathCollection
from statsmodels.graphics.gofplots import qqplot
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Capture similarity 
from sklearn.metrics.pairwise import linear_kernel

import os
# Collect the full path of every file below the Kaggle input directory,
# then print them one per line in traversal order.
input_paths = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)



**Reading the dataset**

In [None]:
# Load the product catalogue from the Kaggle input directory.
data = pd.read_csv('../input/fashion-clothing-products-catalog/myntra_products_catalog.csv')
print('Dataset contains {} rows and {} columns.\n'.format(data.shape[0],data.shape[1]))

# Column dtypes and non-null counts (info() prints its report directly).
data.info()
print('\n')

# Only the last expression of a cell is rendered automatically, so the preview
# must be displayed explicitly; a bare `data.head()` here is silently dropped.
# `display` is provided by the IPython kernel without an import.
display(data.head())

# Missing values per column (rendered as the cell's output).
data.isnull().sum()

**FINDING OUTLIERS : NUMIMAGES**

In [None]:
# Outlier check for NumImages: summary statistics followed by a box plot.
# A bare expression in the middle of a cell is never rendered, so the summary
# is displayed explicitly instead of being computed and thrown away.
# `display` is provided by the IPython kernel without an import.
display(data[["NumImages"]].describe())

# Points drawn beyond the whiskers are the candidate outliers.
fig1 = px.box(data, y="NumImages", title="Distribution of NumImages")
fig1.show()

**FINDING OUTLIERS : PRICE (INR)**

In [None]:
# Outlier check for Price (INR): summary statistics followed by a box plot.
# A bare expression in the middle of a cell is never rendered, so the summary
# is displayed explicitly instead of being computed and thrown away.
# `display` is provided by the IPython kernel without an import.
display(data[["Price (INR)"]].describe())

# Points drawn beyond the whiskers are the candidate outliers.
fig2 = px.box(data, y="Price (INR)", title="Distribution of Price (INR)")
fig2.show()

**DATA DESCRIPTION**

In [None]:
# Summary statistics of the dataset, rendered as the cell output.
data.describe()

**COUNTING THE MISSING VALUES AND OUTLIERS**

In [None]:
# Count missing values per column and IQR-rule outliers in the numeric columns.
missing_data = data.isna().sum()

# Only numeric columns can be compared against the IQR fences.
nums = ["int64", "float64"]
newdata = data.select_dtypes(include=nums)

# quantile() skips NaN, whereas np.percentile() returns NaN for any column
# that contains one, which would make every comparison below False and
# silently report zero outliers for that column.
q1 = newdata.quantile(0.25)
q3 = newdata.quantile(0.75)
interq = q3 - q1
lower_b = q1 - 1.5 * interq
upper_b = q3 + 1.5 * interq

# Each column is compared against its own fences; NaN cells compare False
# and are therefore not counted as outliers.
is_outlier = newdata.lt(lower_b, axis="columns") | newdata.gt(upper_b, axis="columns")
out = int(is_outlier.sum().sum())

print("missing val : ", missing_data.sum())
print("outliers: ", out)



**PERCENTAGE OF THE MISSING DATA**

In [None]:
# Share of missing entries in each column, printed as a whole-number percentage.
for column_name in data.columns:
    missing_share = data[column_name].isnull().mean()
    print('{} - {}%'.format(column_name, round(missing_share * 100)))
    

**REMOVAL OF ROWS WITH NULL VALUES**

In [None]:
# Drop the rows with missing data. dropna() returns a new frame rather than
# modifying `data`, so the result must be assigned back; a bare call leaves
# every row with missing values in place.
data = data.dropna()

# Remaining (rows, columns) after the drop.
data.shape

**CHECKING FOR DUPLICATE DATA**

In [None]:
# Rows that are exact repeats of an earlier row.
dup = data[data.duplicated()]

# Report the computed count rather than dumping the frame after the label;
# the figure now reflects the data instead of a hard-coded remark.
print("Duplicate rows: ", len(dup))

# Preview of the duplicated rows (an empty frame when there are none).
dup.head()

**CORRELATION MATRIX**

In [None]:
# Pairwise correlation between the numeric columns only. Calling corr() on a
# frame that still contains string columns raises on pandas >= 2.0, where
# non-numeric columns are no longer dropped automatically.
correlations = data.select_dtypes(include="number").corr()
correlations

In [None]:
# Draw the correlation matrix as a heatmap on an explicitly created axes.
heatmap_fig, heatmap_ax = plt.subplots()
sns.heatmap(correlations, ax=heatmap_ax)
plt.show()


**PRINTING THE FIRST 5 ROWS OF THE DATASET**

In [None]:
# Preview the first rows of the dataset (head() returns five rows by default).
data.head()

**FINDING THE COUNT OF PRODUCTS PER BRAND**

In [None]:
# Number of catalogue rows per brand, most frequent brand first.
brand_column = data.loc[:, 'ProductBrand']
productbrand_count = brand_column.value_counts()

In [None]:
# Render the per-brand product counts as the cell output.
productbrand_count

**KNOWLEDGE-BASED RECOMMENDER SYSTEM**

In [None]:

# Columns the recommender filters on.
relevant_features = [
    'ProductBrand',
    'Gender',
    'Price (INR)',
    'PrimaryColor',
]

# Keep only those columns; from here on `data` refers to the reduced frame.
# NOTE(review): none of the kept columns identifies an individual product, so
# the recommendations will only echo the filter values - confirm whether a
# product name/ID column should be kept as well.
data = data.loc[:, relevant_features]

# Preview the reduced dataframe
data.head()

**THE FUNCTION FOR THE KNOWLEDGE-BASED RECOMMENDER SYSTEM**

In [None]:
def _normalized(values):
    """Strip surrounding whitespace and case-fold a str or a Series of strings."""
    if isinstance(values, str):
        return values.strip().casefold()
    return values.str.strip().str.casefold()


def _filter_catalog(df, gender, min_price, max_price, brand, color):
    """Return the rows of ``df`` that satisfy every preference.

    Text preferences are compared ignoring case and surrounding whitespace;
    the price range is inclusive at both ends. Rows whose text or price value
    is missing never match.
    """
    matches = (
        (_normalized(df['Gender']) == _normalized(gender))
        & df['Price (INR)'].between(min_price, max_price)
        & (_normalized(df['ProductBrand']) == _normalized(brand))
        & (_normalized(df['PrimaryColor']) == _normalized(color))
    )
    # Boolean indexing already yields a new frame, so ``df`` is left untouched.
    return df[matches]


def build_chart(df, percentile=0.8):
    """Prompt for shopping preferences and return the matching catalogue rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Catalogue with the columns 'Gender', 'Price (INR)', 'ProductBrand'
        and 'PrimaryColor'.
    percentile : float, optional
        Not used by the filter; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose gender, brand and colour equal the typed
        values (ignoring case and surrounding whitespace) and whose price
        lies within the typed inclusive range.

    Raises
    ------
    ValueError
        If either price typed at the prompt is not a number.
    """
    print("Input preferred gender")
    gender = input()

    # float() accepts everything int() did and also decimal prices.
    print("Input least price")
    min_price = float(input())

    print("Input most price")
    max_price = float(input())

    print("Input preferred brand")
    brand = input()

    print("Input prefered color")
    color = input()

    return _filter_catalog(df, gender, min_price, max_price, brand, color)
    

In [None]:
# Number of suggestions to keep; raise this value to display more rows.
max_suggestions = 8
matching_products = build_chart(data)
personal_recommendations = matching_products.head(max_suggestions)

**THE ABOVE IS THE INPUT TAKEN FROM THE USER TO MAKE THE RECOMMENDATIONS**

In [None]:
# Render the filtered recommendations as the cell output.
personal_recommendations