## Importing necessary libraries

In [1]:
# Importing necessary libraries:
import os
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
warnings.filterwarnings("ignore")

## Importing dataset

In [2]:
# Importing the data:
# The CSV location can be overridden through the DIAMOND_PRICE_CSV environment
# variable so the notebook is not tied to one machine's directory layout; when
# the variable is unset, the original local path is used unchanged.
data_path = os.environ.get(
    "DIAMOND_PRICE_CSV", "C:/Users/yuvan/Desktop/diamond_price.csv"
)
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


### General analysis of the data

In [3]:
# id column can be removed:
# Rebinding `data` (instead of dropping in place) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError for the already-removed 'id' column.
data = data.drop(columns='id', errors='ignore')

In [4]:
# Shape: unpack (rows, columns) once and report both.
n_rows, n_cols = data.shape
print(f"The dataset contains {n_rows} rows and {n_cols} columns")

The dataset contains 193573 rows and 10 columns


In [5]:
# Columns: show the column labels as a plain Python list.
print(data.columns.tolist())

['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'price']


In [6]:
# Duplicated rows: flag every row that repeats an earlier one, then count the flags.
duplicate_mask = data.duplicated()
duplicate_mask.sum()

np.int64(0)

In [7]:
# Null values: number of missing entries per column.
null_counts = data.isnull().sum()
null_counts

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [8]:
# Datatype of each column:
column_dtypes = data.dtypes
column_dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
price        int64
dtype: object

In [9]:
# List of numeric and categoric variables:
# select_dtypes classifies columns by dtype family, so the split does not hinge
# on text columns being stored as 'object' (a 'string' or 'category' column
# would otherwise land in the numeric list). Every column that is not numeric
# is treated as categoric; the original column order is preserved.
num_col = data.select_dtypes(include='number').columns.tolist()
cate_col = data.select_dtypes(exclude='number').columns.tolist()

print(f"Numeric Columns: {num_col}\nCategoric Columns: {cate_col}")

Numeric Columns: ['carat', 'depth', 'table', 'x', 'y', 'z', 'price']
Categoric Columns: ['cut', 'color', 'clarity']


In [10]:
# Description of Numeric columns: summary statistics for the numeric subset.
numeric_frame = data.loc[:, num_col]
numeric_frame.describe()

Unnamed: 0,carat,depth,table,x,y,z,price
count,193573.0,193573.0,193573.0,193573.0,193573.0,193573.0,193573.0
mean,0.790688,61.820574,57.227675,5.715312,5.720094,3.534246,3969.155414
std,0.462688,1.081704,1.918844,1.109422,1.102333,0.688922,4034.374138
min,0.2,52.1,49.0,0.0,0.0,0.0,326.0
25%,0.4,61.3,56.0,4.7,4.71,2.9,951.0
50%,0.7,61.9,57.0,5.7,5.72,3.53,2401.0
75%,1.03,62.4,58.0,6.51,6.51,4.03,5408.0
max,3.5,71.6,79.0,9.65,10.01,31.3,18818.0


In [11]:
# Description of Categoric columns: count / unique / top / freq for the categoric subset.
categoric_frame = data.loc[:, cate_col]
categoric_frame.describe()

Unnamed: 0,cut,color,clarity
count,193573,193573,193573
unique,5,7,8
top,Ideal,G,SI1
freq,92454,44391,53272


In [12]:
# Unique values in categoric columns: one line per column, in order of first appearance.
for col in cate_col:
    distinct_values = data[col].unique()
    print(f"{col}---> {distinct_values}")

cut---> ['Premium' 'Very Good' 'Ideal' 'Good' 'Fair']
color---> ['F' 'J' 'G' 'E' 'D' 'H' 'I']
clarity---> ['VS2' 'SI2' 'VS1' 'SI1' 'IF' 'VVS2' 'VVS1' 'I1']


In [13]:
# Value counts of categoric variables: frequency table per column,
# each followed by a 30-dash separator line.
separator = '-' * 30
for col in cate_col:
    counts = data[col].value_counts()
    print(f"{counts}\n{separator}")

cut
Ideal        92454
Premium      49910
Very Good    37566
Good         11622
Fair          2021
Name: count, dtype: int64
------------------------------
color
G    44391
E    35869
F    34258
H    30799
D    24286
I    17514
J     6456
Name: count, dtype: int64
------------------------------
clarity
SI1     53272
VS2     48027
VS1     30669
SI2     30484
VVS2    15762
VVS1    10628
IF       4219
I1        512
Name: count, dtype: int64
------------------------------
