In [1]:
# Chemical Compound Analysis  
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool

In [2]:
# Read the cosmetics table from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='cosmetics.csv')

In [3]:
# Show five randomly sampled rows. The sample is seeded so that a fresh
# "Restart & Run All" prints the same rows every time (an unseeded sample
# changes on each run and is not "the first five rows").
print("Five Rows:")
print(df.sample(5, random_state=42))

Five Rows:
           Label         Brand  \
269  Sun protect      CLINIQUE   
189     Cleanser      SHISEIDO   
385  Moisturizer  BAREMINERALS   
478     Cleanser      CAUDALIE   
251  Moisturizer         TARTE   

                                                  Name  Price  Rank  \
269         Broad Spectrum SPF 50 Sunscreen Face Cream     26   4.2   
189                       Pureness Deep Cleansing Foam     22   4.2   
385  COMPLEXION RESCUE™ Tinted Moisturizer Broad Sp...     30   3.9   
478                          Deep Cleansing Exfoliator     35   4.6   
251  4-in-1 Setting Mist - Rainforest of the Sea™ C...     25   3.9   

                                           Ingredients  Combination  Dry  \
269  Oxybenzone 5.00% , Octisalate 5.00% , Homosala...            0    0   
189  Water, Stearic Acid, PEG-8, Myristic Acid, Gly...            1    1   
385  Water, Coconut Alkanes, Propanediol, Squalane,...            0    0   
478  Water, Polylactic Acid, Hexyldecyl Stearate, G..

In [4]:
# List the column labels of the table loaded from cosmetics.csv.
# Left as a bare last expression so the notebook renders the Index itself.
df.columns

Index(['Label', 'Brand', 'Name', 'Price', 'Rank', 'Ingredients', 'Combination',
       'Dry', 'Normal', 'Oily', 'Sensitive'],
      dtype='object')

In [5]:
# Tally how many products fall under each category in the 'Label' column
print("\nProduct Type Distribution:", df['Label'].value_counts(), sep="\n")


Product Type Distribution:
Moisturizer    298
Cleanser       281
Face Mask      266
Treatment      248
Eye cream      209
Sun protect    170
Name: Label, dtype: int64


In [6]:

# Keep only the rows whose product category is 'Moisturizer'
moisturizers = df.loc[df['Label'].eq('Moisturizer')]


In [7]:

# Narrow the moisturizers down to those flagged (Dry == 1) for dry skin
moisturizers_dry = moisturizers.loc[moisturizers['Dry'].eq(1)]

In [8]:
# Replace the row labels inherited from the unfiltered table with a fresh
# 0..n-1 index; drop=True discards the old labels instead of keeping them
# as an extra column.
moisturizers_dry = moisturizers_dry.reset_index(drop=True) #index reset

In [9]:

def display_filtered_data():
    """Print a heading followed by the leading rows of ``moisturizers_dry``.

    Reads the notebook-level ``moisturizers_dry`` frame; takes no arguments
    and returns nothing.
    """
    preview = moisturizers_dry.head()
    print("\nFiltered Data (Moisturizers for Dry Skin):", preview, sep="\n")

display_filtered_data()


Filtered Data (Moisturizers for Dry Skin):
         Label             Brand                                       Name  \
0  Moisturizer          CAUDALIE                                Grape Water   
1  Moisturizer  FIRST AID BEAUTY              Ultra Repair Face Moisturizer   
2  Moisturizer       MILK MAKEUP                              Cooling Water   
3  Moisturizer           LANEIGE                        Water Sleeping Mask   
4  Moisturizer       JOSIE MARAN  Argan Infinity Cream Intensive Creamy Oil   

   Price  Rank                                        Ingredients  \
0     10   4.5  Vitis Vinifera (Grape) Fruit Water*, Vitis Vin...   
1     24   4.2  Water, Glyceryl Stearate Se, Glycerin, Capryli...   
2     24   3.8  Water, Butylene Glycol, Bis-PEG-18 Methyl Ethe...   
3     25   4.4  Water, Butylene Glycol, Cyclopentasiloxane, Gl...   
4     28   4.5  -100 Percent Pure Argan Oil: Nourishes and pro...   

   Combination  Dry  Normal  Oily  Sensitive  
0            1    1

In [10]:

# corpus: one token list per product, in row order.
# ingredient_idx: ingredient string -> column position, assigned in order of
# first appearance.
corpus = []
ingredient_idx = {}

for ingredients in moisturizers_dry['Ingredients']:
    # Lower-case the raw ingredient string and cut it at every ', '
    tokens = ingredients.lower().split(', ')
    corpus.append(tokens)
    for token in tokens:
        # A token seen for the first time gets the next free position,
        # which equals the current size of the mapping.
        ingredient_idx.setdefault(token, len(ingredient_idx))

# Next free position, i.e. the number of distinct ingredients seen
idx = len(ingredient_idx)

print("\nNumber of unique ingredients:", len(ingredient_idx))
print("Sample tokens:", corpus[:2])


Number of unique ingredients: 2231
Sample tokens: [['vitis vinifera (grape) fruit water*', 'vitis vinifera (grape) juice*', 'nitrogen. *plant origin.'], ['water', 'glyceryl stearate se', 'glycerin', 'caprylic/capric triglyceride', 'cetyl alcohol', 'collodial oatmeal', 'dimethicone', 'squalane', 'urea', 'allantoin', 'ceramide np', 'butyrospermum parkii (shea) butter', 'limnanthes alba (meadowfoam) seed oil', 'persea gratissima (avocado) oil', 'chrysanthemum parthenium (feverfew) extract', 'camellia sinensis leaf extract', 'glycyrrhiza glabra (licorice) root extract', 'caprylyl glycol', 'phenoxyethanol', 'carbomer', 'sodium hydroxide', 'leuconostoc/radish root ferment filtrate', 'edta.']]


In [11]:

# Rows = products in the filtered frame, columns = distinct ingredient tokens
M, N = len(moisturizers_dry), len(ingredient_idx)

# Start from an all-zero M x N matrix; it is filled in afterwards
A = np.zeros(shape=(M, N))

print("\nDocument-Term Matrix shape:", A.shape)


Document-Term Matrix shape: (190, 2231)


In [12]:

def oh_encoder(tokens):
    """Return a length-``N`` 0/1 vector marking which known tokens are present.

    Each token found in the notebook-level ``ingredient_idx`` mapping sets the
    entry at its mapped position to 1; tokens missing from the mapping are
    skipped.
    """
    encoded = np.zeros(N)
    for token in tokens:
        position = ingredient_idx.get(token)
        if position is not None:
            encoded[position] = 1
    return encoded

sample_tokens = corpus[0]
print("\nOne-hot encoding for sample tokens:", oh_encoder(sample_tokens))


One-hot encoding for sample tokens: [1. 1. 1. ... 0. 0. 0.]


In [13]:

# Write each product's encoded token vector into its row of A
# (iterating A yields row views, so the slice assignment updates A in place)
for matrix_row, product_tokens in zip(A, corpus):
    matrix_row[:] = oh_encoder(product_tokens)

# Peek at the top-left 5 x 10 corner of the filled matrix
print("\nPopulated Document-Term Matrix:", A[:5, :10], sep="\n")


Populated Document-Term Matrix:
[[1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 1. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 1. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [14]:

# Reduce the document-term matrix to two dimensions with a seeded t-SNE
model = TSNE(
    n_components=2,
    learning_rate=200,
    random_state=42,
)
tsne_features = model.fit_transform(A)

# Store embedding column 0 as 'X' and column 1 as 'Y' on the product frame
for component, column_name in enumerate(['X', 'Y']):
    moisturizers_dry[column_name] = tsne_features[:, component]

print("\nt-SNE Features:", moisturizers_dry[['X', 'Y']].head(), sep="\n")




t-SNE Features:
          X         Y
0 -0.683878  0.327165
1 -1.752962  2.900554
2  0.977694  1.045864
3  0.339708  3.150104
4 -0.811539  0.364490


In [15]:

# Wrap the product frame (now carrying the 'X'/'Y' t-SNE columns) so that
# glyphs and tooltips can refer to its columns by name.
source = ColumnDataSource(moisturizers_dry)


plot = figure(title="t-SNE Visualization of Moisturizers for Dry Skin",
              x_axis_label="t-SNE 1",
              y_axis_label="t-SNE 2",
              width=800, height=600)


# One circular marker per product. scatter() with marker="circle" is used
# because passing `size` to circle() is deprecated since Bokeh 3.4; the
# rendered markers are the same screen-sized circles.
plot.scatter(x="X", y="Y", source=source, marker="circle",
             size=10, color="navy", alpha=0.6)



In [18]:

# (label, template) pairs shown when hovering a point; "@col" pulls the value
# of that column from the plot's data source.
tooltip_fields = [
    ("Item", "@Name"),
    ("Brand", "@Brand"),
    ("Price", "$@Price"),
    ("Rank", "@Rank"),
]
hover = HoverTool(tooltips=tooltip_fields)

# Attach the hover tool to the figure, then render it
plot.add_tools(hover)
show(plot)

In [17]:

# Look up two products by exact name
product_1 = moisturizers_dry.loc[moisturizers_dry['Name'] == "Color Control Cushion Compact Broad Spectrum SPF 50+"]
product_2 = moisturizers_dry.loc[moisturizers_dry['Name'] == "BB Cushion Hydra Radiance SPF 50"]

if product_1.empty or product_2.empty:
    print("\nOne or both products not found in the dataset.")
else:
    # Same report for both products: heading, summary columns, then the
    # ingredient list of the first matching row.
    report_sections = (
        ("\nProduct 1 Details:", product_1),
        ("\nProduct 2 Details:", product_2),
    )
    for heading, product in report_sections:
        print(heading)
        print(product[['Name', 'Brand', 'Price', 'Rank']])
        print("Ingredients:", product['Ingredients'].iloc[0])


Product 1 Details:
                                                  Name         Brand  Price  \
117  Color Control Cushion Compact Broad Spectrum S...  AMOREPACIFIC     60   

     Rank  
117   4.0  
Ingredients: Phyllostachis Bambusoides Juice, Cyclopentasiloxane, Cyclohexasiloxane, Peg-10 Dimethicone, Phenyl Trimethicone, Butylene Glycol, Butylene Glycol Dicaprylate/Dicaprate, Alcohol, Arbutin, Lauryl Peg-9 Polydimethylsiloxyethyl Dimethicone, Acrylates/Ethylhexyl Acrylate/Dimethicone Methacrylate Copolymer, Polyhydroxystearic Acid, Sodium Chloride, Polymethyl Methacrylate, Aluminium Hydroxide, Stearic Acid, Disteardimonium Hectorite, Triethoxycaprylylsilane, Ethylhexyl Palmitate, Lecithin, Isostearic Acid, Isopropyl Palmitate, Phenoxyethanol, Polyglyceryl-3 Polyricinoleate, Acrylates/Stearyl Acrylate/Dimethicone Methacrylate Copolymer, Dimethicone, Disodium Edta, Trimethylsiloxysilicate, Ethylhexyglycerin, Dimethicone/Vinyl Dimethicone Crosspolymer, Water, Silica, Camellia Japoni