In [8]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# 🌿 Cluster 002: Terpene Insights

_Datenexploration und Analyse der Terpenprofile aus dem Leafly Strain Dataset_

---

## Inhaltsverzeichnis

1. [Setup & Overview](#setup--overview)
2. [DataFrame Struktur](#dataframe-struktur)
3. [Deskriptive Statistik](#deskriptive-statistik)
4. [Visualisierung der Terpene](#visualisierung-der-terpene)
5. [Weitere Analysen & Ideen](#weitere-analysen--ideen)
6. [Fazit & To Dos](#fazit--to-dos)

---


---
## <a id="setup--overview"></a>1. Setup & Overview

- Dieses Notebook untersucht die wichtigsten Terpene verschiedener Strains.
- Alle Werte sind Median (%) oder Scores, je nach Datenquelle.
---


In [6]:
# import
df = pd.read_csv("../csv/strains.csv")
df = df.set_index("id").sort_values("id")

In [10]:
# b) Terpene
df_terpenes = df[
    [
        "name",
        "terp_caryophyllene_score",
        "terp_humulene_score",
        "terp_limonene_score",
        "terp_linalool_score",
        "terp_myrcene_score",
        "terp_ocimene_score",
        "terp_pinene_score",
        "terp_terpinolene_score",
    ]
].rename(
    columns={
        "terp_caryophyllene_score": "Caryophyllene",
        "terp_humulene_score": "Humulen",
        "terp_limonene_score": "Limonen",
        "terp_linalool_score": "Linalool",
        "terp_myrcene_score": "Mycene",
        "terp_ocimene_score": "Ocimene",
        "terp_pinene_score": "Pinen",
        "terp_terpinolene_score": "Terpinolen",
    }
)

---
## <a id="dataframe-struktur"></a>2. DataFrame Struktur

- Überblick über Spalten und Beispiel-Daten (`df_terpenes.head()`, `df_terpenes.info()`).
- Felder: name, caryophyllene_score, humulene_score, limonene_score, linalool_score, myrcene_score, ocimene_score, pinene_score, terpinolene_score.
---


In [11]:
df_terpenes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8492 entries, 1 to 518764
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8492 non-null   object 
 1   Caryophyllene  3641 non-null   float64
 2   Humulen        3641 non-null   float64
 3   Limonen        3641 non-null   float64
 4   Linalool       3641 non-null   float64
 5   Mycene         3641 non-null   float64
 6   Ocimene        3641 non-null   float64
 7   Pinen          3641 non-null   float64
 8   Terpinolen     3641 non-null   float64
dtypes: float64(8), object(1)
memory usage: 663.4+ KB


In [12]:
df_terpenes.head()

Unnamed: 0_level_0,name,Caryophyllene,Humulen,Limonen,Linalool,Mycene,Ocimene,Pinen,Terpinolen
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,AK-47,0.294669,0.101312,0.129213,0.049176,1.08545,0.026986,0.455796,0.022313
2,Chemdawg,0.449714,0.152827,0.284038,0.048102,0.378439,0.007928,0.102327,0.0193
3,Mr. Nice Guy,0.3354,0.0953,0.3162,0.0982,0.3363,0.0344,0.0991,0.001
6,Super Silver Haze,0.296869,0.107192,0.16975,0.062835,0.46937,0.010418,0.118218,0.018945
7,Northern Lights #5,0.149105,0.070376,0.111898,0.065269,0.199376,0.058726,0.382547,0.281058


---

## <a id="deskriptive-statistik"></a>3. Deskriptive Statistik

- Verteilungen, Kenngrößen, Ausreißer.
- Genutzte Methoden: `describe()`, `isnull().sum()`, `mean()`, `median()`, `quantile()`, `corr()`.
- Besonderheiten und Anmerkungen.

---


In [13]:
df_terpenes.describe()

Unnamed: 0,Caryophyllene,Humulen,Limonen,Linalool,Mycene,Ocimene,Pinen,Terpinolen
count,3641.0,3641.0,3641.0,3641.0,3641.0,3641.0,3641.0,3641.0
mean,0.352884,0.117686,0.285296,0.108516,0.435062,0.045157,0.202359,0.089165
std,0.215479,0.072688,0.203753,0.082294,0.373052,0.100829,0.182808,0.222846
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.197,0.068,0.1242,0.04927,0.17187,0.0,0.1,0.0045
50%,0.309333,0.104167,0.243333,0.088333,0.337727,0.009111,0.151667,0.017825
75%,0.468717,0.152601,0.4,0.15,0.589646,0.04532,0.24,0.03
max,1.87,0.68,1.58,0.595,3.0805,1.429346,2.497889,1.952


In [17]:
df_terpenes.isnull().sum()

name                0
Caryophyllene    4851
Humulen          4851
Limonen          4851
Linalool         4851
Mycene           4851
Ocimene          4851
Pinen            4851
Terpinolen       4851
dtype: int64

In [22]:
df_terpenes.nunique()

name             8486
Caryophyllene    2655
Humulen          2335
Limonen          2617
Linalool         2302
Mycene           2689
Ocimene          1507
Pinen            2484
Terpinolen       1486
dtype: int64

---
## <a id="visualisierung-der-terpene"></a>4. Visualisierung der Terpene

- Histogramme, Boxplots, Scatterplots (z.B. Limonene vs. Myrcene).
- Interaktive Plots mit Plotly oder klassische mit Matplotlib.
---


In [25]:
df_terpenes.corr(numeric_only=True)

Unnamed: 0,Caryophyllene,Humulen,Limonen,Linalool,Mycene,Ocimene,Pinen,Terpinolen
Caryophyllene,1.0,0.899922,0.363939,0.372073,-0.140987,-0.157652,-0.121906,-0.143407
Humulen,0.899922,1.0,0.322147,0.337573,-0.141258,-0.129004,-0.133626,-0.130973
Limonen,0.363939,0.322147,1.0,0.56217,-0.140578,-0.137822,0.013042,-0.148567
Linalool,0.372073,0.337573,0.56217,1.0,-0.122647,-0.165112,-0.095897,-0.201603
Mycene,-0.140987,-0.141258,-0.140578,-0.122647,1.0,0.199393,0.29287,-0.067529
Ocimene,-0.157652,-0.129004,-0.137822,-0.165112,0.199393,1.0,0.150016,0.298766
Pinen,-0.121906,-0.133626,0.013042,-0.095897,0.29287,0.150016,1.0,0.020245
Terpinolen,-0.143407,-0.130973,-0.148567,-0.201603,-0.067529,0.298766,0.020245,1.0


In [None]:
# Extremwerte/Ausreißer identifizieren

df_terpenes[df_terpenes["Caryophyllene"] > df_terpenes["Caryophyllene"].quantile(0.99)]

Unnamed: 0_level_0,name,Caryophyllene,Humulen,Limonen,Linalool,Mycene,Ocimene,Pinen,Terpinolen
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
424,Superman OG,1.112,0.355,0.599,0.003,0.183,0.001,0.174,0.009
519,El Niño,1.03,0.26,0.4,0.12,0.13,0.0,0.19,0.04
83439,Phantom Cookies,1.076833,0.258333,0.342167,0.056167,0.270833,0.021833,0.087167,0.004333
149647,Green Lantern,1.3505,0.374633,0.23125,0.071997,0.228267,0.0,0.055617,0.011167
160879,Skunk Dawg,1.27,0.37,0.65,0.12,0.28,0.0,0.21,0.0
178417,Chemmy Jones,1.302875,0.2617,0.4101,0.077026,1.151575,0.063851,0.692438,0.03
214323,Vanilluna,1.123,0.36,0.422,0.12,0.192,0.019,0.117,0.023
500284,Platinum Sour Diesel,1.137,0.0705,0.5715,0.179,0.3795,0.0,0.092,0.001
500386,Blueberry Triple OG,1.12,0.32,0.49,0.11,0.26,0.0,0.12,0.03
500908,SPK,1.111667,0.318333,0.453333,0.115,0.11,0.07,0.128333,0.005


In [33]:
# Extremwerte/Ausreißer identifizieren
df_terpenes[df_terpenes["Humulen"] > df_terpenes["Humulen"].quantile(0.99)]

Unnamed: 0_level_0,name,Caryophyllene,Humulen,Limonen,Linalool,Mycene,Ocimene,Pinen,Terpinolen
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
169,Chemdawg #4,0.95375,0.3775,0.21375,0.11875,0.77625,0.0075,0.07125,0.02625
666,Venice OG,0.934,0.3634,0.1918,0.0972,0.5399,0.01,0.02896,0.02
3607,Armageddon,0.9445,0.3776,0.3191,0.4005,2.0571,0.3107,0.1822,0.0
65590,Cherry Limeade,0.892,0.365,0.423,0.073,0.077,0.0,0.101,0.0
87984,Cheesewreck,0.8205,0.419,0.2995,0.013,0.1115,0.0,0.071,0.0
149647,Green Lantern,1.3505,0.374633,0.23125,0.071997,0.228267,0.0,0.055617,0.011167
160879,Skunk Dawg,1.27,0.37,0.65,0.12,0.28,0.0,0.21,0.0
177263,The Original Z,0.867948,0.361303,0.356688,0.40618,0.059369,0.007037,0.097415,0.023148
206415,Titan Haze,0.29,0.42,0.09,0.03,0.88,0.06,0.32,0.0
500390,Stardawg 91,0.98,0.42,0.56,0.03,0.37,0.0,0.14,0.03


In [None]:
df_terpenes[df_terpenes["Limonen"] > df_terpenes["Limonen"].quantile(0.99)][
    ["name", "Limonen"]
]

Unnamed: 0_level_0,name,Limonen
id,Unnamed: 1_level_1,Unnamed: 2_level_1
744,Pluto Kush,0.94
10754,OCD,0.9418
71984,Bay 11,0.962
111567,Iced Widow,0.98
123663,Supermax OG,1.2362
155888,Satellite OG,0.926
168911,Ebola #7,0.95
229391,Blob OG,0.9785
288145,Pink Starburst,1.137667
503752,Blue Fire,0.919555


In [None]:
df_terpenes[df_terpenes["Linalool"] > df_terpenes["Linalool"].quantile(0.99)][
    ["name", "Linalool"]
]

Unnamed: 0_level_0,name,Linalool
id,Unnamed: 1_level_1,Unnamed: 2_level_1
734,Red Haze,0.400181
744,Pluto Kush,0.55
2329,King's Kush,0.445102
3607,Armageddon,0.4005
69231,Orange Skunk,0.47
123663,Supermax OG,0.42452
131792,Burkle,0.48
177263,The Original Z,0.40618
503725,Animal Gas,0.405
505109,Purple Gelato,0.53


In [None]:
df_terpenes[df_terpenes["Mycene"] > df_terpenes["Mycene"].quantile(0.99)][
    ["name", "Mycene"]
]

Unnamed: 0_level_0,name,Mycene
id,Unnamed: 1_level_1,Unnamed: 2_level_1
76,Pineapple Thai,1.774
3607,Armageddon,2.0571
3637,Maui,2.355316
58096,Alien Walker,2.452667
65487,Head Cheese,1.952289
95471,Chem Valley Kush,1.901667
100623,Purple Maui,2.112936
102063,La Niña,1.938
113105,Gemstone,2.81
114415,Dawg's Waltz,1.956667


In [None]:
df_terpenes[df_terpenes["Ocimene"] > df_terpenes["Ocimene"].quantile(0.99)][
    ["name", "Ocimene"]
]

Unnamed: 0_level_0,name,Ocimene
id,Unnamed: 1_level_1,Unnamed: 2_level_1
326,Arabian Gold,0.451667
415,Blue Bayou,0.522746
912,Deep Purple,1.126941
3637,Maui,0.455718
100623,Purple Maui,0.736781
113105,Gemstone,0.545
129743,Elvis,0.465
199216,Blue Sky,1.429346
226321,Moose and Lobsta,1.011176
240530,Sugar Pine,0.793


In [None]:
df_terpenes[df_terpenes["Pinen"] > df_terpenes["Pinen"].quantile(0.99)][
    ["name", "Pinen"]
]

Unnamed: 0_level_0,name,Pinen
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3179,Jah Kush,1.187058
3733,Lavender Haze,1.390633
9699,Willy's Wonder,0.999391
65622,Guava Kush,0.9885
116847,Gigabud,1.153063
145199,Thor's Hammer,1.12
148626,B4,1.245
148783,Orange Turbo,1.194904
177623,Katsu,1.347064
230897,Chupacabra,1.03483


In [49]:
df_terpenes[df_terpenes["Terpinolen"] > df_terpenes["Terpinolen"].quantile(0.99)][
    ["name", "Terpinolen"]
]

Unnamed: 0_level_0,name,Terpinolen
id,Unnamed: 1_level_1,Unnamed: 2_level_1
8,Snowcap,1.5864
44,Cracker Jack,1.1483
52,Durban Poison,1.397397
298,Chocolope,1.285047
95631,Platinum Wreck,1.1575
97231,Moonshine Haze,1.229988
143984,Ice Queen,1.492222
160367,Dreamer’s Glass,1.89
176367,Pineapple Punch,1.4
240530,Sugar Pine,1.952


In [28]:
print("MAX:  \n", df_terpenes.idxmax(), "\n\nMIN:  \n", df_terpenes.idxmin())

MAX:  
 name             504333
Caryophyllene    516334
Humulen          506126
Limonen          515202
Linalool         506142
Mycene           505786
Ocimene          199216
Pinen            507623
Terpinolen       240530
dtype: int64 

MIN:  
 name             516532
Caryophyllene     76946
Humulen             148
Limonen             233
Linalool            392
Mycene           212847
Ocimene               8
Pinen            143503
Terpinolen           47
dtype: int64


---
## <a id="weitere-analysen--ideen"></a>5. Weitere Analysen & Ideen

- (Platz für Clusterings, spannende Strains, Korrelationen etc.)
- Hier können bei Bedarf noch Deep Dives oder Ausreißer-Betrachtungen ergänzt werden.
---


---
## <a id="fazit--to-dos"></a>6. Fazit & To Dos

- Wichtigste Erkenntnisse der Analyse.
- Eventuelle offene Fragen und nächste Schritte.
---
