In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Importing Data

In [2]:
# Read the crop-recommendation CSV into a DataFrame and preview its first rows.
crop = pd.read_csv("data/Crop_recommendation.csv")
crop.head()

# Describing Data

In [3]:
# Dimensions of the dataset as a (rows, columns) tuple.
crop.shape

In [4]:
# Column names, dtypes and non-null counts.
crop.info()

In [5]:
# Number of missing values in each column.
crop.isnull().sum()

In [6]:
# Number of rows that are exact duplicates of an earlier row.
crop.duplicated().sum()

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
crop.describe()

In [8]:
# Number of rows per crop label (class balance check).
crop['label'].value_counts()

# Barplot : Répartition des classes (Distribution des labels)

In [9]:
# Bar chart with one bar per crop label; bar height is the number of rows with that label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=crop, x='label', palette='viridis', ax=ax)

ax.set_title("Distribution des classes", fontsize=16)
ax.set_xlabel("Classes", fontsize=14)
ax.set_ylabel("Nombre d'occurrences", fontsize=14)

# Tilt the tick labels so that a large number of class names stays readable.
ax.tick_params(axis="x", labelrotation=45)

fig.tight_layout()
plt.show()

# Scatterplot

# Distribution des variables

In [11]:
# Numeric feature columns (per the original author's note: every column except 'label').
num_cols = ["N", "P", "K", "temperature", "humidity", "ph", "rainfall"]

# One histogram per feature to inspect the shape of each raw distribution (skew, spread).
crop[num_cols].hist(figsize=(12, 8), bins=20)

# The accented character in the title was mis-encoded; it is restored here so the
# figure title renders correctly.
plt.suptitle("Distribution des variables numériques (brut)", fontsize=14)
plt.tight_layout()
plt.show()



# Calculer la matrice de corrélation

In [12]:
# Features whose pairwise correlations are shown in the heatmap.
num_cols = ["N", "P", "K", "temperature", "humidity", "ph", "rainfall"]

# Annotated heatmap of the correlation matrix; each cell shows the coefficient
# rounded to two decimals.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(crop[num_cols].corr(), annot=True, cmap="Greens", fmt=".2f", ax=ax)

# The accented character in the title was mis-encoded; restored so it renders correctly.
ax.set_title("Matrice de corrélation entre les variables")
plt.show()

Les couples importants :

1️⃣ P – K (corrélation = 0.74)

Très forte corrélation → IMPÉRATIF de l’utiliser en JointPlot

Indique que les cultures qui nécessitent beaucoup de Phosphore nécessitent aussi beaucoup de Potassium.

Très intéressant biologiquement et pour ton rapport.

2️⃣ temperature – humidity (corrélation = 0.21)

Faible mais logique → relation climat.

Utile pour comprendre quelles cultures préfèrent un climat chaud/humide.

3️⃣ humidity vs N ou humidity vs K

pour compléter la partie nutrition / climat

# 1. JointPlot : P vs K (obligatoire — corrélation forte 0.74)

In [13]:
# Joint plot of phosphorus (P) against potassium (K): a scatter plot in the centre
# with the marginal distributions on the sides, coloured by crop label.
sns.set(style="white", palette="muted")

x_var = "P"
y_var = "K"

g = sns.jointplot(
    data=crop,
    x=x_var,
    y=y_var,
    hue="label",
    kind="scatter",
    height=8,
    alpha=0.7,
    s=30
)

# Swap the legend drawn inside the joint axes for a figure-level legend so that
# it does not cover the data points.
leg = g.ax_joint.get_legend()
if leg is not None:
    handles, labels = g.ax_joint.get_legend_handles_labels()
    leg.remove()

    # loc="center left" anchors the legend's left edge at x=0.99 (figure
    # coordinates), i.e. the box sits against the right border of the figure.
    g.figure.legend(
        handles, labels, title="Culture",
        loc="center left",
        bbox_to_anchor=(0.99, 0.5),
        borderaxespad=0.2,
        frameon=True
    )

# The en dash in the title was mis-encoded; restored so the title renders correctly.
# `g.figure` is the preferred accessor (`g.fig` is its deprecated alias).
g.figure.suptitle("JointPlot – Relation entre P (phosphore) et K (potassium)", y=1.02, fontsize=14)

plt.show()


# 2. JointPlot : temperature vs humidity (climat / culture)

In [14]:
# Joint plot of temperature against humidity, coloured by crop label, to look at
# the climate conditions associated with each crop.
sns.set(style="white", palette="muted")

x_var = "temperature"
y_var = "humidity"

g = sns.jointplot(
    data=crop,
    x=x_var,
    y=y_var,
    hue="label",
    kind="scatter",
    height=8,
    alpha=0.7,
    s=30
)

# Swap the in-axes legend for a figure-level legend placed at the right border.
leg = g.ax_joint.get_legend()
if leg is not None:
    handles, labels = g.ax_joint.get_legend_handles_labels()
    leg.remove()

    # loc="center left" anchors the legend's left edge at x=0.99 (figure coordinates).
    g.figure.legend(
        handles, labels, title="Culture",
        loc="center left",
        bbox_to_anchor=(0.99, 0.5),
        borderaxespad=0.2,
        frameon=True
    )

# The en dash in the title was mis-encoded; restored so the title renders correctly.
# `g.figure` is the preferred accessor (`g.fig` is its deprecated alias).
g.figure.suptitle("JointPlot – Relation entre Temperature et Humidity", y=1.02, fontsize=14)

plt.show()


# JointPlot humidity vs K (climat + nutrition)

In [15]:
# Joint plot of potassium (K) against humidity, coloured by crop label.
# seaborn (sns) and matplotlib.pyplot (plt) are already imported in the first
# cell of the notebook, so they are not re-imported here.
x_var = "K"              # potassium
y_var = "humidity"       # humidity

sns.set(style="white", palette="muted")   # plain background, muted colour cycle

# Main joint plot: scatter in the centre, marginal distributions on the sides.
g = sns.jointplot(
    data=crop,
    x=x_var,
    y=y_var,
    hue="label",        # crop classes
    kind="scatter",
    height=8,
    alpha=0.7,
    s=30
)

# ------------------------------
# Legend handling
# ------------------------------
# Swap the in-axes legend for a figure-level legend so it does not hide points.
leg = g.ax_joint.get_legend()
if leg is not None:
    handles, labels = g.ax_joint.get_legend_handles_labels()
    leg.remove()

    # loc="center left" anchors the legend's left edge at x=1.02 (figure
    # coordinates), i.e. just outside the right border of the figure.
    g.figure.legend(
        handles, labels, title="Culture",
        loc="center left",
        bbox_to_anchor=(1.02, 0.5),
        borderaxespad=0,
        frameon=True
    )

# ------------------------------
# Title
# ------------------------------
# The en dash in the title was mis-encoded; restored so the title renders correctly.
# `g.figure` is the preferred accessor (`g.fig` is its deprecated alias).
g.figure.suptitle("Joint Plot – Relation entre Humidity et K (potassium)",
                  y=1.03, fontsize=14)

plt.show()


# Pair Plot Analysis (relations 2 à 2)

In [16]:
# ---- 1) Parameters ----
VARS_CONT = ["N", "P", "K", "temperature", "humidity", "ph", "rainfall"]  # continuous variables, in plotting order
LABEL_COL = "label"                 # name of the target (crop class) column
MAX_SAMPLES_PER_CLASS = 120         # per-class cap on plotted rows, for readability (None disables it)
HEIGHT = 1.4                        # size of one panel of the pair plot
POINT_SIZE = 12                     # marker size
ALPHA = 0.7                         # marker transparency

# ---- 2) Minimal checks ----
missing = [c for c in VARS_CONT if c not in crop.columns]
if missing:
    raise KeyError(f"Colonnes manquantes dans le DataFrame: {missing}\nColonnes disponibles: {list(crop.columns)}")
if LABEL_COL not in crop.columns:
    raise KeyError(f"La colonne de classe '{LABEL_COL}' est introuvable. Colonnes: {list(crop.columns)}")

# ---- 3) Optional subsampling (stratified by class) ----
if MAX_SAMPLES_PER_CLASS:
    # Keep at most MAX_SAMPLES_PER_CLASS rows per class.
    # A seeded shuffle followed by groupby().head() is used instead of
    # groupby().apply(sample): apply() operating on the grouping column is
    # deprecated in recent pandas and would eventually drop LABEL_COL from the
    # result, breaking the column selection below.
    crop_pp = (
        crop.sample(frac=1, random_state=42)        # reproducible shuffle
            .groupby(LABEL_COL)
            .head(MAX_SAMPLES_PER_CLASS)            # first N shuffled rows of each class
            .sort_values(LABEL_COL, kind="stable")  # classes grouped alphabetically, as before
            .reset_index(drop=True)
    )
else:
    crop_pp = crop.copy()

# ---- 4) Build the pair plot ----
g = sns.pairplot(
    data=crop_pp[VARS_CONT + [LABEL_COL]],
    vars=VARS_CONT,
    hue=LABEL_COL,                   # one colour per crop
    diag_kind="kde",                 # density estimate on the diagonal
    kind="scatter",                  # scatter plots off the diagonal
    plot_kws=dict(s=POINT_SIZE, alpha=ALPHA, edgecolor="none"),
    diag_kws=dict(common_norm=False),
    corner=False,                    # full grid (both triangles)
    height=HEIGHT
)

# ---- 5) Readability: title and legend ----
# The em dash in the title was mis-encoded; restored so the title renders correctly.
g.figure.suptitle(
    "Pair Plot — N, P, K, temperature, humidity, ph, rainfall",
    y=1.02, fontsize=14
)

# Place the legend to the right of the figure, aligned with its top.
# Changing only bbox_to_anchor on the existing legend keeps seaborn's
# "center right" anchor, which centres the legend on the top edge of the
# figure; move_legend sets the anchor corner and the position together.
if g.legend is not None:
    sns.move_legend(g, "upper left", bbox_to_anchor=(1.02, 1), title=LABEL_COL)

plt.show()

# Joint Plot Analysis (relation entre 2 variables)

In [17]:
# Joint plot of rainfall against humidity, coloured by crop label.
x_var = "rainfall"
y_var = "humidity"
sns.set(style="white", palette="muted")

# Main joint plot: scatter in the centre, marginal distributions on the sides.
g = sns.jointplot(
    data=crop,
    x=x_var,
    y=y_var,
    hue="label",          # crop classes
    kind="scatter",       # draw individual points
    height=8,             # figure size
    alpha=0.7,            # transparency to soften overlapping points
    s=30                  # marker size
)

# Swap the in-axes legend for a figure-level legend.
leg = g.ax_joint.get_legend()
if leg is not None:
    handles, labels = g.ax_joint.get_legend_handles_labels()
    leg.remove()
    # loc="center left" anchors the legend's LEFT edge at x=0.99 (figure
    # coordinates), so the box extends to the right of the figure border.
    g.figure.legend(
        handles, labels, title="label",
        loc="center left", bbox_to_anchor=(0.99, 0.5),
        borderaxespad=0, frameon=True
    )

# The en dash in the title was mis-encoded; restored so the title renders correctly.
# `g.figure` is the preferred accessor (`g.fig` is its deprecated alias).
g.figure.suptitle("Joint Plot – Relation entre Rainfall et Humidity ", y=1.02, fontsize=14)

plt.show()

# Analyse de corrélation

In [18]:
# Pairwise correlation between the numeric columns only (non-numeric columns are skipped).
corr = crop.corr(numeric_only=True)
corr  # Pearson correlation coefficients (0 = no linear correlation)

# Encoding

In [19]:
# Crop names; the position of a name in this list fixes its 1-based integer code.
labels = [
    'rice',
    'maize',
    'chickpea',
    'kidneybeans',
    'pigeonpeas',
    'mothbeans',
    'mungbean',
    'blackgram',
    'lentil',
    'pomegranate',
    'banana',
    'mango',
    'grapes',
    'watermelon',
    'muskmelon',
    'apple',
    'orange',
    'papaya',
    'coconut',
    'cotton',
    'jute',
    'coffee',
]

# Pair every name with its rank: the first name gets 1, the last gets len(labels).
label_dict = dict(zip(labels, range(1, len(labels) + 1)))
print(label_dict)


In [20]:
# Integer code assigned to each crop label (1-based).
crop_dict = {
    'rice': 1, 'maize': 2, 'chickpea': 3, 'kidneybeans': 4, 'pigeonpeas': 5,
    'mothbeans': 6, 'mungbean': 7, 'blackgram': 8, 'lentil': 9,
    'pomegranate': 10, 'banana': 11, 'mango': 12, 'grapes': 13,
    'watermelon': 14, 'muskmelon': 15, 'apple': 16, 'orange': 17,
    'papaya': 18, 'coconut': 19, 'cotton': 20, 'jute': 21, 'coffee': 22,
}

# Series.map yields NaN for any label that is absent from crop_dict, which
# would silently leave holes in the encoded target. Check before storing it.
crop_num = crop['label'].map(crop_dict)
unmapped = crop.loc[crop_num.isna(), 'label'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Labels without an integer code in crop_dict: {list(unmapped)}")

crop['crop_num'] = crop_num

In [21]:
# Number of rows per encoded crop class.
crop['crop_num'].value_counts()

In [22]:
# The text label is now redundant with 'crop_num'; keep only the encoded target.
crop = crop.drop(columns=['label'])
crop.head()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=37cd5642-eb00-4cb1-969e-a9bc85cf5e83' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>