In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Remote location of the raw Iris data file (hosted under archive.ics.uci.edu).
csv_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
)

In [3]:
# Column labels handed to pd.read_csv via `names=`: four measurements, then the species label.
column_names = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
    "species",
]

In [4]:
# Download and parse the CSV into a DataFrame labelled with `column_names`.
# header=None only spells out what pandas already does when `names` is given.
iris = pd.read_csv(
    csv_url,
    header=None,
    names=column_names,
)

In [5]:
# Bare expression on the last line of the cell: the notebook renders the loaded DataFrame as output.
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [7]:
# Create the encoder that is later fitted on the 'species' column to produce integer codes.
label_encoder = LabelEncoder()

In [8]:
# Fit the encoder on the species names and store the resulting integer codes
# in a new 'species_encoded' column of the same DataFrame.
iris["species_encoded"] = label_encoder.fit_transform(
    iris["species"]
)

'''label_encoder.fit_transform(iris['species']): This part of the code fits the label encoder to the unique values in the 'species' column of the iris DataFrame and then transforms those labels into encoded numerical values.
iris['species_encoded']: This assigns the transformed values to a new column named 'species_encoded' in the iris DataFrame. Each unique species name in the 'species' column is encoded as a corresponding integer and stored in this new column.'''

In [9]:
# Plain-text preview of the first five rows, including the new species_encoded column.
print(iris.head(n=5))

   sepal_length  sepal_width  petal_length  petal_width      species  \
0           5.1          3.5           1.4          0.2  Iris-setosa   
1           4.9          3.0           1.4          0.2  Iris-setosa   
2           4.7          3.2           1.3          0.2  Iris-setosa   
3           4.6          3.1           1.5          0.2  Iris-setosa   
4           5.0          3.6           1.4          0.2  Iris-setosa   

   species_encoded  
0                0  
1                0  
2                0  
3                0  
4                0  


In [10]:
# Display the DataFrame again; it now carries the species_encoded column added above.
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_encoded
0,5.1,3.5,1.4,0.2,Iris-setosa,0
1,4.9,3.0,1.4,0.2,Iris-setosa,0
2,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5.0,3.6,1.4,0.2,Iris-setosa,0
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica,2
146,6.3,2.5,5.0,1.9,Iris-virginica,2
147,6.5,3.0,5.2,2.0,Iris-virginica,2
148,6.2,3.4,5.4,2.3,Iris-virginica,2


Explanation:

The 'species' column in the Iris dataset contains categorical data identifying the species of each iris flower ('Iris-setosa', 'Iris-versicolor', or 'Iris-virginica'). Since most machine learning algorithms require numerical inputs, we need to encode these categorical labels in numerical form.

The LabelEncoder accomplishes this by assigning a unique integer to each unique label in the 'species' column. It sorts the distinct labels and numbers them from 0, so here 'Iris-setosa' is encoded as 0, 'Iris-versicolor' as 1, and 'Iris-virginica' as 2; the fitted mapping can be inspected through label_encoder.classes_.

After label encoding, the 'species_encoded' column of the DataFrame holds the numerical representation of each species label, which can then be used as the target variable for machine learning models. Note that LabelEncoder is intended for target labels: the integer codes carry no meaningful order, so a categorical input feature is usually better served by one-hot encoding.