# DSA2040 END SEMESTER EXAM

## SECTION 2: DATA MINING

### Task 1: Data Preprocessing and Exploration

In [2]:
# Imports necessary

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split



In [3]:
# 1. Loading the dataset

# With as_frame=True the loader returns a Bunch whose `.frame` is a DataFrame
# holding the feature columns together with the integer `target` column.
iris = load_iris(as_frame=True)

# Work on a copy so that later cells which modify `df` can never alter the raw
# measurements held in `iris.frame` (plain assignment would alias the same object).
df = iris.frame.copy()

print("First 5 rows of dataset:")
print(df.head())



First 5 rows of dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


In [4]:
# 2. Preprocessing
#
# Produces:
#   scaler             - MinMaxScaler fitted on the raw feature columns
#   feature_cols       - names of the feature columns
#   encoder            - OneHotEncoder fitted on the `target` column
#   encoded_labels     - one-hot label array
#   encoded_labels_df  - the same labels as a DataFrame (target_0, target_1, ...)
#   df                 - scaled features + one-hot labels, side by side

# Always read from the frame held by the loader, never from `df`: this cell
# rebinds `df` to a table that no longer has a 'target' column, so reading from
# `df` would make a second run of the cell fail with KeyError. `raw_df` is only
# read below, never written, which keeps the cell safe to re-run.
raw_df = iris.frame
feature_cols = iris.feature_names

# Handling missing values
print("\nMissing values check:")
print(raw_df.isnull().sum())

# Normalizing features using Min-Max scaling
scaler = MinMaxScaler()
scaled_features_df = pd.DataFrame(
    scaler.fit_transform(raw_df[feature_cols]),
    columns=feature_cols,
    index=raw_df.index,  # keep row labels aligned with the raw frame
)

# One-Hot Encoding
encoder = OneHotEncoder(sparse_output=False)
encoded_labels = encoder.fit_transform(raw_df[['target']])
encoded_labels_df = pd.DataFrame(
    encoded_labels,
    columns=encoder.get_feature_names_out(['target']),
    index=raw_df.index,  # same index as the features so concat pairs rows 1:1
)

# Features first, then the one-hot label columns (the raw 'target' is left out).
df = pd.concat([scaled_features_df, encoded_labels_df], axis=1)

# Guard against a silent misalignment in the concat above.
assert len(df) == len(raw_df), "preprocessing changed the number of rows"

print("\nAfter preprocessing:")
print(df.head())




Missing values check:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64

After preprocessing:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0           0.222222          0.625000           0.067797          0.041667   
1           0.166667          0.416667           0.067797          0.041667   
2           0.111111          0.500000           0.050847          0.041667   
3           0.083333          0.458333           0.084746          0.041667   
4           0.194444          0.666667           0.067797          0.041667   

   target_0  target_1  target_2  
0       1.0       0.0       0.0  
1       1.0       0.0       0.0  
2       1.0       0.0       0.0  
3       1.0       0.0       0.0  
4       1.0       0.0       0.0  


In [5]:
# 3. Exploration

# Summary statistics
print("\nSummary statistics:")
print(df.describe())

# The pairplot and boxplot are drawn in the original measurement units, so undo
# the Min-Max scaling once here and reuse the result for both figures.
features_unscaled = pd.DataFrame(
    scaler.inverse_transform(df[feature_cols]),
    columns=feature_cols,
)

# Pairplot
# sns.pairplot creates its own figure; keep the returned grid so the save and
# close calls act on that exact figure rather than on pyplot's "current" one.
pair_grid = sns.pairplot(features_unscaled)
pair_grid.figure.savefig("iris_pairplot.png")
plt.close(pair_grid.figure)

# Correlation heatmap (scaled features together with the one-hot label columns)
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", ax=heatmap_ax)
heatmap_ax.set_title("Correlation Heatmap")
heatmap_fig.savefig("iris_heatmap.png")
plt.close(heatmap_fig)

# Outlier detection using boxplots
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=features_unscaled, ax=box_ax)
box_ax.set_title("Boxplot of Features")
box_fig.savefig("iris_boxplots.png")
plt.close(box_fig)




Summary statistics:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            0.428704          0.440556           0.467458   
std             0.230018          0.181611           0.299203   
min             0.000000          0.000000           0.000000   
25%             0.222222          0.333333           0.101695   
50%             0.416667          0.416667           0.567797   
75%             0.583333          0.541667           0.694915   
max             1.000000          1.000000           1.000000   

       petal width (cm)    target_0    target_1    target_2  
count        150.000000  150.000000  150.000000  150.000000  
mean           0.458056    0.333333    0.333333    0.333333  
std            0.317599    0.472984    0.472984    0.472984  
min            0.000000    0.000000    0.000000    0.000000  
25%            0.083333    0.000000    0.000000    0.000000  
50%            0.5000

In [6]:
# 4. Split Function


def split_train_test(dataframe, test_size=0.2, random_state=42,
                     feature_columns=None, label_columns=None, stratify=False):
    """
    Split a preprocessed frame into train and test sets.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table containing both the feature columns and the label columns.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so the split is reproducible.
    feature_columns : sequence of str, optional
        Columns used as X. Defaults to the notebook-level ``feature_cols``.
    label_columns : sequence of str, optional
        Columns used as y. Defaults to the columns of the notebook-level
        ``encoded_labels_df`` (the one-hot label columns).
    stratify : bool, default False
        When True, the label columns are passed as ``stratify`` so class
        proportions are preserved in both subsets. The default keeps the
        plain shuffled split.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.

    Raises
    ------
    KeyError
        If any requested feature or label column is absent from ``dataframe``.
    """
    # Fall back to the notebook globals only when the caller gave nothing, so
    # existing calls such as split_train_test(df) behave exactly as before.
    if feature_columns is None:
        feature_columns = feature_cols
    if label_columns is None:
        label_columns = encoded_labels_df.columns
    feature_columns = list(feature_columns)
    label_columns = list(label_columns)

    # Fail early with a message that names every missing column at once.
    missing = [col for col in feature_columns + label_columns
               if col not in dataframe.columns]
    if missing:
        raise KeyError(f"dataframe is missing expected columns: {missing}")

    X = dataframe[feature_columns]
    y = dataframe[label_columns]
    strata = y if stratify else None
    return train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=strata
    )

# Example
X_train, X_test, y_train, y_test = split_train_test(df)

# Every row must land in exactly one of the two subsets.
assert len(X_train) + len(X_test) == len(df)

print(f"\nTrain set size: {X_train.shape[0]} rows")
print(f"Test set size: {X_test.shape[0]} rows")



Train set size: 120 rows
Test set size: 30 rows
