In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [2]:
from google.colab import files
uploaded = files.upload()  # Upload CSV file manually
file_name = list(uploaded.keys())[0]  # Get the uploaded filename

df = pd.read_csv(file_name)  # Load the dataset using the file_name variable
print("Original Dataset:\n", df.head())

Saving exp3.csv to exp3.csv
Original Dataset:
    ID     Name   Age        Department  Marks Graduated
0   1    Alice  21.0  Computer Science     85       Yes
1   2      Bob  22.0        Mechanical     78        No
2   3  Charlie  20.0        Electrical     92       Yes
3   4    David   NaN             Civil     74        No
4   5      Eva  21.0                IT     88       Yes


In [3]:
df.head()

Unnamed: 0,ID,Name,Age,Department,Marks,Graduated
0,1,Alice,21.0,Computer Science,85,Yes
1,2,Bob,22.0,Mechanical,78,No
2,3,Charlie,20.0,Electrical,92,Yes
3,4,David,,Civil,74,No
4,5,Eva,21.0,IT,88,Yes


In [4]:
df = df.drop_duplicates()  # Remove duplicates
df["Marks"] = df["Marks"].apply(lambda x: x if x >= 0 else np.nan)  # Replace incorrect marks with NaN
print("\nAfter Removing Duplicates and Handling Incorrect Data:\n", df)


After Removing Duplicates and Handling Incorrect Data:
    ID     Name   Age        Department  Marks Graduated
0   1    Alice  21.0  Computer Science     85       Yes
1   2      Bob  22.0        Mechanical     78        No
2   3  Charlie  20.0        Electrical     92       Yes
3   4    David   NaN             Civil     74        No
4   5      Eva  21.0                IT     88       Yes
5   6    Frank  22.0       Electronics     81       Yes
6   7    Grace  20.0     Biotechnology     90       Yes
7   8     Hank  -5.0         Aerospace     76        No
8   9      Ivy  21.0           Physics     87       Yes
9  10     Jack  22.0         Chemistry     79        No


In [5]:
df["Age"].fillna(df["Age"].mean(), inplace=True)  # Fill missing numeric value with mean
df["Department"].fillna(df["Department"].mode()[0], inplace=True)  # Fill missing categorical value with mode
df["Marks"].fillna(df["Marks"].median(), inplace=True)  # Fill missing marks with median
print("\nAfter Handling Missing Data:\n", df)


After Handling Missing Data:
    ID     Name        Age        Department  Marks Graduated
0   1    Alice  21.000000  Computer Science     85       Yes
1   2      Bob  22.000000        Mechanical     78        No
2   3  Charlie  20.000000        Electrical     92       Yes
3   4    David  18.222222             Civil     74        No
4   5      Eva  21.000000                IT     88       Yes
5   6    Frank  22.000000       Electronics     81       Yes
6   7    Grace  20.000000     Biotechnology     90       Yes
7   8     Hank  -5.000000         Aerospace     76        No
8   9      Ivy  21.000000           Physics     87       Yes
9  10     Jack  22.000000         Chemistry     79        No


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True)  # Fill missing numeric value with mean
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Department"].fillna(df["Department"].mode()[0], inplace=True)  # Fill missing categorical value with mode
The behavior will change in panda

In [6]:
scaler = MinMaxScaler()
df["Age_Scaled"] = scaler.fit_transform(df[["Age"]])
df["Marks_Scaled"] = scaler.fit_transform(df[["Marks"]])

In [7]:
encoder = LabelEncoder()
df["Department_Encoded"] = encoder.fit_transform(df["Department"])
df["Graduated_Encoded"] = encoder.fit_transform(df["Graduated"])

print("\nFinal Processed Data:\n", df)


Final Processed Data:
    ID     Name        Age        Department  Marks Graduated  Age_Scaled  \
0   1    Alice  21.000000  Computer Science     85       Yes    0.962963   
1   2      Bob  22.000000        Mechanical     78        No    1.000000   
2   3  Charlie  20.000000        Electrical     92       Yes    0.925926   
3   4    David  18.222222             Civil     74        No    0.860082   
4   5      Eva  21.000000                IT     88       Yes    0.962963   
5   6    Frank  22.000000       Electronics     81       Yes    1.000000   
6   7    Grace  20.000000     Biotechnology     90       Yes    0.925926   
7   8     Hank  -5.000000         Aerospace     76        No    0.000000   
8   9      Ivy  21.000000           Physics     87       Yes    0.962963   
9  10     Jack  22.000000         Chemistry     79        No    1.000000   

   Marks_Scaled  Department_Encoded  Graduated_Encoded  
0      0.611111                   4                  1  
1      0.222222          

In [8]:
Q1 = df["Marks"].quantile(0.25)
Q3 = df["Marks"].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df["Marks"] = df["Marks"].apply(lambda x: upper_bound if x > upper_bound else (lower_bound if x < lower_bound else x))

In [9]:
df["Performance"] = df["Marks"].apply(lambda x: "Excellent" if x >= 85 else ("Good" if x >= 75 else "Average"))

In [10]:
scaler = MinMaxScaler()
df["Age_Scaled"] = scaler.fit_transform(df[["Age"]])
df["Marks_Scaled"] = scaler.fit_transform(df[["Marks"]])

encoder = LabelEncoder()
df["Department_Encoded"] = encoder.fit_transform(df["Department"])
df["Graduated_Encoded"] = encoder.fit_transform(df["Graduated"])

In [11]:
!pip install scikit-learn --upgrade
import pandas as pd
from sklearn.preprocessing import OneHotEncoder # Import OneHotEncoder

# ... (rest of your code) ...

# Use handle_unknown='ignore' instead of sparse=False for older versions
onehot = OneHotEncoder(handle_unknown='ignore')
encoded_cols = onehot.fit_transform(df[["Department"]])
encoded_df = pd.DataFrame(encoded_cols.toarray(), columns=onehot.get_feature_names_out(["Department"])) # Convert to dense array
df = df.join(encoded_df)



In [12]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split # Import train_test_split

# ... (rest of your code) ...

# Split data into training and testing sets
X = df.drop(columns=["ID", "Name", "Graduated", "Performance"])  # Features
y = df["Graduated_Encoded"]  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nTraining Set:\n", X_train.head())
print("\nTest Set:\n", X_test.head())


Training Set:
     Age        Department  Marks  Age_Scaled  Marks_Scaled  \
5  22.0       Electronics     81    1.000000      0.388889   
0  21.0  Computer Science     85    0.962963      0.611111   
7  -5.0         Aerospace     76    0.000000      0.111111   
2  20.0        Electrical     92    0.925926      1.000000   
9  22.0         Chemistry     79    1.000000      0.277778   

   Department_Encoded  Graduated_Encoded  Department_Aerospace  \
5                   6                  1                   0.0   
0                   4                  1                   0.0   
7                   0                  0                   1.0   
2                   5                  1                   0.0   
9                   2                  0                   0.0   

   Department_Biotechnology  Department_Chemistry  Department_Civil  \
5                       0.0                   0.0               0.0   
0                       0.0                   0.0               0.0   
7

In [14]:
# Print final cleaned and trasformed data
print("\nFinal cleaned and transformed data:")
print(df)


Final cleaned and transformed data:
   ID     Name        Age        Department  Marks Graduated  Age_Scaled  \
0   1    Alice  21.000000  Computer Science     85       Yes    0.962963   
1   2      Bob  22.000000        Mechanical     78        No    1.000000   
2   3  Charlie  20.000000        Electrical     92       Yes    0.925926   
3   4    David  18.222222             Civil     74        No    0.860082   
4   5      Eva  21.000000                IT     88       Yes    0.962963   
5   6    Frank  22.000000       Electronics     81       Yes    1.000000   
6   7    Grace  20.000000     Biotechnology     90       Yes    0.925926   
7   8     Hank  -5.000000         Aerospace     76        No    0.000000   
8   9      Ivy  21.000000           Physics     87       Yes    0.962963   
9  10     Jack  22.000000         Chemistry     79        No    1.000000   

   Marks_Scaled  Department_Encoded  Graduated_Encoded  ...  \
0      0.611111                   4                  1  ...   
