<a href="https://colab.research.google.com/github/vedrajiit/GoogleCollab/blob/main/DataPreprocessingAndNormalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

#Load data
data = pd.DataFrame({
    'ID': [1, 2, 3, 4, 5, 6, 7],
    'Name': ['John Doe', 'Bob Jonhson', 'Alice Brown', 'Charlie Black', 'David White', 'Emma Green', 'Jane Smith'],
    'Age': [28, 35, 45, None, 32, 29, 26],
    'Department': ['HR', 'IT', 'HR', 'IT', 'Finance', 'Markiting', 'HR'],
    'Salary': [50000, 70000, 60000, None, 65000, 75000, 60000],
    'Date of Joining': ['2022-01-25', '2021-06-20', '2020-12-10', '2023-03-01', None, '2022-09-15', '2024-01-10']
})

In [3]:
data

Unnamed: 0,ID,Name,Age,Department,Salary,Date of Joining
0,1,John Doe,28.0,HR,50000.0,2022-01-25
1,2,Bob Jonhson,35.0,IT,70000.0,2021-06-20
2,3,Alice Brown,45.0,HR,60000.0,2020-12-10
3,4,Charlie Black,,IT,,2023-03-01
4,5,David White,32.0,Finance,65000.0,
5,6,Emma Green,29.0,Markiting,75000.0,2022-09-15
6,7,Jane Smith,26.0,HR,60000.0,2024-01-10


In [4]:
#Fill the missing value
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Salary'] = data['Salary'].fillna(data['Salary'].mean()).round(2) #Round to 2 decimal point
data['Date of Joining'] = data['Date of Joining'].fillna('Unknown')

#Drop duplicate rows
data = data.drop_duplicates()

In [5]:
data

Unnamed: 0,ID,Name,Age,Department,Salary,Date of Joining
0,1,John Doe,28.0,HR,50000.0,2022-01-25
1,2,Bob Jonhson,35.0,IT,70000.0,2021-06-20
2,3,Alice Brown,45.0,HR,60000.0,2020-12-10
3,4,Charlie Black,30.5,IT,63333.33,2023-03-01
4,5,David White,32.0,Finance,65000.0,Unknown
5,6,Emma Green,29.0,Markiting,75000.0,2022-09-15
6,7,Jane Smith,26.0,HR,60000.0,2024-01-10


In [6]:
#Convert 'Department' to numerical value
data['Department'] = data['Department'].astype('category').cat.codes

#Convert 'Date of Joining' to datetime
data['Date of Joining'] = pd.to_datetime(data['Date of Joining'], errors='coerce')

In [7]:
data

Unnamed: 0,ID,Name,Age,Department,Salary,Date of Joining
0,1,John Doe,28.0,1,50000.0,2022-01-25
1,2,Bob Jonhson,35.0,2,70000.0,2021-06-20
2,3,Alice Brown,45.0,1,60000.0,2020-12-10
3,4,Charlie Black,30.5,2,63333.33,2023-03-01
4,5,David White,32.0,0,65000.0,NaT
5,6,Emma Green,29.0,3,75000.0,2022-09-15
6,7,Jane Smith,26.0,1,60000.0,2024-01-10


In [8]:
#Drop the 'Name' column as it's not useful for model training
data_reduce = data.drop(columns=['Name'])

In [9]:
data_reduce

Unnamed: 0,ID,Age,Department,Salary,Date of Joining
0,1,28.0,1,50000.0,2022-01-25
1,2,35.0,2,70000.0,2021-06-20
2,3,45.0,1,60000.0,2020-12-10
3,4,30.5,2,63333.33,2023-03-01
4,5,32.0,0,65000.0,NaT
5,6,29.0,3,75000.0,2022-09-15
6,7,26.0,1,60000.0,2024-01-10


Data Normalization

In [11]:
from sklearn.preprocessing import MinMaxScaler

#initialize the scaler
scaler = MinMaxScaler()

#Apply normalization
data_normalized = data_reduce.copy()
data_normalized[['Age', 'Salary']] = scaler.fit_transform(data_reduce[['Age', 'Salary']])

In [12]:
data_normalized

Unnamed: 0,ID,Age,Department,Salary,Date of Joining
0,1,0.105263,1,0.0,2022-01-25
1,2,0.473684,2,0.8,2021-06-20
2,3,1.0,1,0.4,2020-12-10
3,4,0.236842,2,0.533333,2023-03-01
4,5,0.315789,0,0.6,NaT
5,6,0.157895,3,1.0,2022-09-15
6,7,0.0,1,0.4,2024-01-10


In [13]:
import zlib

#Convert Dataframe to CSV string
csv_data = data_reduce.to_csv(index=False)

#Compress the CSV data
compressed_data = zlib.compress(csv_data.encode())

In [19]:
print("csv_data:\n", csv_data)
print("\n\ncompressed_data:\n", compressed_data)
print("\nBefore compressed data size in bytes: ", len(csv_data))
print("After compressed data size in bytes: ", len(compressed_data))

csv_data:
 ID,Age,Department,Salary,Date of Joining
1,28.0,1,50000.0,2022-01-25
2,35.0,2,70000.0,2021-06-20
3,45.0,1,60000.0,2020-12-10
4,30.5,2,63333.33,2023-03-01
5,32.0,0,65000.0,
6,29.0,3,75000.0,2022-09-15
7,26.0,1,60000.0,2024-01-10



compressed_data:
 b'x\x9c]N;\x0e\xc2P\x0c\xdb\xdf)8\x80S\xe5\xd3\xbc\xd2\x11\xa9\x0b\xac\x9c\xe0\r\xa5\xaa\x04-\xaa\xbap{R\x18\x8a\xb0"\xc5\x8ac\xd9\xe7\x0e\xa7\xa1G\xd7?\xcb\xb2>\xfai\xc5\xb5\xdc\xcb\xf2BW\xd6\xfe0\xdf\x0e\x97y\x9c\xc6iH\x02=V\x0c\x81s \x98\xb2*\xb1\x90zR\x98o\x174\xbb&\xc4\x99\x94\x93\xa1\xf6\x8f/\xef\x1a\x93(\t\xa7\x1a\xc6\x95\x87/[\xa02\xdbD#\x8e\x91\xe40\x8dwF\xf6\xaf1eh\x1b\xdb\xd0\xf8o\x85\x96\xc4S\x03\xcd\xff1\xf5V/b\xde-F0\x81'

Before compressed data size in bytes:  228
After compressed data size in bytes:  152


On Iris dataset

In [22]:
import pandas as pd

#Load iris dataset from URL or JSON file
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
df = pd.read_csv(url, header=None, names=columns)

#initialize data overview
print(df.head())
print("\n")
print(df.info())

   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None


Data cleaning

In [23]:
#Check for missing values and duplicates
print("Missing values:\n", df.isnull().sum())
df.drop_duplicates(inplace=True)


Missing values:
 sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64


Data Traansformation

In [24]:
#Encode species as numeric
df['species'] = pd.Categorical(df['species']).codes

Dimensionality Reduction (PCA)

In [25]:
from sklearn.decomposition import PCA

#Apply PCA (2 components)
pca = PCA(n_components=2)
reduced_df = pca.fit_transform(df.iloc[:, :-1])

#Create Dataframe with reduced features
reduced_df = pd.DataFrame(reduced_df, columns=['PC1', 'PC2'])
reduced_df['species'] = df['species']

print("Reduced data:\n", reduced_df.head())

Reduced data:
         PC1       PC2  species
0 -2.710782  0.322125      0.0
1 -2.741763 -0.175061      0.0
2 -2.916691 -0.141509      0.0
3 -2.773363 -0.315205      0.0
4 -2.755418  0.330133      0.0


Data Compression

In [27]:
import zlib
import pickle

#Compress and Decompress data
compressed_data = zlib.compress(pickle.dumps(df))
decompressed_data = pickle.loads(zlib.decompress(compressed_data))

#Verify compression
print("Compressed data size in Bytes: ", len(compressed_data))
print("Data Integirity check: ", df.equals(decompressed_data))

Compressed data size in Bytes:  1710
Data Integirity check:  True


Data Normalization

In [28]:
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(df.iloc[:, :-1])

In [29]:
scaled_df

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667],
       [0.30555556, 0.79166667, 0.11864407, 0.125     ],
       [0.08333333, 0.58333333, 0.06779661, 0.08333333],
       [0.19444444, 0.58333333, 0.08474576, 0.04166667],
       [0.02777778, 0.375     , 0.06779661, 0.04166667],
       [0.16666667, 0.45833333, 0.08474576, 0.        ],
       [0.30555556, 0.70833333, 0.08474576, 0.04166667],
       [0.13888889, 0.58333333, 0.10169492, 0.04166667],
       [0.13888889, 0.41666667, 0.06779661, 0.        ],
       [0.        , 0.41666667, 0.01694915, 0.        ],
       [0.41666667, 0.83333333, 0.03389831, 0.04166667],
       [0.38888889, 1.        , 0.08474576, 0.125     ],
       [0.30555556, 0.79166667, 0.05084746, 0.125     ],
       [0.22222222, 0.625     ,

 Model Training

In [65]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#Split data into train/test set
X_train, X_test, Y_train, Y_test = train_test_split(scaled_df, df['species'], test_size=0.3, random_state=2025) #stratify=y

#Train and evaluate a logistic resgression model
model = LogisticRegression()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)

print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.89
