In [1]:
import pandas as pd
import numpy as np

In [2]:
data = {
    'Name': ['Pavan', 'Kapil', 'Lalit', 'Ishan', 'Om'],
    'Age': [25, None, 44, 23, None],
    'Salary': [50000, 60000, 70000, None, None]
}

df=pd.DataFrame(data)

In [3]:
df

Unnamed: 0,Name,Age,Salary
0,Pavan,25.0,50000.0
1,Kapil,,60000.0
2,Lalit,44.0,70000.0
3,Ishan,23.0,
4,Om,,


# DATA PREPROCESSING

In [4]:
df.isnull().sum()

Name      0
Age       2
Salary    2
dtype: int64

In [5]:
df.dropna()

Unnamed: 0,Name,Age,Salary
0,Pavan,25.0,50000.0
2,Lalit,44.0,70000.0


In [6]:
df.fillna(0)

Unnamed: 0,Name,Age,Salary
0,Pavan,25.0,50000.0
1,Kapil,0.0,60000.0
2,Lalit,44.0,70000.0
3,Ishan,23.0,0.0
4,Om,0.0,0.0


In [7]:
df["Age"]=pd.to_numeric(df["Age"], errors="coerce")
df["Salary"]=pd.to_numeric(df["Salary"], errors="coerce")

In [8]:
df["Age"]=df["Age"].fillna(df["Age"].mean())
df["Salary"]=df["Salary"].fillna(df["Salary"].mean())

In [9]:
df.head()

Unnamed: 0,Name,Age,Salary
0,Pavan,25.0,50000.0
1,Kapil,30.666667,60000.0
2,Lalit,44.0,70000.0
3,Ishan,23.0,60000.0
4,Om,30.666667,60000.0


In [10]:
print(df.isnull().mean()*100)    ## This will tell how many missing values are there in percentage

Name      0.0
Age       0.0
Salary    0.0
dtype: float64


# ENCODING CATEGORICAL VALUES

1. Label Encoding
2. One Hot Encoding

In [11]:
from sklearn.preprocessing import LabelEncoder

data=pd.read_csv("sample_data.csv")

In [12]:
df_label= data.copy()

In [13]:
df_label.head() 

Unnamed: 0,Name,Gender,City,Passed
0,Aman,Male,Delhi,Yes
1,Priya,Female,Mumbai,Yes
2,Rahul,Male,Bangalore,No
3,Anjali,Female,Mumbai,Yes
4,Ravi,Male,Delhi,Yes


In [14]:
le=LabelEncoder()

In [15]:
df_label["Gender_Encoded"]=le.fit_transform(df_label["Gender"])
df_label["Passed_Encoded"]=le.fit_transform(df_label["Passed"]) 

In [16]:
print("\nLABEL ENCODED DATA")
print(df_label[["Name", "Gender","Gender_Encoded","Passed", "Passed_Encoded"]])


LABEL ENCODED DATA
      Name  Gender  Gender_Encoded Passed  Passed_Encoded
0     Aman    Male               1    Yes               1
1    Priya  Female               0    Yes               1
2    Rahul    Male               1     No               0
3   Anjali  Female               0    Yes               1
4     Ravi    Male               1    Yes               1
5    Meera  Female               0     No               0
6    Arjun    Male               1    Yes               1
7     Neha  Female               0    Yes               1
8    Imran    Male               1     No               0
9    Sneha  Female               0    Yes               1
10     Raj    Male               1    Yes               1
11   Divya  Female               0     No               0
12   Kabir    Male               1    Yes               1
13  Simran  Female               0    Yes               1
14   Karan    Male               1     No               0
15   Pooja  Female               0    Yes           

## ONE-HOT ENCODING

In [17]:
df_encoded=pd.get_dummies(df_label, columns=["City"], dtype=int)

In [18]:
df_encoded.head()

Unnamed: 0,Name,Gender,Passed,Gender_Encoded,Passed_Encoded,City_Bangalore,City_Chennai,City_Delhi,City_Mumbai
0,Aman,Male,Yes,1,1,0,0,1,0
1,Priya,Female,Yes,0,1,0,0,0,1
2,Rahul,Male,No,1,0,1,0,0,0
3,Anjali,Female,Yes,0,1,0,0,0,1
4,Ravi,Male,Yes,1,1,0,0,1,0


In [19]:
df_encoded

Unnamed: 0,Name,Gender,Passed,Gender_Encoded,Passed_Encoded,City_Bangalore,City_Chennai,City_Delhi,City_Mumbai
0,Aman,Male,Yes,1,1,0,0,1,0
1,Priya,Female,Yes,0,1,0,0,0,1
2,Rahul,Male,No,1,0,1,0,0,0
3,Anjali,Female,Yes,0,1,0,0,0,1
4,Ravi,Male,Yes,1,1,0,0,1,0
5,Meera,Female,No,0,0,0,1,0,0
6,Arjun,Male,Yes,1,1,1,0,0,0
7,Neha,Female,Yes,0,1,0,0,1,0
8,Imran,Male,No,1,0,0,1,0,0
9,Sneha,Female,Yes,0,1,0,0,0,1


## FEATURE SCALING

In [20]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## Splitting Data

In [21]:
from sklearn.model_selection import train_test_split


data={
    'StudyHours':[1,2,3,4,5],
    'TestScore':[40,50,60,70,80]
}

df=pd.DataFrame(data)

In [22]:
df

Unnamed: 0,StudyHours,TestScore
0,1,40
1,2,50
2,3,60
3,4,70
4,5,80


## STANDARDIZATION (STANDARD SCALER)

In [23]:
Standard_scaler=StandardScaler()
Standard_scaled=Standard_scaler.fit_transform(df)

In [26]:
print("\nSTANDARD SCALED OUTPUT")
pd.DataFrame(Standard_scaled, columns=["StudyHours","TestScore"]) 


STANDARD SCALER OUTPUT


Unnamed: 0,StudyHours,TestScore
0,-1.414214,-1.414214
1,-0.707107,-0.707107
2,0.0,0.0
3,0.707107,0.707107
4,1.414214,1.414214


## Normalization (MinMaxScaler)

In [32]:
Minmax_scaler= MinMaxScaler()
Minmax_scaled= minmax_scaler.fit_transform(df)

In [35]:
print("\nMINMAX SCALED OUTPUT")
pd.DataFrame(Minmax_scaled, columns=["StudyHours","TestScore"])


MINMAX SCALED OUTPUT


Unnamed: 0,StudyHours,TestScore
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,0.75,0.75
4,1.0,1.0


## Train_Test_Split

In [37]:
X=df[["StudyHours"]]    ## we have to pass this as a data frame (so that can not get this as a test series data
y=df[["TestScore"]]

In [38]:
X_train,X_test,y_train, y_test=train_test_split(X,y, test_size=0.2, random_state=42)

In [40]:
("\nTEST DATA")
X_train  ## for the training

Unnamed: 0,StudyHours
4,5
2,3
0,1
3,4


In [44]:
X_test  ## this data is for the test 

y_train
y_test

Unnamed: 0,TestScore
1,50


In [45]:
y_train

Unnamed: 0,TestScore
4,80
2,60
0,40
3,70


In [46]:
y_test

Unnamed: 0,TestScore
1,50
