# 80% of a data scientist's valuable time is spent simply
finding , cleansing and organizing data, leaving only 20% to actually perform analysis

# What are my features?
It’s pretty easy to infer the following features from the column names:

ST_NUM: Street number

ST_NAME: Street name

OWN_OCCUPIED: Is the residence owner occupied

NUM_BEDROOMS: Number of bedrooms

We can also answer, what are the expected types?

ST_NUM: float or int… some sort of numeric type

ST_NAME: string

OWN_OCCUPIED: string… Y (“Yes”) or N (“No”)

NUM_BEDROOMS: float or int, a numeric type

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df=pd.read_csv('raw.githubusercontent.com_fazlyrabbi77_DataProcessing_master_real-estate.csv')

In [3]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


# Rewnaming Columns with inplace

In [4]:
df.rename(columns={"NUM_BEDROOMS":"BEDROOMS","NUM_BATH":"BATH"},inplace=True)

In [5]:
df.head(9)

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


# Checking Null Value

In [6]:
df.isnull()

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,True,False,False,True,False,False
3,False,False,False,False,False,True,False
4,True,False,False,False,False,False,False
5,False,False,False,False,True,False,False
6,False,True,False,True,False,False,False
7,False,False,False,False,False,False,True
8,False,False,False,False,False,False,False


In [7]:
df.isnull().values.any()

True

In [8]:
df.isnull().values.sum()

8

# Showing the null values as per attributes

In [9]:
null_columns=df.columns[df.isnull().any()]


In [10]:
df[null_columns].isnull().sum()

PID             1
ST_NUM          2
OWN_OCCUPIED    1
BEDROOMS        2
BATH            1
SQ_FT           1
dtype: int64

In [11]:
df.isnull().sum()

PID             1
ST_NUM          2
ST_NAME         0
OWN_OCCUPIED    1
BEDROOMS        2
BATH            1
SQ_FT           1
dtype: int64

# Checking all null columns

In [12]:
print(df[df.isnull().any(axis=1)][null_columns].head())

           PID  ST_NUM OWN_OCCUPIED BEDROOMS    BATH SQ_FT
2  100003000.0     NaN            N      NaN       1   850
3  100004000.0   201.0           12        1     NaN   700
4          NaN   203.0            Y        3       2  1600
5  100006000.0   207.0            Y      NaN       1   800
6  100007000.0     NaN          NaN        2  HURLEY   950


# Handling Null Values

# Filling null values with specific value

In [13]:
df['PID'].fillna(100005000,inplace=True)

In [14]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,100005000.0,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [15]:
df1=df.copy()

In [16]:
df1

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,100005000.0,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


# Remove Row or Column

In [17]:
df1.drop(["PID"],axis=1,inplace=True)

In [18]:
df1

Unnamed: 0,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,104.0,PUTNAM,Y,3,1,1000
1,197.0,LEXINGTON,N,3,1.5,--
2,,LEXINGTON,N,,1,850
3,201.0,BERKELEY,12,1,,700
4,203.0,BERKELEY,Y,3,2,1600
5,207.0,BERKELEY,Y,,1,800
6,,WASHINGTON,,2,HURLEY,950
7,213.0,TREMONT,Y,1,1,
8,215.0,TREMONT,Y,na,2,1800


In [19]:
df1.drop([7,8],inplace=True)

In [20]:
df1

Unnamed: 0,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,104.0,PUTNAM,Y,3.0,1,1000
1,197.0,LEXINGTON,N,3.0,1.5,--
2,,LEXINGTON,N,,1,850
3,201.0,BERKELEY,12,1.0,,700
4,203.0,BERKELEY,Y,3.0,2,1600
5,207.0,BERKELEY,Y,,1,800
6,,WASHINGTON,,2.0,HURLEY,950


In [21]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,100005000.0,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


# Data filling Based on Row wise

In [22]:
df.loc[2,'ST_NUM']=197
df.loc[6,'ST_NUM']=208
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,197.0,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,100005000.0,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,208.0,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


# Unwanted Value treatment from own_occupied Column

In [23]:
count=0
for row in df['OWN_OCCUPIED']:
    try:
        int(row)
        df.loc[count,'OWN_OCCUPIED']=np.nan
    except ValueError:
        pass
    count+=1
    
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,197.0,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,,1,,700
4,100005000.0,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,208.0,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


# Unwanted Value treatment from BEDROOMS,BATH, SQFT

In [24]:
df['BEDROOMS']=pd.to_numeric(df['BEDROOMS'],errors='coerce')
df['BATH']=pd.to_numeric(df['BATH'],errors='coerce')
df['SQ_FT']=pd.to_numeric(df['SQ_FT'],errors='coerce')


In [25]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,
2,100003000.0,197.0,LEXINGTON,N,,1.0,850.0
3,100004000.0,201.0,BERKELEY,,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,,1.0,800.0
6,100007000.0,208.0,WASHINGTON,,2.0,,950.0
7,100008000.0,213.0,TREMONT,Y,1.0,1.0,
8,100009000.0,215.0,TREMONT,Y,,2.0,1800.0


# Null Value fill up through Mode Value

In [26]:
df['OWN_OCCUPIED'].fillna(df['OWN_OCCUPIED'].mode()[0],inplace=True)

In [27]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,
2,100003000.0,197.0,LEXINGTON,N,,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,,950.0
7,100008000.0,213.0,TREMONT,Y,1.0,1.0,
8,100009000.0,215.0,TREMONT,Y,,2.0,1800.0


# Alternative way

In [28]:
own_occupied_mode=df.OWN_OCCUPIED.mode()

In [29]:
own_occupied_mode

0    Y
dtype: object

In [30]:
df.OWN_OCCUPIED=df.OWN_OCCUPIED.fillna(own_occupied_mode)

In [31]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,
2,100003000.0,197.0,LEXINGTON,N,,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,,950.0
7,100008000.0,213.0,TREMONT,Y,1.0,1.0,
8,100009000.0,215.0,TREMONT,Y,,2.0,1800.0


# Use of Median:We want to see that how is the median value of the bedrooms apartment 1,2,3

In [32]:
#Group By parameter Check

In [33]:
df.groupby('BEDROOMS')['SQ_FT'].median()

BEDROOMS
1.0     700.0
2.0     950.0
3.0    1300.0
Name: SQ_FT, dtype: float64

# Filling Null value with group by vparameter

In [34]:
df['SQ_FT']
df['SQ_FT'].fillna(df.groupby('BEDROOMS')['SQ_FT'].transform('median'))
df['SQ_FT']=df['SQ_FT'].fillna(df['SQ_FT'].median())
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,950.0
2,100003000.0,197.0,LEXINGTON,N,,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,,950.0
7,100008000.0,213.0,TREMONT,Y,1.0,1.0,950.0
8,100009000.0,215.0,TREMONT,Y,,2.0,1800.0


# Filling Null Value of bed rooms

In [35]:
df.loc[2,'BEDROOMS']=1
df.loc[5,'BEDROOMS']=1
df.loc[8,'BEDROOMS']=3
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,950.0
2,100003000.0,197.0,LEXINGTON,N,1.0,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,1.0,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,,950.0
7,100008000.0,213.0,TREMONT,Y,1.0,1.0,950.0
8,100009000.0,215.0,TREMONT,Y,3.0,2.0,1800.0


# Filling Null values of BATH with bfill and ffill

In [36]:
df['BATH']=df['BATH'].fillna(method='bfill')
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000.0
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,950.0
2,100003000.0,197.0,LEXINGTON,N,1.0,1.0,850.0
3,100004000.0,201.0,BERKELEY,Y,1.0,2.0,700.0
4,100005000.0,203.0,BERKELEY,Y,3.0,2.0,1600.0
5,100006000.0,207.0,BERKELEY,Y,1.0,1.0,800.0
6,100007000.0,208.0,WASHINGTON,Y,2.0,1.0,950.0
7,100008000.0,213.0,TREMONT,Y,1.0,1.0,950.0
8,100009000.0,215.0,TREMONT,Y,3.0,2.0,1800.0


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PID           9 non-null      float64
 1   ST_NUM        9 non-null      float64
 2   ST_NAME       9 non-null      object 
 3   OWN_OCCUPIED  9 non-null      object 
 4   BEDROOMS      9 non-null      float64
 5   BATH          9 non-null      float64
 6   SQ_FT         9 non-null      float64
dtypes: float64(5), object(2)
memory usage: 632.0+ bytes


# Converting to Int

In [38]:
df.PID=df.PID.astype('int64')
df.ST_NUM=df.ST_NUM.astype('int64')
df.BEDROOMS=df.BEDROOMS.astype('int64')
df.BATH=df.BATH.astype('int64')
df.SQFT=df.SQ_FT.astype('int64')

  df.SQFT=df.SQ_FT.astype('int64')


In [39]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,PUTNAM,Y,3,1,1000.0
1,100002000,197,LEXINGTON,N,3,1,950.0
2,100003000,197,LEXINGTON,N,1,1,850.0
3,100004000,201,BERKELEY,Y,1,2,700.0
4,100005000,203,BERKELEY,Y,3,2,1600.0
5,100006000,207,BERKELEY,Y,1,1,800.0
6,100007000,208,WASHINGTON,Y,2,1,950.0
7,100008000,213,TREMONT,Y,1,1,950.0
8,100009000,215,TREMONT,Y,3,2,1800.0


In [40]:
df2=df
df=df

# Data Transformation or Feature Engineering 

# Label Encoder

In [41]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [42]:
df['ST_NAME']=le.fit_transform(df['ST_NAME'])
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,2,Y,3,1,1000.0
1,100002000,197,1,N,3,1,950.0
2,100003000,197,1,N,1,1,850.0
3,100004000,201,0,Y,1,2,700.0
4,100005000,203,0,Y,3,2,1600.0
5,100006000,207,0,Y,1,1,800.0
6,100007000,208,4,Y,2,1,950.0
7,100008000,213,3,Y,1,1,950.0
8,100009000,215,3,Y,3,2,1800.0


# Mapping Function==>Convertion from numerical to categorical or categorical to numerical

In [43]:
mapping={'Y':1,'N':2}
df['OWN_OCCUPIED']=df['OWN_OCCUPIED'].map(mapping)

In [44]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,2,1,3,1,1000.0
1,100002000,197,1,2,3,1,950.0
2,100003000,197,1,2,1,1,850.0
3,100004000,201,0,1,1,2,700.0
4,100005000,203,0,1,3,2,1600.0
5,100006000,207,0,1,1,1,800.0
6,100007000,208,4,1,2,1,950.0
7,100008000,213,3,1,1,1,950.0
8,100009000,215,3,1,3,2,1800.0


# One Hot Encoding for Nominal Data

In [45]:
df1=pd.get_dummies(df,columns=['OWN_OCCUPIED'])
df1

Unnamed: 0,PID,ST_NUM,ST_NAME,BEDROOMS,BATH,SQ_FT,OWN_OCCUPIED_1,OWN_OCCUPIED_2
0,100001000,104,2,3,1,1000.0,1,0
1,100002000,197,1,3,1,950.0,0,1
2,100003000,197,1,1,1,850.0,0,1
3,100004000,201,0,1,2,700.0,1,0
4,100005000,203,0,3,2,1600.0,1,0
5,100006000,207,0,1,1,800.0,1,0
6,100007000,208,4,2,1,950.0,1,0
7,100008000,213,3,1,1,950.0,1,0
8,100009000,215,3,3,2,1800.0,1,0


# Min Max Normalizer or Normalization

# Min Max Normalizer of Full Data Set

In [46]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

In [47]:
scaler.fit(df)

In [48]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,2,1,3,1,1000.0
1,100002000,197,1,2,3,1,950.0
2,100003000,197,1,2,1,1,850.0
3,100004000,201,0,1,1,2,700.0
4,100005000,203,0,1,3,2,1600.0
5,100006000,207,0,1,1,1,800.0
6,100007000,208,4,1,2,1,950.0
7,100008000,213,3,1,1,1,950.0
8,100009000,215,3,1,3,2,1800.0


In [49]:
scaled=pd.DataFrame(scaler.transform(df),columns=df.columns)

In [50]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,2,1,3,1,1000.0
1,100002000,197,1,2,3,1,950.0
2,100003000,197,1,2,1,1,850.0
3,100004000,201,0,1,1,2,700.0
4,100005000,203,0,1,3,2,1600.0
5,100006000,207,0,1,1,1,800.0
6,100007000,208,4,1,2,1,950.0
7,100008000,213,3,1,1,1,950.0
8,100009000,215,3,1,3,2,1800.0


In [51]:
scaled

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,0.0,0.0,0.5,0.0,1.0,0.0,0.272727
1,0.125,0.837838,0.25,1.0,1.0,0.0,0.227273
2,0.25,0.837838,0.25,1.0,0.0,0.0,0.136364
3,0.375,0.873874,0.0,0.0,0.0,1.0,0.0
4,0.5,0.891892,0.0,0.0,1.0,1.0,0.818182
5,0.625,0.927928,0.0,0.0,0.0,0.0,0.090909
6,0.75,0.936937,1.0,0.0,0.5,0.0,0.227273
7,0.875,0.981982,0.75,0.0,0.0,0.0,0.227273
8,1.0,1.0,0.75,0.0,1.0,1.0,1.0


In [52]:
df5=df.copy()
df5

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,2,1,3,1,1000.0
1,100002000,197,1,2,3,1,950.0
2,100003000,197,1,2,1,1,850.0
3,100004000,201,0,1,1,2,700.0
4,100005000,203,0,1,3,2,1600.0
5,100006000,207,0,1,1,1,800.0
6,100007000,208,4,1,2,1,950.0
7,100008000,213,3,1,1,1,950.0
8,100009000,215,3,1,3,2,1800.0


In [53]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,2,1,3,1,1000.0
1,100002000,197,1,2,3,1,950.0
2,100003000,197,1,2,1,1,850.0
3,100004000,201,0,1,1,2,700.0
4,100005000,203,0,1,3,2,1600.0
5,100006000,207,0,1,1,1,800.0
6,100007000,208,4,1,2,1,950.0
7,100008000,213,3,1,1,1,950.0
8,100009000,215,3,1,3,2,1800.0


In [54]:
df4=df

In [55]:
df4

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,2,1,3,1,1000.0
1,100002000,197,1,2,3,1,950.0
2,100003000,197,1,2,1,1,850.0
3,100004000,201,0,1,1,2,700.0
4,100005000,203,0,1,3,2,1600.0
5,100006000,207,0,1,1,1,800.0
6,100007000,208,4,1,2,1,950.0
7,100008000,213,3,1,1,1,950.0
8,100009000,215,3,1,3,2,1800.0


# Min Max Normalizer of any column of the Data Set

In [56]:
mmx=MinMaxScaler()

In [57]:
street_name_fit=mmx.fit(df4[['ST_NUM']])

In [58]:
street_name_fit

In [59]:
df4['ST_NAME']=mmx.transform(df4[['ST_NUM']])

In [60]:
df4

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,0.0,1,3,1,1000.0
1,100002000,197,0.837838,2,3,1,950.0
2,100003000,197,0.837838,2,1,1,850.0
3,100004000,201,0.873874,1,1,2,700.0
4,100005000,203,0.891892,1,3,2,1600.0
5,100006000,207,0.927928,1,1,1,800.0
6,100007000,208,0.936937,1,2,1,950.0
7,100008000,213,0.981982,1,1,1,950.0
8,100009000,215,1.0,1,3,2,1800.0


# At a time fit and transform of PID column of df4

In [61]:
df4['PID']=mmx.fit_transform(df4[['PID']])

In [62]:
df4

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,0.0,104,0.0,1,3,1,1000.0
1,0.125,197,0.837838,2,3,1,950.0
2,0.25,197,0.837838,2,1,1,850.0
3,0.375,201,0.873874,1,1,2,700.0
4,0.5,203,0.891892,1,3,2,1600.0
5,0.625,207,0.927928,1,1,1,800.0
6,0.75,208,0.936937,1,2,1,950.0
7,0.875,213,0.981982,1,1,1,950.0
8,1.0,215,1.0,1,3,2,1800.0


# At a time fit and transform of SQ_FT column of df4 for different range (3,5)

In [63]:
mmx2=MinMaxScaler(feature_range=(3,5))
mmx2

In [64]:
df4['SQ_FT']=mmx2.fit_transform(df4[['SQ_FT']])

In [65]:
df4

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,0.0,104,0.0,1,3,1,3.545455
1,0.125,197,0.837838,2,3,1,3.454545
2,0.25,197,0.837838,2,1,1,3.272727
3,0.375,201,0.873874,1,1,2,3.0
4,0.5,203,0.891892,1,3,2,4.636364
5,0.625,207,0.927928,1,1,1,3.181818
6,0.75,208,0.936937,1,2,1,3.454545
7,0.875,213,0.981982,1,1,1,3.454545
8,1.0,215,1.0,1,3,2,5.0


In [66]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,0.0,104,0.0,1,3,1,3.545455
1,0.125,197,0.837838,2,3,1,3.454545
2,0.25,197,0.837838,2,1,1,3.272727
3,0.375,201,0.873874,1,1,2,3.0
4,0.5,203,0.891892,1,3,2,4.636364
5,0.625,207,0.927928,1,1,1,3.181818
6,0.75,208,0.936937,1,2,1,3.454545
7,0.875,213,0.981982,1,1,1,3.454545
8,1.0,215,1.0,1,3,2,5.0


In [67]:
df5


Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,2,1,3,1,1000.0
1,100002000,197,1,2,3,1,950.0
2,100003000,197,1,2,1,1,850.0
3,100004000,201,0,1,1,2,700.0
4,100005000,203,0,1,3,2,1600.0
5,100006000,207,0,1,1,1,800.0
6,100007000,208,4,1,2,1,950.0
7,100008000,213,3,1,1,1,950.0
8,100009000,215,3,1,3,2,1800.0


In [68]:
df=df5.copy()
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,100001000,104,2,1,3,1,1000.0
1,100002000,197,1,2,3,1,950.0
2,100003000,197,1,2,1,1,850.0
3,100004000,201,0,1,1,2,700.0
4,100005000,203,0,1,3,2,1600.0
5,100006000,207,0,1,1,1,800.0
6,100007000,208,4,1,2,1,950.0
7,100008000,213,3,1,1,1,950.0
8,100009000,215,3,1,3,2,1800.0


# Alternative way to Normalize of full data set

In [69]:
from sklearn.preprocessing import normalize
data_normalized = normalize(df)
data_normalized = pd.DataFrame(data_normalized, columns=df.columns)
data_normalized
     

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,1.0,1e-06,1.99998e-08,9.9999e-09,2.99997e-08,9.9999e-09,1e-05
1,1.0,2e-06,9.9998e-09,1.99996e-08,2.99994e-08,9.9998e-09,9e-06
2,1.0,2e-06,9.9997e-09,1.99994e-08,9.9997e-09,9.9997e-09,8e-06
3,1.0,2e-06,0.0,9.9996e-09,9.9996e-09,1.99992e-08,7e-06
4,1.0,2e-06,0.0,9.9995e-09,2.99985e-08,1.9999e-08,1.6e-05
5,1.0,2e-06,0.0,9.9994e-09,9.9994e-09,9.9994e-09,8e-06
6,1.0,2e-06,3.99972e-08,9.9993e-09,1.99986e-08,9.9993e-09,9e-06
7,1.0,2e-06,2.99976e-08,9.9992e-09,9.9992e-09,9.9992e-09,9e-06
8,1.0,2e-06,2.99973e-08,9.9991e-09,2.99973e-08,1.99982e-08,1.8e-05


# Standardization by doing standard deviation

In [70]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()

df9 = pd.DataFrame(std.fit_transform(df.values.reshape(-1, 1)))
     

In [71]:
df9

Unnamed: 0,0
0,2.449375
1,-0.408251
2,-0.408254
3,-0.408254
4,-0.408254
...,...
58,-0.408254
59,-0.408254
60,-0.408254
61,-0.408254


In [72]:
from sklearn .preprocessing import StandardScaler
ss=StandardScaler()
ss

In [73]:
df_scalestd=pd.DataFrame(ss.fit_transform(df),columns=df.columns)
df_scalestd

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,-1.549193,-2.779199,0.312348,-0.534522,1.06066,-0.707107,-0.188982
1,-1.161895,0.09619,-0.390434,1.870829,1.06066,-0.707107,-0.330719
2,-0.774597,0.09619,-0.390434,1.870829,-1.06066,-0.707107,-0.614192
3,-0.387298,0.219862,-1.093216,-0.534522,-1.06066,1.414214,-1.039402
4,0.0,0.281699,-1.093216,-0.534522,1.06066,1.414214,1.511858
5,0.387298,0.405371,-1.093216,-0.534522,-1.06066,-0.707107,-0.755929
6,0.774597,0.43629,1.717911,-0.534522,0.0,-0.707107,-0.330719
7,1.161895,0.59088,1.015129,-0.534522,-1.06066,-0.707107,-0.330719
8,1.549193,0.652717,1.015129,-0.534522,1.06066,1.414214,2.078805


# Robust Scaler

In [74]:
from sklearn.preprocessing import RobustScaler
robust=RobustScaler()
robust

In [75]:
robust_scaled_df=robust.fit_transform(df)
robust_scaled_df=pd.DataFrame(robust_scaled_df,columns=df.columns)
robust_scaled_df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,BEDROOMS,BATH,SQ_FT
0,-1.0,-9.0,0.333333,0.0,0.5,0.0,0.333333
1,-0.75,-0.545455,0.0,1.0,0.5,0.0,0.0
2,-0.5,-0.545455,0.0,1.0,-0.5,0.0,-0.666667
3,-0.25,-0.181818,-0.333333,0.0,-0.5,1.0,-1.666667
4,0.0,0.0,-0.333333,0.0,0.5,1.0,4.333333
5,0.25,0.363636,-0.333333,0.0,-0.5,0.0,-1.0
6,0.5,0.454545,1.0,0.0,0.0,0.0,0.0
7,0.75,0.909091,0.666667,0.0,-0.5,0.0,0.0
8,1.0,1.090909,0.666667,0.0,0.5,1.0,5.666667


# Transpose function

In [76]:
transposed_df=df.transpose()

In [77]:
transposed_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
PID,100001000.0,100002000.0,100003000.0,100004000.0,100005000.0,100006000.0,100007000.0,100008000.0,100009000.0
ST_NUM,104.0,197.0,197.0,201.0,203.0,207.0,208.0,213.0,215.0
ST_NAME,2.0,1.0,1.0,0.0,0.0,0.0,4.0,3.0,3.0
OWN_OCCUPIED,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
BEDROOMS,3.0,3.0,1.0,1.0,3.0,1.0,2.0,1.0,3.0
BATH,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0
SQ_FT,1000.0,950.0,850.0,700.0,1600.0,800.0,950.0,950.0,1800.0
