In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [9]:
data = pd.read_csv('/content/housing.csv')
print("Description of the data:")
print(data.describe())


Description of the data:
          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     35.631861           28.639486   2635.763081   
std        2.003532      2.135952           12.585558   2181.615252   
min     -124.350000     32.540000            1.000000      2.000000   
25%     -121.800000     33.930000           18.000000   1447.750000   
50%     -118.490000     34.260000           29.000000   2127.000000   
75%     -118.010000     37.710000           37.000000   3148.000000   
max     -114.310000     41.950000           52.000000  39320.000000   

       total_bedrooms    population    households  median_income  \
count    20433.000000  20640.000000  20640.000000   20640.000000   
mean       537.870553   1425.476744    499.539680       3.870671   
std        421.385070   1132.462122    382.329753       1.899822   
min          1.000000      3.000000      1.000000       0.49990

In [10]:
# b) Find data type and shape of each column
print("\nData types of each column:")
print(data.dtypes)
print("\nShape of the data:")
print(data.shape)


Data types of each column:
longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

Shape of the data:
(20640, 10)


In [11]:
# c) Find the null values (if yes fill the null values with ‘0’ or mean of that column)
null_values = data.isnull().sum()
print("\nNull values in the data:")
print(null_values)



Null values in the data:
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64


In [12]:
# d) find features and target variables
# Assuming the target variable is in the last column
features = data.iloc[:, :-1]
target = data.iloc[:, -1]
print(features)
print(target)


       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -122.23     37.88                41.0        880.0           129.0   
1        -122.22     37.86                21.0       7099.0          1106.0   
2        -122.24     37.85                52.0       1467.0           190.0   
3        -122.25     37.85                52.0       1274.0           235.0   
4        -122.25     37.85                52.0       1627.0           280.0   
...          ...       ...                 ...          ...             ...   
20635    -121.09     39.48                25.0       1665.0           374.0   
20636    -121.21     39.49                18.0        697.0           150.0   
20637    -121.22     39.43                17.0       2254.0           485.0   
20638    -121.32     39.43                18.0       1860.0           409.0   
20639    -121.24     39.37                16.0       2785.0           616.0   

       population  households  median_income  media

In [13]:
# e) Split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
print(X_train,y_train)
print(X_test,y_test)

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
14196    -117.03     32.71                33.0       3126.0           627.0   
8267     -118.16     33.77                49.0       3382.0           787.0   
17445    -120.48     34.66                 4.0       1897.0           331.0   
14265    -117.11     32.69                36.0       1421.0           367.0   
2271     -119.80     36.78                43.0       2382.0           431.0   
...          ...       ...                 ...          ...             ...   
11284    -117.96     33.78                35.0       1330.0           201.0   
11964    -117.43     34.02                33.0       3084.0           570.0   
5390     -118.38     34.03                36.0       2101.0           569.0   
860      -121.96     37.58                15.0       3575.0           597.0   
15795    -122.42     37.77                52.0       4226.0          1315.0   

       population  households  median_income  media

In [14]:
# f) Normalize the data with min-max scaling
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(X_train_scaled)
print(X_test_scaled)

[[0.72908367 0.01702128 0.62745098 ... 0.10228581 0.19032151 0.18144461]
 [0.61653386 0.12978723 0.94117647 ... 0.12415721 0.22845202 0.75690616]
 [0.38545817 0.22446809 0.05882353 ... 0.05508962 0.25216204 0.32494918]
 ...
 [0.59462151 0.15744681 0.68627451 ... 0.08649893 0.16789424 0.42701061]
 [0.23804781 0.53510638 0.2745098  ... 0.09176122 0.35994676 0.55360803]
 [0.19223108 0.55531915 1.         ... 0.20407828 0.14314285 0.63917468]]
[[0.53187251 0.37340426 0.47058824 ... 0.0588719  0.08146784 0.06742446]
 [0.48705179 0.27553191 0.56862745 ... 0.09587239 0.14009462 0.06350695]
 [0.19023904 0.55851064 1.         ... 0.15819766 0.2055282  1.        ]
 ...
 [0.22908367 0.50638298 0.47058824 ... 0.09324124 0.60205376 1.        ]
 [0.45717131 0.44893617 0.68627451 ... 0.07778326 0.15759093 0.1181459 ]
 [0.59561753 0.17765957 0.31372549 ... 0.07350765 0.21049365 0.2814442 ]]
