# Regression Problem

## Download the data

In [1]:
import os
import tarfile
import urllib
import urllib.request

In [2]:
# Remote location of the dataset and the local folder it is stored in
data_path = os.path.join("datasets", "housing")  # Local directory that holds the dataset
download_root = "https://github.com/ageron/handson-ml2/raw/refs/heads/master/"  # Base URL for downloads
data_url = f"{download_root}datasets/housing/housing.tgz"  # Full URL of the .tgz archive

In [3]:
# Function to download and extract the dataset
def fetch_data(housing_url=data_url, housing_path=data_path):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Local directory that receives ``housing.tgz`` and the extracted
        files. It is created if it does not exist.

    Returns
    -------
    None
    """
    os.makedirs(housing_path, exist_ok=True)  # Create directory if it doesn't exist
    tgz_path = os.path.join(housing_path, "housing.tgz")  # Path to save the .tgz file
    urllib.request.urlretrieve(housing_url, tgz_path)  # Download the .tgz file
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network, so use the "data" extraction
            # filter where this Python version provides it.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older Python without extraction filters: plain extraction.
            housing_tgz.extractall(path=housing_path)

# Download and extract the archive using the default URL and target directory
fetch_data()

In [3]:
# Read the extracted CSV with pandas
import pandas as pd
def load_data(path=data_path):
    """Read ``housing.csv`` from the directory ``path`` and return the result of ``pd.read_csv``."""
    return pd.read_csv(os.path.join(path, "housing.csv"))

## Take a quick look at the data structure

In [5]:
# Load the CSV from the default data directory and preview the first rows
housing = load_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
# Load the data again, but report failures with a printed message instead of a traceback.
try:
    housing_data = load_data()  # Load the data
    print(housing_data.head())  # Display the first few rows of the DataFrame
except FileNotFoundError:
    # Handled separately so the message can name the missing CSV file
    print("The file housing.csv was not found in the specified directory.")
except Exception as e:
    # Any other exception raised inside the try block is printed, not re-raised
    print(f"An error occurred: {e}")

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


In [7]:
# Per-column dtype and non-null count, plus row count and memory usage
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


Notice that the **total_bedrooms** column has only 20,433 non-null values, so 207 of the 20,640 rows are missing this value.

In [10]:
# Dimensions of the DataFrame as a (rows, columns) tuple
housing.shape

(20640, 10)