In [3]:
import pandas as pd
# Display the installed pandas version so the outputs below can be tied to it.
pd.__version__


'1.5.3'

In [7]:
import csv

# Path of the housing dataset CSV analysed in this notebook.
# (The stray `with open(...)` that followed only created a csv.reader that was
# never used, so it has been removed.)
FILENAME = 'housing.csv'

def get_num_columns(filename):
    """Return the number of columns in the first row of a CSV file.

    Only the first row (normally the header) is inspected; later rows are
    never read, so rows with a different field count do not affect the result.

    Args:
        filename: Path of the CSV file to inspect.

    Returns:
        The number of fields in the first row, or 0 when the file has no rows.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation recommends for files handed to csv.reader.
    with open(filename, 'r', newline='') as file:
        # The default [] turns an empty file into a count of 0 instead of
        # letting StopIteration escape from next().
        first_row = next(csv.reader(file), [])
    return len(first_row)

# Use the FILENAME constant: no module-level `filename` exists, so the previous
# call only worked when that name was left over from an earlier kernel session.
num_columns = get_num_columns(FILENAME)
print("Number of columns:", num_columns)

Number of columns: 10


In [8]:
def get_columns_with_missing_data(filename):
    """Return the headers of every column that contains at least one empty cell.

    The first row is treated as the header row. A cell counts as missing when
    its text is the empty string.

    Args:
        filename: Path of the CSV file to scan.

    Returns:
        A list of header names, in the same left-to-right order as the columns
        appear in the file. The list is empty when nothing is missing or when
        the file has no header row.
    """
    missing_indices = set()
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation recommends for files handed to csv.reader.
    with open(filename, 'r', newline='') as file:
        csv_reader = csv.reader(file)
        # Read the first row to get the column headers; an empty file has none.
        headers = next(csv_reader, None)
        if headers is None:
            return []

        num_headers = len(headers)
        for row in csv_reader:
            # Check each column for missing data
            for index, value in enumerate(row):
                # Fields beyond the header row have no name to report, so they
                # are ignored rather than causing an IndexError below.
                if not value and index < num_headers:
                    missing_indices.add(index)

    # Sort the indices so the result follows column order instead of depending
    # on set iteration order.
    return [headers[index] for index in sorted(missing_indices)]

# Scan the dataset once, then list each affected column header on its own line.
columns_missing_data = get_columns_with_missing_data(FILENAME)

print('Columns with missing data:', *columns_missing_data, sep='\n')

Columns with missing data:
total_bedrooms


In [9]:
def get_unique_values_for_column(filename, header):
    """Collect the distinct non-empty values found in one named CSV column.

    Args:
        filename: Path of the CSV file; its first row supplies the column names.
        header: Name of the column whose values are collected.

    Returns:
        A set of the column's values. Falsy cells (empty strings, or None when
        the row has no entry for `header`) are left out, so an unknown header
        yields an empty set.
    """
    with open(filename, 'r') as handle:
        cells = (record.get(header) for record in csv.DictReader(handle))
        # filter(None, ...) keeps only truthy cells; the set is built while the
        # file is still open because the generator reads lazily.
        return set(filter(None, cells))


target_header = 'ocean_proximity'
# Use the FILENAME constant: no module-level `filename` exists, so the previous
# call only worked when that name was left over from an earlier kernel session.
unique_values = get_unique_values_for_column(FILENAME, target_header)

print(f'Unique values for the column with header "{target_header}":')
# unique_values is a set, so the order of the lines printed here is arbitrary.
for value in unique_values:
    print(value)

Unique values for the column with header "ocean_proximity":
ISLAND
NEAR BAY
<1H OCEAN
INLAND
NEAR OCEAN


In [10]:
def calculate_average_median_house_value_near_bay(filename):
    """Average the 'median_house_value' column over rows marked 'NEAR BAY'.

    Args:
        filename: Path of a CSV file with 'ocean_proximity' and
            'median_house_value' columns.

    Returns:
        The mean of the parseable 'median_house_value' entries among rows whose
        'ocean_proximity' is exactly 'NEAR BAY', or 0 when no such row
        contributes a value.
    """
    running_total = 0
    matched_rows = 0

    with open(filename, 'r') as handle:
        for record in csv.DictReader(handle):
            if record['ocean_proximity'] != 'NEAR BAY':
                continue
            try:
                house_value = float(record['median_house_value'])
            except ValueError:
                # Text that is not a valid float is left out of the average.
                continue
            running_total += house_value
            matched_rows += 1

    # Guard against dividing by zero when nothing matched.
    if matched_rows > 0:
        return running_total / matched_rows
    return 0

# Run the NEAR BAY aggregation on the dataset and report it with two decimals.
average_median_house_value_near_bay = calculate_average_median_house_value_near_bay(
    FILENAME
)
print(f'Average median house value for "NEAR BAY": ${average_median_house_value_near_bay:.2f}')

Average median house value for "NEAR BAY": $259212.31


In [11]:

df = pd.read_csv(FILENAME)

# Step 1: Calculate the initial average of the 'total_bedrooms' column
# (Series.mean() skips missing values by default).
initial_average = df['total_bedrooms'].mean()

# Step 2: Fill missing values in 'total_bedrooms' with the mean value.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the in-place form acts on an intermediate object and does not
# reliably update `df` once pandas Copy-on-Write is in effect.
df['total_bedrooms'] = df['total_bedrooms'].fillna(initial_average)

# Step 3: Calculate the average of 'total_bedrooms' again after filling missing values
average_after_filling = df['total_bedrooms'].mean()

print(f'Initial average of total_bedrooms: {initial_average:.2f}')
print(f'Average of total_bedrooms after filling missing values: {average_after_filling:.2f}')

# Save the DataFrame to a new CSV file with filled missing values
df.to_csv('file_with_filled_values.csv', index=False)

Initial average of total_bedrooms: 537.87
Average of total_bedrooms after filling missing values: 537.87


In [12]:
import numpy as np

df = pd.read_csv(FILENAME)

# Step 1: Keep the rows whose ocean_proximity is 'ISLAND', then the three feature columns
island_rows = df[df['ocean_proximity'] == 'ISLAND']
selected_rows = island_rows[['housing_median_age', 'total_rooms', 'total_bedrooms']]

# Step 2: Build X from the selection and multiply its transpose by X itself
X = selected_rows.to_numpy()
XTX = np.matmul(X.T, X)

# Step 3: Invert the product from step 2
XTX_inverse = np.linalg.inv(XTX)

# Step 4: Hard-coded y array; it has five entries, so the product in step 5
# is only defined when X has five rows
y = np.array([950, 1300, 800, 1000, 1300])

# Step 5: w = (X^T X)^-1 X^T y, grouped left to right
w = (XTX_inverse @ X.T) @ y

# Step 6: Take the final entry of w
last_element_of_w = w[-1]

print(f'The value of the last element of w is: {last_element_of_w:.2f}')


The value of the last element of w is: 5.70
