In [None]:
import pandas as pd

# Absolute, machine-specific location of the San Jose listings sample CSV.
file_path = "/Users/zeynepsalihoglu/Downloads/RealHaven/san_jose_houses_sample_data.csv"

# Read the CSV into the working DataFrame used by the cells below.
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to understand the structure.
df.head(n=5)

Unnamed: 0,Address,Price,Bedrooms,Bathrooms,Square Footage,Property Type
0,"4311 Main St, San Jose, CA 95132",$2850K,5,4,2596,Single Family
1,"7446 Main St, San Jose, CA 95158",$1483K,6,4,1796,Apartment
2,"2515 Main St, San Jose, CA 95160",$649K,4,4,3511,Apartment
3,"6960 Main St, San Jose, CA 95128",$1177K,1,4,1427,Apartment
4,"3687 Main St, San Jose, CA 95183",$2136K,2,2,4544,Condo


In [74]:
# Show the DataFrame's column labels.
# NOTE(review): the recorded output lists 'Street Address', 'City', 'State' and
# 'ZIP Code', which only exist after the address-splitting cell has run, so this
# cell appears to have been executed out of order — confirm with Restart & Run All.
df.columns

Index(['Street Address', 'City', 'State', 'ZIP Code', 'Price', 'Bedrooms',
       'Bathrooms', 'Square Footage', 'Property Type'],
      dtype='object')

In [76]:
# Number of rows in the DataFrame, taken from the length of its index.
num_rows = len(df.index)
num_rows

500

In [66]:
# Basic information about the dataset: index range, column names, non-null counts
# and dtypes. DataFrame.info() prints this report itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Address         500 non-null    object
 1   Price           500 non-null    object
 2   Bedrooms        500 non-null    int64 
 3   Bathrooms       500 non-null    int64 
 4   Square Footage  500 non-null    int64 
 5   Property Type   500 non-null    object
dtypes: int64(3), object(3)
memory usage: 23.6+ KB
None


In [67]:
# Count the missing values in each column (a count of 0 means the column has no nulls).
print(df.isna().sum())

Address           0
Price             0
Bedrooms          0
Bathrooms         0
Square Footage    0
Property Type     0
dtype: int64


In [68]:
# Normalise "Price": remove the "$", "K" and "," characters, parse what is left as
# a number, then re-attach a "K" suffix. The column ends up holding strings such
# as "2850K" (it is not yet numeric after this cell).
price_numeric = df["Price"].replace(to_replace={"[$K,]": ""}, regex=True).astype(float)
df["Price"] = price_numeric.apply(lambda amount: str(int(amount)) + "K")

# Cast the Bedrooms, Bathrooms and Square Footage columns to integers.
for count_column in ("Bedrooms", "Bathrooms", "Square Footage"):
    df[count_column] = df[count_column].astype(int)

# Parse the listing date only when the dataset actually provides that column.
if "Listing Date" in df.columns:
    df["Listing Date"] = pd.to_datetime(df["Listing Date"])

In [69]:
# Preview the first rows to check the result of the price clean-up
# (the "$" prefix is gone, the "K" suffix is kept).
df.head()

Unnamed: 0,Address,Price,Bedrooms,Bathrooms,Square Footage,Property Type
0,"4311 Main St, San Jose, CA 95132",2850K,5,4,2596,Single Family
1,"7446 Main St, San Jose, CA 95158",1483K,6,4,1796,Apartment
2,"2515 Main St, San Jose, CA 95160",649K,4,4,3511,Apartment
3,"6960 Main St, San Jose, CA 95128",1177K,1,4,1427,Apartment
4,"3687 Main St, San Jose, CA 95183",2136K,2,2,4544,Condo


In [70]:
# Expand "Price" from thousands notation (e.g. "2850K") to a whole-number amount:
# strip the "K" suffix, scale by 1,000 and store the result as integers.
price_in_thousands = df["Price"].str.replace("K", "").astype(float)
df["Price"] = (price_in_thousands * 1_000).astype(int)

In [71]:
# Preview the first rows to confirm "Price" now holds full integer amounts.
df.head()

Unnamed: 0,Address,Price,Bedrooms,Bathrooms,Square Footage,Property Type
0,"4311 Main St, San Jose, CA 95132",2850000,5,4,2596,Single Family
1,"7446 Main St, San Jose, CA 95158",1483000,6,4,1796,Apartment
2,"2515 Main St, San Jose, CA 95160",649000,4,4,3511,Apartment
3,"6960 Main St, San Jose, CA 95128",1177000,1,4,1427,Apartment
4,"3687 Main St, San Jose, CA 95183",2136000,2,2,4544,Condo


In [72]:
# Break the single "Address" string into street, city, state and ZIP components.
# The pattern expects "<street>, <city>, <2-character state> <5-digit ZIP>".
address_parts = df['Address'].str.extract(r'^(.*),\s*(.*),\s*(\w{2})\s*(\d{5})$')
df[['Street Address', 'City', 'State', 'ZIP Code']] = address_parts

# Desired layout: address pieces first, followed by the listing attributes.
column_order = [
    'Street Address', 'City', 'State', 'ZIP Code',
    'Price', 'Bedrooms', 'Bathrooms', 'Square Footage', 'Property Type',
]

# Discard the combined Address column, then apply the new column order.
df = df.drop(columns=['Address'])[column_order]
df.head()

Unnamed: 0,Street Address,City,State,ZIP Code,Price,Bedrooms,Bathrooms,Square Footage,Property Type
0,4311 Main St,San Jose,CA,95132,2850000,5,4,2596,Single Family
1,7446 Main St,San Jose,CA,95158,1483000,6,4,1796,Apartment
2,2515 Main St,San Jose,CA,95160,649000,4,4,3511,Apartment
3,6960 Main St,San Jose,CA,95128,1177000,1,4,1427,Apartment
4,3687 Main St,San Jose,CA,95183,2136000,2,2,4544,Condo


In [77]:
# Save the cleaned data to its own file instead of writing back over the raw
# input CSV. The cells above expect the raw layout (a combined "Address" column
# and "$...K" price strings), so overwriting the source file would destroy the
# raw data and break a fresh top-to-bottom run of this notebook.
updated_csv_path = "/Users/zeynepsalihoglu/Downloads/RealHaven/san_jose_houses_cleaned.csv"
df.to_csv(updated_csv_path, index=False)