In [7]:
import pandas as pd
from datetime import datetime

## Loading and Parsing the CSV Data

In [8]:
# Load the CSV file
file_path = 'Resources/Property Sales Data/2013_to_2023_property-sales-data.csv'

dtype_dict = {
    'FinishedSqft': 'str',
    'Lotsize': 'str',
    'Sale_price': 'str'
}
data = pd.read_csv(file_path)

# Function to parse and standardize the date format
def parse_date(date_str):
    for fmt in ("%Y-%m-%d", "%m/%d/%Y", "%d-%b-%y", "%b-%y"):
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except ValueError:
            pass
    return None

# Apply the function to the 'Sale_date' column
data['Sale_date'] = data['Sale_date'].apply(parse_date)

# Check for any dates that couldn't be parsed
unparsed_dates = data[data['Sale_date'].isna()]
unparsed_dates_count = len(unparsed_dates)


  data = pd.read_csv(file_path)


## Cleaning Numeric Columns

In [9]:
# Function to clean numeric columns
def clean_numeric(column):
    # Ensure the column is a string, replace unwanted characters, and convert to numeric
    return pd.to_numeric(column.astype(str).str.replace('[\\$,]', '', regex=True).str.replace(',', ''), errors='coerce')


## Cleaning and Converting Key Numeric Columns


In [10]:
# Cleaning and converting the 'FinishedSqft', 'Lotsize', and 'Sale_price' columns
data['FinishedSqft'] = clean_numeric(data['FinishedSqft'])
data['Lotsize'] = clean_numeric(data['Lotsize'])
data['Sale_price'] = clean_numeric(data['Sale_price'])
data

Unnamed: 0,PropertyID,PropType,taxkey,Address,CondoProject,District,nbhd,Style,Extwall,Stories,Year_Built,Rooms,FinishedSqft,Units,Bdrms,Fbath,Hbath,Lotsize,Sale_date,Sale_price
0,98459.0,Commercial,30091000,9220 N 107TH ST,,9.0,6202.0,Office Building - 1 Story,Concrete Block,1.0,1981.0,,12960.0,5,,0.0,0.0,54885.0,2023-08-15,530000.0
1,98477.0,Residential,39995000,9574 N 107TH ST,,9.0,40.0,Cape Cod,Aluminum/Vinyl,1.0,1942.0,7.0,1182.0,1,4.0,1.0,1.0,33541.0,2023-12-15,160000.0
2,98500.0,Residential,40071000,9531 N MICHAEL CT,,9.0,40.0,Ranch,Fiber Cement/Hardiplank,1.0,2006.0,9.0,1880.0,1,3.0,2.0,0.0,10607.0,2023-08-31,387500.0
3,98566.0,Residential,50047000,9350 N JOYCE AV,,9.0,40.0,Ranch,Aluminum/Vinyl,1.0,1980.0,6.0,1489.0,1,3.0,3.0,0.0,8640.0,2023-08-25,335000.0
4,98598.0,Residential,50079000,9355 N BURBANK AV,,9.0,40.0,Ranch,Aluminum/Vinyl,1.0,1986.0,5.0,1209.0,1,3.0,1.0,1.0,7200.0,2023-06-20,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56227,,Commercial,4280769100,106 W SEEBOTH ST,,12.0,6275.0,"Store Building - Single tenant, 1 story",,1.0,1926.0,0.0,1533.0,1,0.0,0.0,0.0,421.0,2013-12-01,203000.0
56228,,Commercial,5000407000,2659 S KINNICKINNIC AV,,14.0,6282.0,Tavern,,2.0,1906.0,0.0,2880.0,1,0.0,0.0,0.0,3000.0,2013-12-01,225000.0
56229,,Commercial,3921041111,417 E CHICAGO ST,,4.0,6296.0,Warehouse Building - 1 Story,,1.0,1964.0,0.0,42141.0,1,0.0,0.0,0.0,91440.0,2013-12-01,10000000.0
56230,,Commercial,2452125100,4290 N TEUTONIA AV,,1.0,6234.0,Warehouse Building - 1 Story,,1.0,1951.0,0.0,11297.0,1,0.0,0.0,0.0,16403.0,2013-12-01,200000.0


In [11]:
data.info(type)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56232 entries, 0 to 56231
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PropertyID    32049 non-null  float64
 1   PropType      56224 non-null  object 
 2   taxkey        56232 non-null  int64  
 3   Address       56228 non-null  object 
 4   CondoProject  9164 non-null   object 
 5   District      56230 non-null  float64
 6   nbhd          56175 non-null  float64
 7   Style         56141 non-null  object 
 8   Extwall       45726 non-null  object 
 9   Stories       56063 non-null  float64
 10  Year_Built    56171 non-null  float64
 11  Rooms         54602 non-null  float64
 12  FinishedSqft  56131 non-null  float64
 13  Units         56232 non-null  int64  
 14  Bdrms         54603 non-null  float64
 15  Fbath         55577 non-null  float64
 16  Hbath         51392 non-null  float64
 17  Lotsize       56231 non-null  float64
 18  Sale_date     56232 non-nu

## Property Type distribution information 

In [12]:
data.PropType.value_counts()

PropType
Residential      43742
Condominium       8428
Commercial        2007
Lg Apartment      1916
Vacant Land         97
Manufacturing       29
Exempt               5
Name: count, dtype: int64

## Filtering residential properties

In [13]:
# data = data[data.PropType != 'Commercial']
# data = data[data.PropType != 'Vacant Land']
# data = data[data.PropType != 'Manufacturing']
# data = data[data.PropType != 'Exempt']
data = data[data.PropType == 'Residential']
data

Unnamed: 0,PropertyID,PropType,taxkey,Address,CondoProject,District,nbhd,Style,Extwall,Stories,Year_Built,Rooms,FinishedSqft,Units,Bdrms,Fbath,Hbath,Lotsize,Sale_date,Sale_price
1,98477.0,Residential,39995000,9574 N 107TH ST,,9.0,40.0,Cape Cod,Aluminum/Vinyl,1.0,1942.0,7.0,1182.0,1,4.0,1.0,1.0,33541.0,2023-12-15,160000.0
2,98500.0,Residential,40071000,9531 N MICHAEL CT,,9.0,40.0,Ranch,Fiber Cement/Hardiplank,1.0,2006.0,9.0,1880.0,1,3.0,2.0,0.0,10607.0,2023-08-31,387500.0
3,98566.0,Residential,50047000,9350 N JOYCE AV,,9.0,40.0,Ranch,Aluminum/Vinyl,1.0,1980.0,6.0,1489.0,1,3.0,3.0,0.0,8640.0,2023-08-25,335000.0
4,98598.0,Residential,50079000,9355 N BURBANK AV,,9.0,40.0,Ranch,Aluminum/Vinyl,1.0,1986.0,5.0,1209.0,1,3.0,1.0,1.0,7200.0,2023-06-20,250000.0
5,98624.0,Residential,50105000,8210 W GLENBROOK RD,,9.0,40.0,Colonial,Aluminum/Vinyl,2.0,1989.0,6.0,1663.0,1,3.0,2.0,1.0,7200.0,2023-01-10,270000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56178,,Residential,5820515000,3930 S PINE AV,,14.0,4700.0,Ranch,Aluminum / Vinyl,1.0,1951.0,0.0,725.0,1,2.0,1.0,0.0,5546.0,2013-12-01,97500.0
56179,,Residential,5410321000,3457 S ILLINOIS AV,,14.0,4600.0,Residence O/S,Aluminum / Vinyl,1.0,1926.0,0.0,937.0,1,2.0,1.0,0.0,4560.0,2013-12-01,149000.0
56180,,Residential,3240979000,1916 W WRIGHT ST,,15.0,2940.0,Duplex O/S,Aluminum / Vinyl,1.5,1893.0,0.0,1503.0,2,5.0,2.0,0.0,3600.0,2013-12-01,14000.0
56181,,Residential,3280710000,2529 N 46TH ST,,15.0,2520.0,Milwaukee Bungalow,Brick,1.0,1918.0,0.0,2059.0,1,4.0,2.0,0.0,4920.0,2013-12-01,102900.0


### Filtering out properties with sales price lower thn $50000

In [14]:
data = data[data.Sale_price >= 50000]
data

Unnamed: 0,PropertyID,PropType,taxkey,Address,CondoProject,District,nbhd,Style,Extwall,Stories,Year_Built,Rooms,FinishedSqft,Units,Bdrms,Fbath,Hbath,Lotsize,Sale_date,Sale_price
1,98477.0,Residential,39995000,9574 N 107TH ST,,9.0,40.0,Cape Cod,Aluminum/Vinyl,1.0,1942.0,7.0,1182.0,1,4.0,1.0,1.0,33541.0,2023-12-15,160000.0
2,98500.0,Residential,40071000,9531 N MICHAEL CT,,9.0,40.0,Ranch,Fiber Cement/Hardiplank,1.0,2006.0,9.0,1880.0,1,3.0,2.0,0.0,10607.0,2023-08-31,387500.0
3,98566.0,Residential,50047000,9350 N JOYCE AV,,9.0,40.0,Ranch,Aluminum/Vinyl,1.0,1980.0,6.0,1489.0,1,3.0,3.0,0.0,8640.0,2023-08-25,335000.0
4,98598.0,Residential,50079000,9355 N BURBANK AV,,9.0,40.0,Ranch,Aluminum/Vinyl,1.0,1986.0,5.0,1209.0,1,3.0,1.0,1.0,7200.0,2023-06-20,250000.0
5,98624.0,Residential,50105000,8210 W GLENBROOK RD,,9.0,40.0,Colonial,Aluminum/Vinyl,2.0,1989.0,6.0,1663.0,1,3.0,2.0,1.0,7200.0,2023-01-10,270000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56176,,Residential,5070444000,2937 S 9TH ST,,14.0,4180.0,Milwaukee Bungalow,Aluminum / Vinyl,1.0,1922.0,0.0,988.0,1,3.0,1.0,0.0,3660.0,2013-12-01,61500.0
56177,,Residential,5470325000,3812 S LOGAN AV,,14.0,4620.0,Ranch,Aluminum / Vinyl,1.0,1950.0,0.0,1119.0,1,3.0,1.0,0.0,5120.0,2013-12-01,65000.0
56178,,Residential,5820515000,3930 S PINE AV,,14.0,4700.0,Ranch,Aluminum / Vinyl,1.0,1951.0,0.0,725.0,1,2.0,1.0,0.0,5546.0,2013-12-01,97500.0
56179,,Residential,5410321000,3457 S ILLINOIS AV,,14.0,4600.0,Residence O/S,Aluminum / Vinyl,1.0,1926.0,0.0,937.0,1,2.0,1.0,0.0,4560.0,2013-12-01,149000.0


In [15]:
# droping columns with unique values b/c they can't be used in machine learning process
data = data.drop(columns=['PropertyID', 'taxkey', 'Address', 'CondoProject', 'PropType'])
data

Unnamed: 0,District,nbhd,Style,Extwall,Stories,Year_Built,Rooms,FinishedSqft,Units,Bdrms,Fbath,Hbath,Lotsize,Sale_date,Sale_price
1,9.0,40.0,Cape Cod,Aluminum/Vinyl,1.0,1942.0,7.0,1182.0,1,4.0,1.0,1.0,33541.0,2023-12-15,160000.0
2,9.0,40.0,Ranch,Fiber Cement/Hardiplank,1.0,2006.0,9.0,1880.0,1,3.0,2.0,0.0,10607.0,2023-08-31,387500.0
3,9.0,40.0,Ranch,Aluminum/Vinyl,1.0,1980.0,6.0,1489.0,1,3.0,3.0,0.0,8640.0,2023-08-25,335000.0
4,9.0,40.0,Ranch,Aluminum/Vinyl,1.0,1986.0,5.0,1209.0,1,3.0,1.0,1.0,7200.0,2023-06-20,250000.0
5,9.0,40.0,Colonial,Aluminum/Vinyl,2.0,1989.0,6.0,1663.0,1,3.0,2.0,1.0,7200.0,2023-01-10,270000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56176,14.0,4180.0,Milwaukee Bungalow,Aluminum / Vinyl,1.0,1922.0,0.0,988.0,1,3.0,1.0,0.0,3660.0,2013-12-01,61500.0
56177,14.0,4620.0,Ranch,Aluminum / Vinyl,1.0,1950.0,0.0,1119.0,1,3.0,1.0,0.0,5120.0,2013-12-01,65000.0
56178,14.0,4700.0,Ranch,Aluminum / Vinyl,1.0,1951.0,0.0,725.0,1,2.0,1.0,0.0,5546.0,2013-12-01,97500.0
56179,14.0,4600.0,Residence O/S,Aluminum / Vinyl,1.0,1926.0,0.0,937.0,1,2.0,1.0,0.0,4560.0,2013-12-01,149000.0


# Handling missing values

In [16]:
# Filling in missing needed values
# Most missing Extwall were condos. Filling with brick, since the remaining condos had brick.
# data['Extwall'] = data['Extwall'].fillna("brick")
# Filling with 0 for bdrms and rooms
# data['Bdrms'] = data['Bdrms'].fillna(0.0)
# Filling with 1 for stories
data['Stories'] = data['Stories'].fillna(1.0)
data

Unnamed: 0,District,nbhd,Style,Extwall,Stories,Year_Built,Rooms,FinishedSqft,Units,Bdrms,Fbath,Hbath,Lotsize,Sale_date,Sale_price
1,9.0,40.0,Cape Cod,Aluminum/Vinyl,1.0,1942.0,7.0,1182.0,1,4.0,1.0,1.0,33541.0,2023-12-15,160000.0
2,9.0,40.0,Ranch,Fiber Cement/Hardiplank,1.0,2006.0,9.0,1880.0,1,3.0,2.0,0.0,10607.0,2023-08-31,387500.0
3,9.0,40.0,Ranch,Aluminum/Vinyl,1.0,1980.0,6.0,1489.0,1,3.0,3.0,0.0,8640.0,2023-08-25,335000.0
4,9.0,40.0,Ranch,Aluminum/Vinyl,1.0,1986.0,5.0,1209.0,1,3.0,1.0,1.0,7200.0,2023-06-20,250000.0
5,9.0,40.0,Colonial,Aluminum/Vinyl,2.0,1989.0,6.0,1663.0,1,3.0,2.0,1.0,7200.0,2023-01-10,270000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56176,14.0,4180.0,Milwaukee Bungalow,Aluminum / Vinyl,1.0,1922.0,0.0,988.0,1,3.0,1.0,0.0,3660.0,2013-12-01,61500.0
56177,14.0,4620.0,Ranch,Aluminum / Vinyl,1.0,1950.0,0.0,1119.0,1,3.0,1.0,0.0,5120.0,2013-12-01,65000.0
56178,14.0,4700.0,Ranch,Aluminum / Vinyl,1.0,1951.0,0.0,725.0,1,2.0,1.0,0.0,5546.0,2013-12-01,97500.0
56179,14.0,4600.0,Residence O/S,Aluminum / Vinyl,1.0,1926.0,0.0,937.0,1,2.0,1.0,0.0,4560.0,2013-12-01,149000.0


# Converting Sale Date and extracting Year and Month 

In [17]:

# Converting 'Sale_date' to datetime with mixed format and adding sale-year and sale-month columns 
data['Sale_date'] = pd.to_datetime(data['Sale_date'], errors='coerce')
data['sale_year'] = data['Sale_date'].dt.year
data['sale_month'] = data['Sale_date'].dt.month
data

Unnamed: 0,District,nbhd,Style,Extwall,Stories,Year_Built,Rooms,FinishedSqft,Units,Bdrms,Fbath,Hbath,Lotsize,Sale_date,Sale_price,sale_year,sale_month
1,9.0,40.0,Cape Cod,Aluminum/Vinyl,1.0,1942.0,7.0,1182.0,1,4.0,1.0,1.0,33541.0,2023-12-15,160000.0,2023,12
2,9.0,40.0,Ranch,Fiber Cement/Hardiplank,1.0,2006.0,9.0,1880.0,1,3.0,2.0,0.0,10607.0,2023-08-31,387500.0,2023,8
3,9.0,40.0,Ranch,Aluminum/Vinyl,1.0,1980.0,6.0,1489.0,1,3.0,3.0,0.0,8640.0,2023-08-25,335000.0,2023,8
4,9.0,40.0,Ranch,Aluminum/Vinyl,1.0,1986.0,5.0,1209.0,1,3.0,1.0,1.0,7200.0,2023-06-20,250000.0,2023,6
5,9.0,40.0,Colonial,Aluminum/Vinyl,2.0,1989.0,6.0,1663.0,1,3.0,2.0,1.0,7200.0,2023-01-10,270000.0,2023,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56176,14.0,4180.0,Milwaukee Bungalow,Aluminum / Vinyl,1.0,1922.0,0.0,988.0,1,3.0,1.0,0.0,3660.0,2013-12-01,61500.0,2013,12
56177,14.0,4620.0,Ranch,Aluminum / Vinyl,1.0,1950.0,0.0,1119.0,1,3.0,1.0,0.0,5120.0,2013-12-01,65000.0,2013,12
56178,14.0,4700.0,Ranch,Aluminum / Vinyl,1.0,1951.0,0.0,725.0,1,2.0,1.0,0.0,5546.0,2013-12-01,97500.0,2013,12
56179,14.0,4600.0,Residence O/S,Aluminum / Vinyl,1.0,1926.0,0.0,937.0,1,2.0,1.0,0.0,4560.0,2013-12-01,149000.0,2013,12


# Standardizing 

In [18]:
# Standardizing data
# Standardize column names to lowercase
data.columns = [col.lower() for col in data.columns]
data

Unnamed: 0,district,nbhd,style,extwall,stories,year_built,rooms,finishedsqft,units,bdrms,fbath,hbath,lotsize,sale_date,sale_price,sale_year,sale_month
1,9.0,40.0,Cape Cod,Aluminum/Vinyl,1.0,1942.0,7.0,1182.0,1,4.0,1.0,1.0,33541.0,2023-12-15,160000.0,2023,12
2,9.0,40.0,Ranch,Fiber Cement/Hardiplank,1.0,2006.0,9.0,1880.0,1,3.0,2.0,0.0,10607.0,2023-08-31,387500.0,2023,8
3,9.0,40.0,Ranch,Aluminum/Vinyl,1.0,1980.0,6.0,1489.0,1,3.0,3.0,0.0,8640.0,2023-08-25,335000.0,2023,8
4,9.0,40.0,Ranch,Aluminum/Vinyl,1.0,1986.0,5.0,1209.0,1,3.0,1.0,1.0,7200.0,2023-06-20,250000.0,2023,6
5,9.0,40.0,Colonial,Aluminum/Vinyl,2.0,1989.0,6.0,1663.0,1,3.0,2.0,1.0,7200.0,2023-01-10,270000.0,2023,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56176,14.0,4180.0,Milwaukee Bungalow,Aluminum / Vinyl,1.0,1922.0,0.0,988.0,1,3.0,1.0,0.0,3660.0,2013-12-01,61500.0,2013,12
56177,14.0,4620.0,Ranch,Aluminum / Vinyl,1.0,1950.0,0.0,1119.0,1,3.0,1.0,0.0,5120.0,2013-12-01,65000.0,2013,12
56178,14.0,4700.0,Ranch,Aluminum / Vinyl,1.0,1951.0,0.0,725.0,1,2.0,1.0,0.0,5546.0,2013-12-01,97500.0,2013,12
56179,14.0,4600.0,Residence O/S,Aluminum / Vinyl,1.0,1926.0,0.0,937.0,1,2.0,1.0,0.0,4560.0,2013-12-01,149000.0,2013,12


In [19]:
# Standardizig categorical columns to consistent capitalization
# data['proptype'] = data['proptype'].str.lower()
data['style'] = data['style'].str.lower()
data['extwall'] = data['extwall'].str.lower()
data

Unnamed: 0,district,nbhd,style,extwall,stories,year_built,rooms,finishedsqft,units,bdrms,fbath,hbath,lotsize,sale_date,sale_price,sale_year,sale_month
1,9.0,40.0,cape cod,aluminum/vinyl,1.0,1942.0,7.0,1182.0,1,4.0,1.0,1.0,33541.0,2023-12-15,160000.0,2023,12
2,9.0,40.0,ranch,fiber cement/hardiplank,1.0,2006.0,9.0,1880.0,1,3.0,2.0,0.0,10607.0,2023-08-31,387500.0,2023,8
3,9.0,40.0,ranch,aluminum/vinyl,1.0,1980.0,6.0,1489.0,1,3.0,3.0,0.0,8640.0,2023-08-25,335000.0,2023,8
4,9.0,40.0,ranch,aluminum/vinyl,1.0,1986.0,5.0,1209.0,1,3.0,1.0,1.0,7200.0,2023-06-20,250000.0,2023,6
5,9.0,40.0,colonial,aluminum/vinyl,2.0,1989.0,6.0,1663.0,1,3.0,2.0,1.0,7200.0,2023-01-10,270000.0,2023,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56176,14.0,4180.0,milwaukee bungalow,aluminum / vinyl,1.0,1922.0,0.0,988.0,1,3.0,1.0,0.0,3660.0,2013-12-01,61500.0,2013,12
56177,14.0,4620.0,ranch,aluminum / vinyl,1.0,1950.0,0.0,1119.0,1,3.0,1.0,0.0,5120.0,2013-12-01,65000.0,2013,12
56178,14.0,4700.0,ranch,aluminum / vinyl,1.0,1951.0,0.0,725.0,1,2.0,1.0,0.0,5546.0,2013-12-01,97500.0,2013,12
56179,14.0,4600.0,residence o/s,aluminum / vinyl,1.0,1926.0,0.0,937.0,1,2.0,1.0,0.0,4560.0,2013-12-01,149000.0,2013,12


In [20]:
# Handling Missing Values
# For simplicity, we'll drop rows with missing target values (sale_price) and rows with excessive missing values.
data_cleaned = data.dropna(subset=['sale_price', 'sale_date', 'finishedsqft', 'district', 'nbhd', 'lotsize', 'hbath', 'fbath', 'rooms', 'extwall'])
data_cleaned

Unnamed: 0,district,nbhd,style,extwall,stories,year_built,rooms,finishedsqft,units,bdrms,fbath,hbath,lotsize,sale_date,sale_price,sale_year,sale_month
1,9.0,40.0,cape cod,aluminum/vinyl,1.0,1942.0,7.0,1182.0,1,4.0,1.0,1.0,33541.0,2023-12-15,160000.0,2023,12
2,9.0,40.0,ranch,fiber cement/hardiplank,1.0,2006.0,9.0,1880.0,1,3.0,2.0,0.0,10607.0,2023-08-31,387500.0,2023,8
3,9.0,40.0,ranch,aluminum/vinyl,1.0,1980.0,6.0,1489.0,1,3.0,3.0,0.0,8640.0,2023-08-25,335000.0,2023,8
4,9.0,40.0,ranch,aluminum/vinyl,1.0,1986.0,5.0,1209.0,1,3.0,1.0,1.0,7200.0,2023-06-20,250000.0,2023,6
5,9.0,40.0,colonial,aluminum/vinyl,2.0,1989.0,6.0,1663.0,1,3.0,2.0,1.0,7200.0,2023-01-10,270000.0,2023,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56176,14.0,4180.0,milwaukee bungalow,aluminum / vinyl,1.0,1922.0,0.0,988.0,1,3.0,1.0,0.0,3660.0,2013-12-01,61500.0,2013,12
56177,14.0,4620.0,ranch,aluminum / vinyl,1.0,1950.0,0.0,1119.0,1,3.0,1.0,0.0,5120.0,2013-12-01,65000.0,2013,12
56178,14.0,4700.0,ranch,aluminum / vinyl,1.0,1951.0,0.0,725.0,1,2.0,1.0,0.0,5546.0,2013-12-01,97500.0,2013,12
56179,14.0,4600.0,residence o/s,aluminum / vinyl,1.0,1926.0,0.0,937.0,1,2.0,1.0,0.0,4560.0,2013-12-01,149000.0,2013,12


## Handling Dupicates in the data 


In [21]:
# Removing Duplicate Records
data_cleaned = data_cleaned.drop_duplicates()
data_cleaned

Unnamed: 0,district,nbhd,style,extwall,stories,year_built,rooms,finishedsqft,units,bdrms,fbath,hbath,lotsize,sale_date,sale_price,sale_year,sale_month
1,9.0,40.0,cape cod,aluminum/vinyl,1.0,1942.0,7.0,1182.0,1,4.0,1.0,1.0,33541.0,2023-12-15,160000.0,2023,12
2,9.0,40.0,ranch,fiber cement/hardiplank,1.0,2006.0,9.0,1880.0,1,3.0,2.0,0.0,10607.0,2023-08-31,387500.0,2023,8
3,9.0,40.0,ranch,aluminum/vinyl,1.0,1980.0,6.0,1489.0,1,3.0,3.0,0.0,8640.0,2023-08-25,335000.0,2023,8
4,9.0,40.0,ranch,aluminum/vinyl,1.0,1986.0,5.0,1209.0,1,3.0,1.0,1.0,7200.0,2023-06-20,250000.0,2023,6
5,9.0,40.0,colonial,aluminum/vinyl,2.0,1989.0,6.0,1663.0,1,3.0,2.0,1.0,7200.0,2023-01-10,270000.0,2023,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56176,14.0,4180.0,milwaukee bungalow,aluminum / vinyl,1.0,1922.0,0.0,988.0,1,3.0,1.0,0.0,3660.0,2013-12-01,61500.0,2013,12
56177,14.0,4620.0,ranch,aluminum / vinyl,1.0,1950.0,0.0,1119.0,1,3.0,1.0,0.0,5120.0,2013-12-01,65000.0,2013,12
56178,14.0,4700.0,ranch,aluminum / vinyl,1.0,1951.0,0.0,725.0,1,2.0,1.0,0.0,5546.0,2013-12-01,97500.0,2013,12
56179,14.0,4600.0,residence o/s,aluminum / vinyl,1.0,1926.0,0.0,937.0,1,2.0,1.0,0.0,4560.0,2013-12-01,149000.0,2013,12


In [22]:
#checking for null values
print(data_cleaned.isnull().sum())

null_rows = data_cleaned[data_cleaned.isnull().any(axis=1)]
null_rows

district        0
nbhd            0
style           0
extwall         0
stories         0
year_built      0
rooms           0
finishedsqft    0
units           0
bdrms           0
fbath           0
hbath           0
lotsize         0
sale_date       0
sale_price      0
sale_year       0
sale_month      0
dtype: int64


Unnamed: 0,district,nbhd,style,extwall,stories,year_built,rooms,finishedsqft,units,bdrms,fbath,hbath,lotsize,sale_date,sale_price,sale_year,sale_month


In [23]:
#investigating null condo extwall entries
#condos = data_cleaned[data_cleaned['proptype'].str.contains("condominium")]
#condos_not_null = condos[condos['extwall'].notnull()]
#condos_not_null

#extwall = condos['extwall']

In [24]:
# Resetting the index
data_cleaned = data_cleaned.reset_index(drop=True)
data_cleaned

Unnamed: 0,district,nbhd,style,extwall,stories,year_built,rooms,finishedsqft,units,bdrms,fbath,hbath,lotsize,sale_date,sale_price,sale_year,sale_month
0,9.0,40.0,cape cod,aluminum/vinyl,1.0,1942.0,7.0,1182.0,1,4.0,1.0,1.0,33541.0,2023-12-15,160000.0,2023,12
1,9.0,40.0,ranch,fiber cement/hardiplank,1.0,2006.0,9.0,1880.0,1,3.0,2.0,0.0,10607.0,2023-08-31,387500.0,2023,8
2,9.0,40.0,ranch,aluminum/vinyl,1.0,1980.0,6.0,1489.0,1,3.0,3.0,0.0,8640.0,2023-08-25,335000.0,2023,8
3,9.0,40.0,ranch,aluminum/vinyl,1.0,1986.0,5.0,1209.0,1,3.0,1.0,1.0,7200.0,2023-06-20,250000.0,2023,6
4,9.0,40.0,colonial,aluminum/vinyl,2.0,1989.0,6.0,1663.0,1,3.0,2.0,1.0,7200.0,2023-01-10,270000.0,2023,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37905,14.0,4180.0,milwaukee bungalow,aluminum / vinyl,1.0,1922.0,0.0,988.0,1,3.0,1.0,0.0,3660.0,2013-12-01,61500.0,2013,12
37906,14.0,4620.0,ranch,aluminum / vinyl,1.0,1950.0,0.0,1119.0,1,3.0,1.0,0.0,5120.0,2013-12-01,65000.0,2013,12
37907,14.0,4700.0,ranch,aluminum / vinyl,1.0,1951.0,0.0,725.0,1,2.0,1.0,0.0,5546.0,2013-12-01,97500.0,2013,12
37908,14.0,4600.0,residence o/s,aluminum / vinyl,1.0,1926.0,0.0,937.0,1,2.0,1.0,0.0,4560.0,2013-12-01,149000.0,2013,12


In [25]:
# Saving the cleaned data back to a CSV file

cleaned_file_path = 'Resources/clean_property_data.csv'
data_cleaned.to_csv(cleaned_file_path, index=False)