In [1]:
# Import the numpy and pandas packages

import numpy as np
import pandas as pd

# Data Visualisation

import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
# Importing the Property.csv file

# Raw string: keeps the Windows backslashes literal instead of relying on
# unrecognised escape sequences such as "\P" and "\S", which newer Python
# versions warn about. read_csv already returns a DataFrame, so the extra
# pd.DataFrame(...) wrapper is not needed.
# NOTE: the name `property` shadows the Python builtin; it is kept because the
# later cells reference the frame under this name.
property = pd.read_csv(r"D:\Python\Statistics for Decision Making_17_June\Property.csv")

In [3]:
# Check the head of the dataset

property.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


# Data Inspection

In [4]:
# Dimensions of the dataset as (rows, columns)
property.shape

(13580, 21)

In [5]:
# Column names, non-null counts and dtypes
property.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
property.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


# Data Cleaning

In [7]:
# Count the missing entries in every column
null_values = property.isna().sum()
null_values

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [8]:
# Convert Data Type:

# Convert Date column to datetime.
# The raw values are day/month/year strings (e.g. "3/12/2016" is 3 December 2016),
# which is why the format is spelled out explicitly.
property['Date'] = pd.to_datetime(property['Date'], format='%d/%m/%Y')

In [9]:
# Handle Missing Values:

# Fill missing values in 'BuildingArea' and 'YearBuilt' with median values.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: that form is chained
# assignment, which recent pandas versions warn about and which does not
# update the parent frame when Copy-on-Write is enabled.
property['BuildingArea'] = property['BuildingArea'].fillna(property['BuildingArea'].median())
property['YearBuilt'] = property['YearBuilt'].fillna(property['YearBuilt'].median())

In [10]:
# Fill missing values in 'CouncilArea' by cross-referencing the values in Suburb, Regionname and Postcode.

# Distinct (Suburb, Regionname, Postcode, CouncilArea) combinations, keeping
# only the rows in which none of the four values is missing
lookup_columns = ['Suburb', 'Regionname', 'Postcode', 'CouncilArea']
reference_df = property[lookup_columns].drop_duplicates().dropna()

# Dictionary keyed by the (Suburb, Regionname, Postcode) tuple, giving the CouncilArea
key_columns = ['Suburb', 'Regionname', 'Postcode']
council_area_map = reference_df.set_index(key_columns)['CouncilArea'].to_dict()

# Function to fill missing 'CouncilArea' values
def fill_council_area(row):
    """Return the CouncilArea to use for one row of the property data.

    A row that already has a CouncilArea keeps it. Otherwise the value is
    looked up in ``council_area_map`` by the row's
    (Suburb, Regionname, Postcode) tuple; when the tuple is not in the map
    the original (missing) value is returned unchanged.
    """
    current_area = row['CouncilArea']
    if not pd.isnull(current_area):
        return current_area

    lookup_key = (row['Suburb'], row['Regionname'], row['Postcode'])
    return council_area_map.get(lookup_key, current_area)

# Apply the function to fill missing 'CouncilArea' values
# (axis=1 passes each row of the frame to fill_council_area in turn)
property['CouncilArea'] = property.apply(fill_council_area, axis=1)

In [11]:
# Fill missing value in 'Car' with 0.

# The result is assigned back to the column instead of calling
# fillna(0, inplace=True) on the column selection: that form is chained
# assignment, which recent pandas versions warn about and which does not
# update the parent frame when Copy-on-Write is enabled.
property['Car'] = property['Car'].fillna(0)

In [12]:
# Remove Duplicates:

# Rows that are identical in every column are dropped; the frame is rebound
# to the de-duplicated result.
property = property.drop_duplicates()

In [13]:
# Preview the first rows after the cleaning steps so far
property.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,2016-12-03,2.5,3067.0,...,1.0,1.0,202.0,126.0,1970.0,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,2016-02-04,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,2017-03-04,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,2017-03-04,2.5,3067.0,...,2.0,1.0,94.0,126.0,1970.0,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,2016-06-04,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [14]:
# Standardize categorical data

# Column -> name of the pandas .str casing method applied to it.
# Note: capitalize() upper-cases only the first character and lower-cases the
# rest, so multi-word values come out like "Hobsons bay".
casing_by_column = {
    'Suburb': 'capitalize',
    'Type': 'lower',
    'Method': 'upper',
    'SellerG': 'capitalize',
    'CouncilArea': 'capitalize',
    'Regionname': 'capitalize',
}
for column_name, casing in casing_by_column.items():
    property[column_name] = getattr(property[column_name].str, casing)()

In [15]:
# Verify data cleaning
print("Missing values after cleaning:\n", property.isnull().sum())
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line after the report.
property.info()
print(property.describe())

Missing values after cleaning:
 Suburb           0
Address          0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Date             0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
YearBuilt        0
CouncilArea      7
Lattitude        0
Longtitude       0
Regionname       0
Propertycount    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Suburb         13580 non-null  object        
 1   Address        13580 non-null  object        
 2   Rooms          13580 non-null  int64         
 3   Type           13580 non-null  object        
 4   Price          13580 non-null  float64       
 5   Method         13580 non-null  object        
 6   SellerG        13580 non-null  obj

'''
# Question 1:

For the suburb Altona, it is postulated that a typical property sells for $800,000.
Use the data at hand to test this assumption. 
Is the typical property price really $800,000 or has it increased? 
Use a significance level of 5%.

# Here are the steps we will follow:

Extract property prices for the suburb Altona from the dataset.
Calculate the sample mean and standard deviation of these prices.
Perform the one-sample t-test.
Determine if the observed mean is significantly different from $800,000 using a significance level of 5%.
'''

In [16]:
# Keep only the rows whose suburb is 'Altona'
is_altona = property['Suburb'] == 'Altona'
altona_property = property[is_altona]

In [17]:
# Preview the first Altona rows
altona_property.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
141,Altona,158 Queen St,3,h,520000.0,VB,Greg,2016-09-03,13.8,3018.0,...,2.0,1.0,352.0,242.0,2015.0,Hobsons bay,-37.87,144.825,Western metropolitan,5301.0
142,Altona,59 Bracken Gr,5,h,1525000.0,S,Greg,2016-12-03,13.8,3018.0,...,3.0,3.0,729.0,268.0,2005.0,Hobsons bay,-37.8721,144.8105,Western metropolitan,5301.0
143,Altona,1/123 Blyth St,2,t,720000.0,S,Hockingstuart,2016-02-04,13.8,3018.0,...,1.0,2.0,292.0,125.0,2013.0,Hobsons bay,-37.8687,144.8197,Western metropolitan,5301.0
144,Altona,4 Blyth St,3,h,1120000.0,S,Barlow,2017-03-04,13.8,3018.0,...,1.0,2.0,506.0,120.0,1930.0,Hobsons bay,-37.8693,144.8375,Western metropolitan,5301.0
145,Altona,20 Linnet St,4,h,780000.0,PI,Sweeney,2016-06-04,13.8,3018.0,...,1.0,4.0,655.0,126.0,1970.0,Hobsons bay,-37.868,144.8154,Western metropolitan,5301.0


In [18]:
# Sale prices of the Altona properties, with missing prices excluded
altona_prices = altona_property.loc[:, 'Price'].dropna()

In [19]:
# Descriptive statistics of the Altona prices: number of observations,
# mean and standard deviation
sample_size = altona_prices.shape[0]
mean_price = altona_prices.mean()
std_dev_price = altona_prices.std()

In [20]:
from scipy.stats import ttest_1samp

# One-sample t-test of the Altona prices against the postulated $800,000.
# NOTE(review): ttest_1samp is two-sided by default, while the question asks
# whether the price "has increased" - confirm whether alternative='greater'
# is the intended test.
t_statistic, p_value = ttest_1samp(altona_prices, 800000)

# Report the sample summary together with the test outcome
report_lines = [
    f"Mean price: {mean_price}",
    f"Standard deviation: {std_dev_price}",
    f"Sample size: {sample_size}",
    f"T-statistic: {t_statistic}",
    f"P-value: {p_value}",
]
print("\n".join(report_lines))

Mean price: 834830.4054054054
Standard deviation: 291546.04547367844
Sample size: 74
T-statistic: 1.0277020770199676
P-value: 0.307483271305555


In [21]:
# Determine if we reject the null hypothesis
# (H0: the mean Altona price equals the postulated $800,000).
# The messages now say "different from": the previous wording
# ("is not significantly $800,000") read as the opposite of what failing to
# reject H0 means.
alpha = 0.05
if p_value < alpha:
    print("We reject the null hypothesis. The typical property price for suburb Altona is significantly different from $800,000.")
else:
    print("We fail to reject the null hypothesis. The typical property price for suburb Altona is not significantly different from $800,000.")

We fail to reject the null hypothesis. The typical property price for suburb Altona is not significantly $800,000.


'''
# Question 2:

For the year 2016, is there any difference in prices of properties sold in the summer months vs winter months? 
Consider months from October till March as winter months and rest as summer months. 
Use a significance level of 5%.

# Here are the steps we will follow:

Filter the data for the year 2016.
Categorize the sales into winter months (October to March) and summer months (April to September).
Extract the prices for each category.
Perform an independent samples t-test.
Determine if the observed mean prices are significantly different using a significance level of 5%.
'''

In [22]:
from scipy.stats import ttest_ind

# Restrict the data to sales dated in 2016
sold_in_2016 = property['Date'].dt.year == 2016
property_2016 = property[sold_in_2016]

# Month numbers per season, following the question's definition
# (October to March = winter, April to September = summer)
winter_months = [10, 11, 12, 1, 2, 3]
summer_months = [4, 5, 6, 7, 8, 9]

# Split the 2016 sales by the month of the sale date
sale_month = property_2016['Date'].dt.month
winter_data = property_2016[sale_month.isin(winter_months)]
summer_data = property_2016[sale_month.isin(summer_months)]

# Prices of each group, with missing prices excluded
winter_prices = winter_data['Price'].dropna()
summer_prices = summer_data['Price'].dropna()

# Independent two-sample t-test comparing the two groups of prices
t_statistic, p_value = ttest_ind(winter_prices, summer_prices)

# Report the group means and the test outcome
print(f"Winter Prices Mean: {winter_prices.mean()}")
print(f"Summer Prices Mean: {summer_prices.mean()}")
print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

# Compare the p-value with the 5% significance level and state the decision
alpha = 0.05
conclusion = (
    "We reject the null hypothesis. There is a significant difference in property prices between winter and summer months."
    if p_value < alpha
    else "We fail to reject the null hypothesis. There is no significant difference in property prices between winter and summer months."
)
print(conclusion)

Winter Prices Mean: 1116647.5917391304
Summer Prices Mean: 1048054.7286917741
T-statistic: 4.043386317851058
P-value: 5.3309767667631686e-05
We reject the null hypothesis. There is a significant difference in property prices between winter and summer months.


# Question 3:

For the suburb of Abbotsford, what is the probability that out of 10 properties sold, 3 will not have car parking?
Use the column car in the dataset.
Round off your answer to 3 decimal places. 

In [23]:
from scipy.stats import binom

# Rows belonging to the suburb Abbotsford
in_abbotsford = property['Suburb'] == 'Abbotsford'
abbotsford_property = property[in_abbotsford]

# A property counts as "no car parking" when 'Car' is missing or equal to 0;
# the proportion of such properties is the per-sale probability
car_spaces = abbotsford_property['Car']
no_car_parking = (car_spaces.isna() | car_spaces.eq(0)).sum()
total_properties = abbotsford_property.shape[0]
p_no_car_parking = no_car_parking / total_properties

# Binomial probability of exactly k "no parking" properties among n sales
n, k = 10, 3
probability = binom.pmf(k, n, p_no_car_parking)

# Show the result rounded to 3 decimal places
print("The probability that out of 10 properties sold, 3 will not have car parking: " + format(probability, ".3f"))

The probability that out of 10 properties sold, 3 will not have car parking: 0.260


# Question 4:

In the suburb Abbotsford, what are the chances of finding a property with 3 rooms?
Round your answer to 3 decimal places.

In [24]:
# Share of the Abbotsford properties that have exactly 3 rooms
three_rooms = abbotsford_property['Rooms'].eq(3).sum()
p_three_rooms = three_rooms / total_properties

# Show the result rounded to 3 decimal places
print("The probability of finding a property with 3 rooms in Abbotsford: " + format(p_three_rooms, ".3f"))

The probability of finding a property with 3 rooms in Abbotsford: 0.357


# Question 5:

In the suburb Abbotsford, what are the chances of finding a property with 2 bathrooms?
Round your answer to 3 decimal places.

In [25]:
# Share of the Abbotsford properties that have exactly 2 bathrooms
two_bathrooms = abbotsford_property['Bathroom'].eq(2).sum()
p_two_bathrooms = two_bathrooms / total_properties

# Show the result rounded to 3 decimal places
print("The probability of finding a property with 2 bathrooms in Abbotsford: " + format(p_two_bathrooms, ".3f"))

The probability of finding a property with 2 bathrooms in Abbotsford: 0.339
