In [12]:
import pandas as pd

# Load the data
df = pd.read_csv('forecast_history.csv')

# Remember which columns came from the CSV. Rows are dropped further down based
# on missing values in these columns only, not in the derived columns added later.
raw_columns = list(df.columns)

# Forecaster label -> CSV column holding that forecaster's figures.
forecast_columns = {
    "Westpac": "Westpac: 4 year forecast",
    "Joe Bloggs": "Joe Bloggs: 2 year forecast",
    "Harry Spent": "Harry Spent: 5 year forecast",
}

# Display initial rows and data types
print("Initial Data:")
print(df.head())
print("\nData Types:")
print(df.dtypes)

# Check for non-numeric values in relevant columns
for column in ["Median house price", *forecast_columns.values()]:
    print(f"\nUnique values in {column}:")
    print(df[column].unique())

# Convert relevant columns to numeric, forcing errors to NaN.
# The forecast cells are strings such as "56%" (and "34$"), which pd.to_numeric
# cannot parse: coercing them directly turns every forecast into NaN, and the
# dropna() below would then discard the whole table. Strip the unit symbols
# first so that only genuinely unparseable cells become NaN.
df["Median house price"] = pd.to_numeric(df["Median house price"], errors='coerce')
for column in forecast_columns.values():
    # astype(str) keeps this working even if pandas parsed the column as numeric;
    # missing cells become the text "nan", which to_numeric reads back as NaN.
    as_text = df[column].astype(str).str.replace(r"[%$]", "", regex=True)
    # NOTE(review): treats a capital "I" as a mistyped digit 1 -- confirm against
    # the unique values printed above.
    as_text = as_text.str.replace("I", "1", regex=False)
    df[column] = pd.to_numeric(as_text.str.strip(), errors='coerce')

# Display the DataFrame after conversion
print("\nData after conversion to numeric:")
print(df.head())

# Calculate percentage changes.
# This is done BEFORE incomplete rows are removed: pct_change() compares each row
# with the one positioned directly above it, so computing it after the drop would
# silently compare non-adjacent years across every removed row.
# fill_method=None stops pandas from forward-filling over missing values, so a
# change next to a gap is reported as NaN instead of a multi-year figure.
# NOTE(review): the forecast columns already look like percentages ("56%"), so
# their row-to-row pct_change measures how the forecast itself moved (and yields
# inf / sign-flipped values for zero or negative forecasts). Confirm that this is
# the comparison intended against the actual price change.
df["Actual Change (%)"] = df["Median house price"].pct_change(fill_method=None) * 100
for name, column in forecast_columns.items():
    df[f"{name} Change (%)"] = df[column].pct_change(fill_method=None) * 100

# Calculate errors
for name in forecast_columns:
    df[f"{name} Error (%)"] = df[f"{name} Change (%)"] - df["Actual Change (%)"]

# Remove rows with missing values in any of the original CSV columns
df_before_drop = df.shape[0]
df = df.dropna(subset=raw_columns)
df_after_drop = df.shape[0]

# Display the number of rows before and after dropping NaNs
print(f"\nRows before dropping NaNs: {df_before_drop}")
print(f"Rows after dropping NaNs: {df_after_drop}")

# Print results
print("Analysis of Forecasters:")
print(df[["Unnamed: 0", "Median house price", "Westpac Error (%)", "Joe Bloggs Error (%)", "Harry Spent Error (%)"]])


Initial Data:
   Unnamed: 0 Median house price Westpac: 4 year forecast  \
0        2011             340000                      56%   
1        2012             370000                      53%   
2        2013             350000                      NaN   
3        2014             420000                      13%   
4        2015             425000                      33%   

  Joe Bloggs: 2 year forecast Harry Spent: 5 year forecast  
0                         23%                         -20%  
1                         34$                         -80%  
2                         19%                         -70%  
3                         42%                         -80%  
4                         23%                         -50%  

Data Types:
Unnamed: 0                       int64
Median house price              object
Westpac: 4 year forecast        object
Joe Bloggs: 2 year forecast     object
Harry Spent: 5 year forecast    object
dtype: object

Unique values in Median house 

In [13]:
import pandas as pd

# Load the data
df = pd.read_csv('forecast_history.csv')

# Remember which columns came from the CSV. Rows are dropped further down based
# on missing values in these columns only, not in the derived columns added later.
raw_columns = list(df.columns)

# Forecaster label -> CSV column holding that forecaster's figures.
forecast_columns = {
    "Westpac": "Westpac: 4 year forecast",
    "Joe Bloggs": "Joe Bloggs: 2 year forecast",
    "Harry Spent": "Harry Spent: 5 year forecast",
}

# Display initial rows and data types
print("Initial Data:")
print(df.head())
print("\nData Types:")
print(df.dtypes)

# Clean forecast columns: remove percentage signs, dollar signs, and fix typos
for column in forecast_columns.values():
    # astype(str) keeps the .str accessor usable even if pandas parsed the column
    # as numeric; missing cells become the text "nan", which to_numeric reads
    # back as NaN.
    as_text = df[column].astype(str)
    as_text = as_text.str.replace(r"[%$]", "", regex=True)  # Remove percentage and dollar signs
    as_text = as_text.str.replace('I', '1', regex=False)  # Fix typo: capital I typed for digit 1
    df[column] = pd.to_numeric(as_text.str.strip(), errors='coerce')  # Convert to numeric

# Convert Median house price to numeric
df["Median house price"] = pd.to_numeric(df["Median house price"], errors='coerce')

# Calculate percentage changes.
# This is done BEFORE incomplete rows are removed: pct_change() compares each row
# with the one positioned directly above it, so computing it after the drop would
# silently compare non-adjacent years across every removed row.
# fill_method=None stops pandas from forward-filling over missing values, so a
# change next to a gap is reported as NaN instead of a multi-year figure.
# NOTE(review): the forecast columns already look like percentages ("56%"), so
# their row-to-row pct_change measures how the forecast itself moved (and yields
# inf / sign-flipped values for zero or negative forecasts). Confirm that this is
# the comparison intended against the actual price change.
df["Actual Change (%)"] = df["Median house price"].pct_change(fill_method=None) * 100
for name, column in forecast_columns.items():
    df[f"{name} Change (%)"] = df[column].pct_change(fill_method=None) * 100

# Calculate errors
for name in forecast_columns:
    df[f"{name} Error (%)"] = df[f"{name} Change (%)"] - df["Actual Change (%)"]

# Remove rows with missing values in any of the original CSV columns
df = df.dropna(subset=raw_columns)

# Check how many rows are left after dropping NaNs
print(f"\nRows after cleaning: {df.shape[0]}")

# Print results
print("Analysis of Forecasters:")
print(df[["Unnamed: 0", "Median house price", "Westpac Error (%)", "Joe Bloggs Error (%)", "Harry Spent Error (%)"]])


Initial Data:
   Unnamed: 0 Median house price Westpac: 4 year forecast  \
0        2011             340000                      56%   
1        2012             370000                      53%   
2        2013             350000                      NaN   
3        2014             420000                      13%   
4        2015             425000                      33%   

  Joe Bloggs: 2 year forecast Harry Spent: 5 year forecast  
0                         23%                         -20%  
1                         34$                         -80%  
2                         19%                         -70%  
3                         42%                         -80%  
4                         23%                         -50%  

Data Types:
Unnamed: 0                       int64
Median house price              object
Westpac: 4 year forecast        object
Joe Bloggs: 2 year forecast     object
Harry Spent: 5 year forecast    object
dtype: object

Rows after cleaning: 11
Analys