In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Team 4
## Real Estate Score Card

The data we are analyzing is related to real estate prices from Zillow. The data is in the form of a CSV file, containing the following columns:
- **RegionID**: Unique identifier for the region
- **SizeRank**: Rank of the region based on size
- **RegionName**: Name of the region
- **RegionType**: Type of the region: neighborhood
- **StateName**: Name of the state
- **State**: Abbreviation of the state
- **City**
- **Metro**
- **CountyName**
- **Series of Dates from 2000-01 to 2024-09**, incremented per month

The value we analyze for each region and month is the Zillow Home Value Index (ZHVI).

**Zillow Home Value Index (ZHVI)**: A measure of the typical home value and market changes across a given region and housing type. It reflects the typical value for homes in the 35th to 65th percentile range. More info about ZHVI: [Zillow Research](https://www.zillow.com/research/methodology-neural-zhvi-32128/)

Our analysis focuses on the Chicago area, covering all of its neighborhoods and associated regions.


In [None]:
# Load the Zillow neighborhood-level ZHVI export
Neighborhoods_zillow_df = pd.read_csv('Neighborhood_zillow.csv')
# Work on an independent copy so the frame loaded above is not altered by edits to `df`
df = Neighborhoods_zillow_df.copy()
print("shape:", df.shape)


In [None]:
# Keep only the rows whose City is Chicago
chicago_df = df[df['City'] == 'Chicago']

# Report how many distinct neighborhoods remain after the Chicago filter.
# The count is taken on chicago_df: counting on the unfiltered `df` would
# report every region name in the file, not just the Chicago ones.
print("Number of unique region names in Chicago:", chicago_df['RegionName'].unique().size)

# Remove the columns that are not needed
chicago_df = chicago_df.drop(columns=['StateName', 'Metro', 'CountyName'])

# Save the Chicago-only frame to CSV
chicago_df.to_csv('Chicago_Neighborhoods.csv', index=False)




In [None]:
# Split chicago_df into one dataframe per side of the city


def _regions_subset(region_names):
    """Return the rows of chicago_df whose RegionName is listed in region_names."""
    return chicago_df[chicago_df['RegionName'].isin(region_names)]


# North Side
north_side_regions = [
    'Rogers Park', 'Edgewater', 'Uptown', 'Lake View', 'Lincoln Park',
    'North Center', 'Lincoln Square', 'West Ridge', 'Irving Park',
    'Albany Park', 'Avondale',
]
north_side_df = _regions_subset(north_side_regions)

# South Side
south_side_regions = [
    'Armour Square', 'Bridgeport', 'Brighton Park', 'New City (Back of the Yards)',
    'Englewood', 'Greater Grand Crossing', 'Hyde Park', 'Kenwood', 'Oakland',
    'South Shore', 'Washington Park', 'Woodlawn', 'Chatham', 'South Chicago',
    'Auburn Gresham', 'Calumet Heights', 'Roseland', 'Pullman', 'West Pullman',
    'Riverdale',
]
south_side_df = _regions_subset(south_side_regions)

# East Side
east_side_regions = ['Hegewisch', 'East Side', 'South Shore', 'Hyde Park', 'Kenwood']
east_side_df = _regions_subset(east_side_regions)

# West Side
west_side_regions = [
    'Austin', 'East Garfield Park', 'West Garfield Park', 'North Lawndale',
    'South Lawndale (Little Village)', 'Humboldt Park', 'Near West Side', 'West Town',
]
west_side_df = _regions_subset(west_side_regions)

# Northwest Side
northwest_side_regions = [
    'Jefferson Park', 'Portage Park', 'Norwood Park', 'Dunning', 'Belmont Cragin',
    'Montclare', 'Irving Park', 'Hermosa',
]
northwest_side_df = _regions_subset(northwest_side_regions)

# Southwest Side
southwest_side_regions = [
    'Garfield Ridge', 'Archer Heights', 'Brighton Park', 'Gage Park', 'West Elsdon',
    'West Lawn', 'Chicago Lawn (Marquette Park)', 'Ashburn', 'Clearing',
]
southwest_side_df = _regions_subset(southwest_side_regions)

# Quick sanity check: show the first few North Side rows
print("North Side neighborhoods: \n", north_side_df.head(3))

In [None]:
# Unpivot date columns so data is more rectangular
def reshape_dates(df):
    """Unpivot the monthly value columns of a wide frame into long format.

    Every column whose name starts with '20' is treated as a date column.
    The result has one row per (region, date) with the columns
    RegionID, SizeRank, RegionName, Date (datetime) and ZHVI.
    """
    id_columns = ['RegionID', 'SizeRank', 'RegionName']
    month_columns = list(filter(lambda name: name.startswith('20'), df.columns))

    # One row per region and month instead of one column per month
    long_df = pd.melt(
        df,
        id_vars=id_columns,
        value_vars=month_columns,
        var_name='Date',
        value_name='ZHVI',
    )

    # The melted column labels are strings; convert them to datetime
    long_df['Date'] = pd.to_datetime(long_df['Date'])
    return long_df

# Long-format ZHVI for every Chicago neighborhood; preview the first rows
reshaped_df = chicago_df.pipe(reshape_dates)
reshaped_df.head()


In [None]:
# Restrict the long-format frame to the Lake View neighborhood
lake_view_df = reshaped_df.loc[reshaped_df['RegionName'] == 'Lake View']

# Line chart of Lake View's ZHVI across the available months
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=lake_view_df, x='Date', y='ZHVI', ax=ax)
ax.set_title("ZHVI Value Over Time for Lake View")
ax.set_xlabel("Date")
ax.set_ylabel("ZHVI")
plt.show()

In [None]:
# Load the 2020 census extract and list its row labels
cens = pd.read_csv('Data/Census-2020.csv')
# to_list must be *called*; the bare attribute only shows the bound-method repr
cens['Label (Grouping)'].to_list()

In [None]:
def clean_census(year):
    """Load one year's census extract and reshape it to one row per ZIP Code.

    Reads ``Data/Census-<year>.csv``, keeps only the columns whose header
    contains ``Total!!Estimate``, transposes so the row labels become columns,
    prefixes each sub-label with the most recent all-uppercase section label so
    column names are unique, and converts text values to numbers where possible.

    Parameters
    ----------
    year : int
        Year used to build the file name and stored in the ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        One row per selected source column, with a ``ZIP Code`` column
        (characters 6-10 of the source header), the cleaned measures and
        ``Year``. Columns that cannot be converted are left as text.
    """
    filename = 'Data/Census-' + str(year) + '.csv'
    raw_df = pd.read_csv(filename)

    # Filter only Total Estimate columns. Copy so adding 'Label' below writes
    # to an independent frame rather than to a slice of raw_df.
    df = raw_df.loc[:, raw_df.columns.str.contains('Total!!Estimate')].copy()
    df['Label'] = raw_df['Label (Grouping)']

    # Transpose columns and rows
    df = df.set_index('Label').transpose().reset_index()

    # Rename columns that have Year in the name
    df = df.rename(columns={'index': 'ZIP Code', 
                            'INCOME IN THE PAST 12 MONTHS (IN ' + str(year) + ' INFLATION-ADJUSTED DOLLARS)':
                            'INCOME IN THE PAST 12 MONTHS (IN INFLATION-ADJUSTED DOLLARS)', 
                            'EARNINGS IN THE PAST 12 MONTHS (IN ' + str(year) + ' INFLATION-ADJUSTED DOLLARS) FOR FULL-TIME, YEAR-ROUND WORKERS': 
                            'EARNINGS IN THE PAST 12 MONTHS (IN INFLATION-ADJUSTED DOLLARS) FOR FULL-TIME, YEAR-ROUND WORKERS'})

    # Rename columns so each is unique: an all-uppercase label starts a new
    # section and becomes the prefix of the labels that follow it.
    new_columns = []
    prefix = ''
    for col in df.columns:
        col = col.replace('\xa0', '')
        if col.isupper():
            prefix = col
            new_columns.append(col)
        else:
            new_columns.append(f"{prefix} - {col}")

    df.columns = new_columns
    df.columns = df.columns.str.strip()

    # Remove columns with all NaN values
    df = df.dropna(axis=1, how='all')

    # These columns were causing trouble, so remove them. A boolean column
    # mask is used (and the result kept) so the drop actually takes effect
    # and also works if the same label occurs more than once.
    troublesome = df.columns.str.lower().str.contains('with related children', regex=False)
    df = df.loc[:, ~troublesome].copy()

    df = df.rename(columns={'- ZIP Code': 'ZIP Code', '- Total population': 'Total population'})
    df['ZIP Code'] = df['ZIP Code'].str[6:11]

    for col in df.columns:
        try:
            # Only text columns need converting
            column_dtype = df[col].dtype
            if not (column_dtype == 'object' or isinstance(column_dtype, pd.StringDtype)):
                continue

            # Remove thousands separators
            df[col] = df[col].replace({',': ''}, regex=True)

            if df[col].str.contains('%', na=False).any():
                # Percentages: strip the sign and scale to a 0-1 fraction
                df[col] = df[col].str.rstrip('%').astype('float') / 100
            else:
                df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError, AttributeError):
            # Best effort: a column that cannot be parsed as numbers is kept
            # as text instead of aborting the whole clean-up.
            continue

    # Add year column
    df['Year'] = year

    return df


# Clean every yearly census extract (2011 through 2022). The frames are
# collected in a list so the notebook-level `df` is not overwritten and the
# data is concatenated once instead of once per year.
census_frames = [clean_census(year) for year in range(2011, 2023)]

# Only columns present in every year can be stacked; keep the first year's order
common_columns = census_frames[0].columns
for frame in census_frames[1:]:
    common_columns = common_columns.intersection(frame.columns)

census_df = pd.concat(
    [frame[common_columns] for frame in census_frames],
    ignore_index=True,
)

# ZIP Codes are identifiers, not quantities: store them as text
census_df['ZIP Code'] = census_df['ZIP Code'].astype(str)

# Set display options to show all rows and columns
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
census_df

In [None]:
# Mean of 'Total population' per ZIP Code across the available years
grouped_df = (
    census_df
    .groupby('ZIP Code')['Total population']
    .mean()
    .reset_index()
)

# Ten ZIP Codes with the largest average population
top_10 = grouped_df.sort_values('Total population', ascending=False).head(10)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_10, x='ZIP Code', y='Total population', ax=ax)
ax.set_title('Top 10 ZIP Codes by Average Total Population')
plt.show()

This visualization shows the top 10 ZIP codes by average total population, highlighting the most populous areas, which can indicate regions with greater housing demand, more community resources, and potential market opportunities for real estate investment.

In [None]:
# Clean up populations csv's
def clean_population(year):
    """Load one ZIP-level population file and return it with one row per ZIP Code.

    Reads ``Chicago_ZIP_Populations_<year>.csv``, turns the source columns into
    rows, names the result columns 'Zip Code' and 'Total', drops the first six
    characters of each ZIP label, and adds a 'Year' column.
    """
    raw = pd.read_csv(f'Chicago_ZIP_Populations_{year}.csv')

    # Source columns become rows; the old column labels land in the first column
    by_zip = raw.set_index('Label (Grouping)').T.reset_index()
    by_zip.columns = ['Zip Code', 'Total']

    # Strip the six-character prefix that precedes the ZIP Code
    by_zip['Zip Code'] = by_zip['Zip Code'].str.slice(6)
    by_zip['Year'] = year

    return by_zip

# Clean the two population files
pop_2010, pop_2020 = (clean_population(population_year) for population_year in (2010, 2020))

# Stack both years into a single dataframe with a fresh index
population_df = pd.concat((pop_2010, pop_2020), ignore_index=True)

population_df

In [None]:
# North Side: long-format ZHVI for the North Side neighborhoods,
# with the Date column guaranteed to be datetime
reshaped_north_side_df = reshape_dates(north_side_df).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Combined view: a single line summarising all North Side neighborhoods
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=reshaped_north_side_df, x='Date', y='ZHVI', ax=ax)
ax.set_title("ZHVI Value Over Time for North Side (Combined)")
ax.set_xlabel("Date")
ax.set_ylabel("ZHVI")
plt.show()

# 1. ZHVI over time, one line per North Side neighborhood
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(x='Date', y='ZHVI', hue='RegionName', data=reshaped_north_side_df, ax=ax)
ax.set_title('ZHVI Over Time for North Side Regions')
ax.set_xlabel('Date')
ax.set_ylabel('ZHVI')
ax.legend(title='Region')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 2. Average ZHVI per neighborhood (bar chart)
average_zhvi_by_region = (
    reshaped_north_side_df
    .groupby('RegionName')['ZHVI']
    .mean()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='RegionName', y='ZHVI', data=average_zhvi_by_region, palette='viridis', ax=ax)
ax.set_title('Average ZHVI by Region')
ax.set_xlabel('Region')
ax.set_ylabel('Average ZHVI')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 3. Spread of ZHVI per neighborhood (box plot)
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='RegionName', y='ZHVI', data=reshaped_north_side_df, palette='pastel', ax=ax)
ax.set_title('ZHVI Distribution by Region')
ax.set_xlabel('Region')
ax.set_ylabel('ZHVI')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

ZHVI Value Over Time for North Side (Combined) - This visualization provides a broad view of the overall Zillow Home Value Index (ZHVI) trends across all neighborhoods in the North Side, giving a quick overview of how property values have evolved in this region as a whole.

ZHVI Over Time for North Side Regions - By breaking down ZHVI trends for individual neighborhoods within the North Side, this graph allows for a more detailed comparison, helping identify which neighborhoods have higher growth rates or stability over time.

Average ZHVI by Region - Bar Chart - This bar chart highlights the average ZHVI across North Side neighborhoods, offering a clear comparison of property values among neighborhoods, which is useful for assessing relative affordability and investment potential.

ZHVI Distribution by Region - Box Plot - The box plot shows the range and variability of ZHVI within each neighborhood, illustrating price distribution and identifying areas with higher volatility or consistency in home values, which can indicate market stability.

In [12]:
# Keep only the Chicago rows of the Zillow frame and write them out for the modelling cells
is_chicago = Neighborhoods_zillow_df['City'].eq("Chicago")
Neighborhoods_zillow_df = Neighborhoods_zillow_df.loc[is_chicago]
Neighborhoods_zillow_df.to_csv('Chicago_Neighborhoods_Zillow.csv', index=False)

In [None]:

# 1. Load the dataset
df = pd.read_csv('Chicago_Neighborhoods_Zillow.csv')

# 2. Define the East Side regions
east_side_regions = [
    'Hegewisch', 'East Side', 'South Shore', 'Hyde Park', 'Kenwood'
]

# 3. Filter the DataFrame to include only the East Side regions
df = df[df['RegionName'].isin(east_side_regions)]

# 4. Reshape the data from wide to long format
# Identify non-date columns; every other column is treated as a date
non_date_cols = ['RegionID', 'SizeRank', 'RegionName', 'RegionType', 'StateName', 
                 'State', 'City', 'Metro', 'CountyName']

# Reshape the DataFrame
df_long = df.melt(id_vars=non_date_cols, var_name='Date', value_name='MedianHomeValue')

# Convert 'Date' to datetime format
df_long['Date'] = pd.to_datetime(df_long['Date'])

# 5. Handle missing values
df_long.dropna(subset=['MedianHomeValue'], inplace=True)

# 6. Feature engineering
# Extract 'Year' and 'Month' from 'Date'
df_long['Year'] = df_long['Date'].dt.year
df_long['Month'] = df_long['Date'].dt.month

# 7. Create lag features
# Sort so that shifting within a region moves back in time
df_long.sort_values(['RegionName', 'Date'], inplace=True)

# Lag_k holds the region's value k rows (months) earlier, for k = 1..12
lag_features = [f'Lag_{lag}' for lag in range(1, 13)]
for lag in range(1, 13):
    df_long[f'Lag_{lag}'] = df_long.groupby('RegionName')['MedianHomeValue'].shift(lag)

# Drop only the rows that lack a full lag history; restricting to the lag
# columns avoids discarding rows because an unrelated metadata column is empty.
df_long.dropna(subset=lag_features, inplace=True)

# 8. Encode categorical variables
# One-hot encode 'RegionType' if necessary
df_encoded = pd.get_dummies(df_long, columns=['RegionType'], drop_first=True)

# TimeSeriesSplit cuts folds by row position, so order the rows chronologically.
# Left sorted by region, each fold would train on some regions and validate on
# others instead of validating on later dates.
df_encoded = df_encoded.sort_values(['Date', 'RegionName']).reset_index(drop=True)

# 9. Define features and target variable
# Categorical features (if any)
categorical_features = [col for col in df_encoded.columns if col.startswith('RegionType_')]
# All features
features = lag_features + ['Year', 'Month'] + categorical_features

X = df_encoded[features]
y = df_encoded['MedianHomeValue']

# 10. Split the data using TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3)

# 11. Model training and selection
# Initialize the model
rf = RandomForestRegressor(random_state=42)

# Single-point grid for quicker runtime
param_grid = {
    'n_estimators': [1000],
    'max_depth': [200],
    'min_samples_split': [2]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=tscv,
                           scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X, y)

# Best estimator
best_model = grid_search.best_estimator_
print(f'Best parameters: {grid_search.best_params_}')

# 12. Evaluate the model
# Predictions on the training set (optimistic: the model has seen these rows)
y_pred = best_model.predict(X)

# Calculate RMSE 
rmse = np.sqrt(mean_squared_error(y, y_pred))
print(f'RMSE: {rmse}')

# Out-of-sample estimate: square root of the mean validation-fold MSE
cv_rmse = np.sqrt(-grid_search.best_score_)
print(f'Cross-validated RMSE: {cv_rmse}')

# 13. Forecast future values
# Predict the next 12 months
future_dates = pd.date_range(start=df_long['Date'].max() + pd.DateOffset(months=1),
                             periods=12, freq='ME')  #change periods to 24 for 2 years

# Create a DataFrame for future predictions
future_df = pd.DataFrame({'Date': future_dates})

# Prepare future predictions for each region
regions = df_long['RegionName'].unique()
future_predictions_list = []  # Use a list to collect DataFrames

for region in regions:
    # Get the last known data for the region
    region_data = df_encoded[df_encoded['RegionName'] == region].sort_values('Date')
    last_row = region_data.iloc[-1]

    # The most recent observed value is 'MedianHomeValue' from the last row
    last_value = last_row['MedianHomeValue']
    # Lags for the first forecast month: Lag_1 is the last observed value and
    # Lag_2..Lag_12 are the last row's Lag_1..Lag_11. Reusing the last row's
    # lags unchanged would feed the model inputs that are one month stale.
    lag_values = np.concatenate(
        ([last_value], last_row[lag_features[:-1]].to_numpy())
    ).astype(float)

    # Create future data for the region
    temp_df = future_df.copy()
    temp_df['RegionName'] = region
    temp_df['Year'] = temp_df['Date'].dt.year
    temp_df['Month'] = temp_df['Date'].dt.month

    # Include any encoded 'RegionType' columns
    for col in categorical_features:
        temp_df[col] = last_row[col]

    predicted_values = []

    for i in range(len(temp_df)):
        # One-row feature frame for this forecast month
        features_input = pd.DataFrame([lag_values], columns=lag_features)
        features_input['Year'] = temp_df['Year'].iloc[i]
        features_input['Month'] = temp_df['Month'].iloc[i]
        for col in categorical_features:
            features_input[col] = temp_df[col].iloc[i]
        # Ensure the columns are in the order the model was trained on
        features_input = features_input[features]
        # Predict the median home value
        predicted_value = best_model.predict(features_input)[0]
        predicted_values.append(predicted_value)
        # Shift every lag one month back and make the prediction the newest lag
        lag_values = np.roll(lag_values, 1)
        lag_values[0] = predicted_value

    # Add the predicted values to temp_df
    temp_df['PredictedMedianHomeValue'] = predicted_values

    # Collect temp_df in the list
    future_predictions_list.append(temp_df[['Date', 'RegionName', 'PredictedMedianHomeValue']])

# Concatenate all future predictions into a single DataFrame
future_predictions = pd.concat(future_predictions_list, ignore_index=True)

# 14. Visualize the predictions
# Select a region to visualize (e.g., 'East Side')
region_to_plot = 'East Side'  # Replace with an actual region name from your data

# Actual data
actual_data = df_long[df_long['RegionName'] == region_to_plot]

# Predicted data
predicted_data = future_predictions[future_predictions['RegionName'] == region_to_plot]

# Plotting
plt.figure(figsize=(14, 7))
plt.plot(actual_data['Date'], actual_data['MedianHomeValue'], label='Actual')
plt.plot(predicted_data['Date'], predicted_data['PredictedMedianHomeValue'],
         label='Predicted', linestyle='--')
plt.title(f'Median Home Value Trends for {region_to_plot}')
plt.xlabel('Date')
plt.ylabel('Median Home Value')
plt.legend()
plt.show()




This visualization is beneficial as it focuses on the East Side regions, allowing for a detailed view of historical and predicted home value trends in a specific area, which is useful for regional market analysis and targeted investment decisions.

In [None]:

# 1. Load the dataset
df = pd.read_csv('Chicago_Neighborhoods_Zillow.csv')

# 2. Reshape the data from wide to long format
# Identify non-date columns; every other column is treated as a date
non_date_cols = ['RegionID', 'SizeRank', 'RegionName', 'RegionType', 'StateName', 
                 'State', 'City', 'Metro', 'CountyName']

# Reshape the DataFrame
df_long = df.melt(id_vars=non_date_cols, var_name='Date', value_name='MedianHomeValue')

# Convert 'Date' to datetime format
df_long['Date'] = pd.to_datetime(df_long['Date'])

# 3. Handle missing values
df_long.dropna(subset=['MedianHomeValue'], inplace=True)

# 4. Aggregate the data across all regions
# Calculate the average home value over all regions for each date
aggregate_data = df_long.groupby('Date')['MedianHomeValue'].mean().reset_index()

# 5. Feature engineering
# Extract 'Year' and 'Month' from 'Date'
aggregate_data['Year'] = aggregate_data['Date'].dt.year
aggregate_data['Month'] = aggregate_data['Date'].dt.month

# 6. Create lag features
# Sort chronologically so shifting moves back in time
aggregate_data.sort_values('Date', inplace=True)

# Lag_k holds the value k rows (months) earlier, for k = 1..12
for lag in range(1, 13):
    aggregate_data[f'Lag_{lag}'] = aggregate_data['MedianHomeValue'].shift(lag)

# Drop rows with NaN values due to lagging
aggregate_data.dropna(inplace=True)

# 7. Define features and target variable
# Lag features
lag_features = [f'Lag_{lag}' for lag in range(1, 13)]
# All features
features = lag_features + ['Year', 'Month']

X = aggregate_data[features]
y = aggregate_data['MedianHomeValue']

# 8. Split the data using TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3)

# 9. Model training and selection
# Initialize the model
rf = RandomForestRegressor(random_state=42)

# Adjust hyperparameters as needed
param_grid = {
    'n_estimators': [1000],  # Adjust based on your system's capacity
    'max_depth': [200],
    'min_samples_split': [2]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=tscv,
                           scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X, y)

# Best estimator
best_model = grid_search.best_estimator_
print(f'Best parameters: {grid_search.best_params_}')

# 10. Evaluate the model
# Predictions on the training set (optimistic: the model has seen these rows)
y_pred = best_model.predict(X)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y, y_pred))
print(f'RMSE: {rmse}')

# Out-of-sample estimate: square root of the mean validation-fold MSE
cv_rmse = np.sqrt(-grid_search.best_score_)
print(f'Cross-validated RMSE: {cv_rmse}')

# 11. Forecast future values
# Predict the next 12 months
future_dates = pd.date_range(start=aggregate_data['Date'].max() + pd.DateOffset(months=1),
                             periods=12, freq='ME')  # Change periods to 24 for 2 years if needed

# Create a DataFrame for future predictions
future_df = pd.DataFrame({'Date': future_dates})
future_df['Year'] = future_df['Date'].dt.year
future_df['Month'] = future_df['Date'].dt.month

# Lags for the first forecast month: Lag_1 is the last observed value and
# Lag_2..Lag_12 are the last row's Lag_1..Lag_11. Reusing the last row's lags
# unchanged would feed the model inputs that are one month stale.
last_row = aggregate_data.iloc[-1]
lag_values = np.concatenate(
    ([last_row['MedianHomeValue']], last_row[lag_features[:-1]].to_numpy())
).astype(float)

predicted_values = []

for i in range(len(future_df)):
    # One-row feature frame for this forecast month
    features_input = pd.DataFrame([lag_values], columns=lag_features)
    features_input['Year'] = future_df['Year'].iloc[i]
    features_input['Month'] = future_df['Month'].iloc[i]
    # Ensure the columns are in the order the model was trained on
    features_input = features_input[features]
    # Predict the home value
    predicted_value = best_model.predict(features_input)[0]
    predicted_values.append(predicted_value)
    # Shift every lag one month back and make the prediction the newest lag
    lag_values = np.roll(lag_values, 1)
    lag_values[0] = predicted_value

# Add the predicted values to future_df
future_df['PredictedMedianHomeValue'] = predicted_values

# 12. Visualize the predictions
# Plotting
plt.figure(figsize=(14, 7))
plt.plot(aggregate_data['Date'], aggregate_data['MedianHomeValue'], label='Actual')
plt.plot(future_df['Date'], future_df['PredictedMedianHomeValue'],
         label='Predicted', linestyle='--')
plt.title('Median Home Value Trends for Chicago')
plt.xlabel('Date')
plt.ylabel('Median Home Value')
plt.legend()
plt.show()


This graph is valuable because it displays both the historical trends in median home values for Chicago and the model’s future predictions, providing a clear visual representation of past market behavior and expected future trends, which is useful for understanding housing market dynamics and informing investment decisions.

<h2>ML Analysis</h2>


In the second graph, titled __Median Home Value Trends for Chicago__, the line shows the home value averaged across all Chicago neighborhoods from around 2001 to 2024, followed by the model's predictions of future home values. The model predicts that the home value in Chicago will remain fairly constant, with no noticeable increase or decrease.


However, in the first graph, titled __Median Home Value Trends for East Side__, we perform the same ML analysis on just the East Side neighborhood of Chicago. The model predicts a significant increase in the median home value for houses in the East Side. Given that the median home value across Chicago neighborhoods is projected to remain fairly constant, we can assume that the East Side will far outperform the other neighborhoods, since its projected increase is significant compared to this baseline.

    •    Data cleaning: show clearly how you cleaned your data.

We collected data from Zillow for home value estimations and Census data for data about the people in each Chicago ZIP Code. To clean the Census and ZIP-level population data, we had to transpose the data frames so that each ZIP Code became a row instead of a column. Then we had to remove the columns that weren’t total estimates. The original data set had nested data, so we had to rename the columns to add the top-level column name to the lower levels so that each column had a unique name. Then we iterated through the columns and converted them to numeric data types based on whether the data was a normal number or a percentage. Next we added a Year column to each dataframe and combined all the dataframes into one, dropping any column that would stop the data frames from being concatenated.


    •    Exploratory data analysis: explain what your data looks like (words are fine, but visualizations are often better). Include any interesting issues or preliminary conclusions you have about your data.

The Zillow data is organized by neighborhood and ZHVI, which is Zillow's home value estimate. The Census data is much broader and can be used in a multitude of ways. Some of the most notable fields are age, educational attainment, income level, and household size. With this data we were able to assess what types of people are congregating in which areas and how house values correlate with the people in each ZIP Code. We concluded that higher-income residents with more education live in areas with higher average ZHVI. One issue that we will have to solve is accurately mapping the ZIP Codes to the neighborhood names that the Zillow dataset uses. Once we do that, it will be much easier to do accurate data analysis combining census and ZHVI data.

## Peer Assessment

### Contributions and Roles

- **Adam Nimer**: Spearheaded the EDA. Added Visualizations and justifications. **10/10**

- **Daniel Mroz**: Performed Analysis of ML model and provided justification for results provided by the model. **10/10**

- **Abdullah Ali**: Data Collection and Cleaning as well as validation of data sources. **10/10**

- **Umar Ahmed Khan**: Data Cleaning, initial setup of data for EDA. **10/10**

- **Bilal Naseer**: ML Model setup and execution. **10/10**

- **Suraj Pillarisetti**: Did not contribute any commits to the GitHub repository or provide any insights on project goals. Regurgitated information already presented by Adam. **1/10**