In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

# Load the CSV file into a DataFrame
home_prices_df = pd.read_csv("Metro_US_All_Home_Prices.csv")

# Create a new column "Metro" by extracting the city name from "Region Name"
# We split the string in "Region Name" at the comma and take the first part (city)
home_prices_df["Metro"] = home_prices_df["RegionName"].str.split(',').str[0]

# Display the updated DataFrame to verify the new "Metro" column
home_prices_df.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,2000-01-31,2000-02-29,2000-03-31,2000-04-30,2000-05-31,...,2023-12-31,2024-01-31,2024-02-29,2024-03-31,2024-04-30,2024-05-31,2024-06-30,2024-07-31,2024-08-31,Metro
0,102001,0,United States,country,,122710.838539,122926.726242,123194.458782,123768.692526,124429.566719,...,354242.717001,354677.065211,355473.042348,357021.050068,358884.957514,360324.712897,360888.668935,361037.327498,361281.717048,United States
1,394913,1,"New York, NY",msa,NY,218233.216724,219160.584763,220096.541902,221993.153236,223957.159952,...,639265.650987,642178.317205,646255.694516,651704.39572,657863.854229,662561.428749,665667.764258,668437.168324,672210.970325,New York
2,753899,2,"Los Angeles, CA",msa,CA,222104.914025,222931.426109,224032.332323,226222.765868,228618.413461,...,926554.539271,926708.887683,923740.986284,922618.466845,926105.99673,932034.718291,936012.934083,940417.087163,945635.978294,Los Angeles
3,394463,3,"Chicago, IL",msa,IL,153956.315498,154098.834197,154370.26357,155045.512395,155857.755512,...,311913.816691,312578.14075,313972.158826,316395.381263,319279.856054,321409.733463,322638.67466,323378.146509,324304.487366,Chicago
4,394514,4,"Dallas, TX",msa,TX,126111.301852,126167.472266,126232.045908,126399.736872,126620.832817,...,372112.028509,372133.731174,372616.126553,373774.335129,374743.310085,374887.158218,374222.635105,373283.837228,372632.458458,Dallas


In [5]:
commute_df = pd.read_csv("Commute_Time_By_Top_25_Metro_Area_All_Years__Three Buckets_Percentage.csv")
# Drop rows in the home_prices_df where "Metro" doesn't match with any city in "Metro" column of commute_df
filtered_home_prices_df = home_prices_df[home_prices_df["Metro"].isin(commute_df["Metro"])]
# Display the filtered DataFrame to verify the rows have been dropped correctly
filtered_home_prices_df.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,2000-01-31,2000-02-29,2000-03-31,2000-04-30,2000-05-31,...,2023-12-31,2024-01-31,2024-02-29,2024-03-31,2024-04-30,2024-05-31,2024-06-30,2024-07-31,2024-08-31,Metro
1,394913,1,"New York, NY",msa,NY,218233.216724,219160.584763,220096.541902,221993.153236,223957.159952,...,639265.650987,642178.317205,646255.694516,651704.39572,657863.854229,662561.428749,665667.764258,668437.168324,672210.970325,New York
2,753899,2,"Los Angeles, CA",msa,CA,222104.914025,222931.426109,224032.332323,226222.765868,228618.413461,...,926554.539271,926708.887683,923740.986284,922618.466845,926105.99673,932034.718291,936012.934083,940417.087163,945635.978294,Los Angeles
3,394463,3,"Chicago, IL",msa,IL,153956.315498,154098.834197,154370.26357,155045.512395,155857.755512,...,311913.816691,312578.14075,313972.158826,316395.381263,319279.856054,321409.733463,322638.67466,323378.146509,324304.487366,Chicago
4,394514,4,"Dallas, TX",msa,TX,126111.301852,126167.472266,126232.045908,126399.736872,126620.832817,...,372112.028509,372133.731174,372616.126553,373774.335129,374743.310085,374887.158218,374222.635105,373283.837228,372632.458458,Dallas
5,394692,5,"Houston, TX",msa,TX,121737.010654,121759.686522,121674.688937,121725.49546,121772.36285,...,304832.859556,304826.840219,305204.501474,306189.659687,307208.643453,307743.352455,307625.056921,307261.595181,307043.671704,Houston


In [6]:
commute_df = pd.read_csv("Commute_Time_By_Top_25_Metro_Area_All_Years__Three Buckets_Percentage.csv")
# Drop rows in the home_prices_df where "Metro City" doesn't match with any city in "Metro" column of commute_df
filtered_home_prices_df = home_prices_df[home_prices_df["Metro"].isin(commute_df["Metro"])]

# Display the filtered DataFrame to verify the rows have been dropped correctly
filtered_home_prices_df

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,2000-01-31,2000-02-29,2000-03-31,2000-04-30,2000-05-31,...,2023-12-31,2024-01-31,2024-02-29,2024-03-31,2024-04-30,2024-05-31,2024-06-30,2024-07-31,2024-08-31,Metro
1,394913,1,"New York, NY",msa,NY,218233.216724,219160.584763,220096.541902,221993.153236,223957.159952,...,639265.7,642178.3,646255.7,651704.4,657863.9,662561.4,665667.8,668437.2,672211.0,New York
2,753899,2,"Los Angeles, CA",msa,CA,222104.914025,222931.426109,224032.332323,226222.765868,228618.413461,...,926554.5,926708.9,923741.0,922618.5,926106.0,932034.7,936012.9,940417.1,945636.0,Los Angeles
3,394463,3,"Chicago, IL",msa,IL,153956.315498,154098.834197,154370.26357,155045.512395,155857.755512,...,311913.8,312578.1,313972.2,316395.4,319279.9,321409.7,322638.7,323378.1,324304.5,Chicago
4,394514,4,"Dallas, TX",msa,TX,126111.301852,126167.472266,126232.045908,126399.736872,126620.832817,...,372112.0,372133.7,372616.1,373774.3,374743.3,374887.2,374222.6,373283.8,372632.5,Dallas
5,394692,5,"Houston, TX",msa,TX,121737.010654,121759.686522,121674.688937,121725.49546,121772.36285,...,304832.9,304826.8,305204.5,306189.7,307208.6,307743.4,307625.1,307261.6,307043.7,Houston
6,395209,6,"Washington, DC",msa,VA,180842.151942,180989.190625,181260.19265,181873.81913,182806.635695,...,550163.4,550587.0,552037.8,555090.4,559298.1,562754.7,564449.5,565015.7,565538.4,Washington
7,394974,7,"Philadelphia, PA",msa,PA,122513.075912,122785.376616,122971.211007,123429.776479,123873.533424,...,353067.3,354281.0,355843.5,358186.4,360942.5,363008.9,363937.8,364323.1,365052.2,Philadelphia
8,394856,8,"Miami, FL",msa,FL,112841.961497,113151.736319,113479.771936,114142.475317,114783.78124,...,478092.2,480478.4,482415.2,484565.1,486547.3,488237.7,488803.7,488994.7,489014.6,Miami
9,394347,9,"Atlanta, GA",msa,GA,145114.91365,145449.378651,145843.440336,146681.566032,147617.899469,...,377067.6,377615.0,378542.6,380073.1,381843.0,382984.5,383207.3,383004.3,382861.4,Atlanta
10,394404,10,"Boston, MA",msa,MA,216568.526977,217441.540329,218396.691528,220159.489052,222052.850426,...,669611.4,671597.0,674285.7,678926.7,684945.7,690059.8,692789.4,694389.5,696074.5,Boston


In [7]:
# Step 1: Identify columns that contain date information
date_columns = filtered_home_prices_df.filter(regex=r'^\d{4}-\d{2}-\d{2}$').columns

# Step 2: Create a new DataFrame to store yearly averages
# Extract the year from the date columns
filtered_home_prices_df_yearly_avg = filtered_home_prices_df.copy()

# Loop through each year from the date columns
for year in range(2000, 2025):  # From 2000 to 2024
    # Select all columns that match the current year
    year_columns = [col for col in date_columns if col.startswith(str(year))]
    
    # Calculate the average for that year across the matching columns
    filtered_home_prices_df_yearly_avg[str(year)] = filtered_home_prices_df[year_columns].mean(axis=1)

# Drop the original date columns
filtered_home_prices_df_yearly_avg = filtered_home_prices_df_yearly_avg.drop(columns=date_columns)

# Display the updated DataFrame with yearly averages
filtered_home_prices_df_yearly_avg.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,Metro,2000,2001,2002,2003,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
1,394913,1,"New York, NY",msa,NY,New York,228236.333677,256591.123039,289406.051521,332275.171124,...,417767.27805,436687.392137,458570.405676,481006.500113,497671.075482,514606.039175,562982.333172,609970.348571,623831.364522,658359.949166
2,753899,2,"Los Angeles, CA",msa,CA,Los Angeles,232411.457365,257442.579993,291677.819819,346346.762012,...,507705.187043,527084.898482,573910.05907,633907.170969,649673.994925,683222.738515,778279.042016,882911.563806,881706.948498,931659.381922
3,394463,3,"Chicago, IL",msa,IL,Chicago,158276.964195,171890.36831,185654.507685,199856.902187,...,200373.019284,210856.095856,221825.942265,232372.941507,238138.587799,244847.236934,270610.015172,294727.81717,303240.128095,319244.572361
4,394514,4,"Dallas, TX",msa,TX,Dallas,127272.724323,129451.749818,132613.591398,136409.226463,...,182117.86722,203734.502389,224890.79438,243209.782563,251633.656933,262748.133755,304406.673419,369235.098967,371281.8261,373536.698994
5,394692,5,"Houston, TX",msa,TX,Houston,122403.123483,124028.932793,127026.323361,131259.404075,...,183534.022383,194909.774814,202110.803554,210371.454699,218684.752642,227258.825353,256617.624359,300703.437909,304271.847022,306637.915137


In [8]:
# Select the columns representing years between 2010 and 2024 (inclusive)
columns_to_keep = [str(year) for year in range(2010, 2025)]  # This creates a list of year columns from 2010 to 2024

# Keep only the required year columns and any non-year columns (like 'Metro', etc.)
# Assuming there are other necessary columns (e.g., 'Metro'), we include those as well
necessary_columns = ['Metro']  # Adjust this list to include any non-year columns you want to keep

# Filter the DataFrame to keep only the necessary and year columns
filtered_home_prices_df_yearly_avg = filtered_home_prices_df_yearly_avg[necessary_columns + columns_to_keep]

# Display the updated DataFrame
filtered_home_prices_df_yearly_avg.head()

Unnamed: 0,Metro,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
1,New York,396589.64763,384564.909211,375518.290765,384470.975667,404313.654609,417767.27805,436687.392137,458570.405676,481006.500113,497671.075482,514606.039175,562982.333172,609970.348571,623831.364522,658359.949166
2,Los Angeles,404119.91661,381628.22517,372390.499355,430129.125075,483187.074509,507705.187043,527084.898482,573910.05907,633907.170969,649673.994925,683222.738515,778279.042016,882911.563806,881706.948498,931659.381922
3,Chicago,197308.543137,179647.50149,167993.290058,175350.97222,192030.788324,200373.019284,210856.095856,221825.942265,232372.941507,238138.587799,244847.236934,270610.015172,294727.81717,303240.128095,319244.572361
4,Dallas,147029.550825,141360.812372,142964.04891,153243.2814,166007.231691,182117.86722,203734.502389,224890.79438,243209.782563,251633.656933,262748.133755,304406.673419,369235.098967,371281.8261,373536.698994
5,Houston,148284.112874,142123.906743,140945.029754,149056.617277,165669.306473,183534.022383,194909.774814,202110.803554,210371.454699,218684.752642,227258.825353,256617.624359,300703.437909,304271.847022,306637.915137


In [14]:
Home_Prices_Adjusted_df = pd.melt(filtered_home_prices_df_yearly_avg, id_vars=['Metro'], var_name='Year', value_name='Value')
Home_Prices_Adjusted_df = Home_Prices_Adjusted_df.sort_values(by=['Metro', 'Year']).reset_index(drop=True)
Home_Prices_Adjusted_df.head(30)


Unnamed: 0,Metro,Year,Value
0,Atlanta,2010,148689.10017
1,Atlanta,2011,131296.644196
2,Atlanta,2012,127172.80665
3,Atlanta,2013,141549.412391
4,Atlanta,2014,162089.948743
5,Atlanta,2015,174951.646311
6,Atlanta,2016,189197.874487
7,Atlanta,2017,204153.911849
8,Atlanta,2018,222524.201892
9,Atlanta,2019,237693.585081


In [20]:
import pandas as pd

# Assuming Home_Prices_Adjusted_df is already loaded
# If not, you'd need to load it first

# Sort the dataframe by Metro and Year
Home_Prices_Adjusted_df = Home_Prices_Adjusted_df.sort_values(['Metro', 'Year'])

# Group by Metro and calculate percent change
Change_In_Home_Prices_Adjusted_df = Home_Prices_Adjusted_df.groupby('Metro', group_keys=False).apply(
    lambda x: x.assign(
        PercentChange=x['Value'].pct_change().shift(-1) * 100
    )
).reset_index(drop=True)

# Format PercentChange to 5 decimal places
Change_In_Home_Prices_Adjusted_df['PercentChange'] = Change_In_Home_Prices_Adjusted_df['PercentChange'].apply(lambda x: f"{x:.5f}" if pd.notnull(x) else x)

# Remove the last year (2014) as it doesn't have a next year to compare to
Change_In_Home_Prices_Adjusted_df = Change_In_Home_Prices_Adjusted_df[Change_In_Home_Prices_Adjusted_df['Year'] != 2014]

# Display the first few rows of the new dataframe
print(Change_In_Home_Prices_Adjusted_df.head())

     Metro  Year          Value PercentChange
0  Atlanta  2010  148689.100170     -11.69720
1  Atlanta  2011  131296.644196      -3.14086
2  Atlanta  2012  127172.806650      11.30478
3  Atlanta  2013  141549.412391      14.51121
4  Atlanta  2014  162089.948743       7.93491


  Change_In_Home_Prices_Adjusted_df = Home_Prices_Adjusted_df.groupby('Metro', group_keys=False).apply(


In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Assuming Change_In_Home_Prices_Adjusted_df and commute_times_df are already loaded

# Merge datasets
merged_data = pd.merge(Change_In_Home_Prices_Adjusted_df, commute_times_df, on=['Metro', 'Year'], how='inner')

# Ensure 'Year' is numeric and filter for years 2010 through 2023
merged_data['Year'] = pd.to_numeric(merged_data['Year'], errors='coerce')
merged_data = merged_data[(merged_data['Year'] >= 2010) & (merged_data['Year'] <= 2023)]

# Prepare features
three_bucket_features = [
    'Short Commutes - Less than half-hour (%)',
    'Medium Commutes - Half-hour to one hour (%)',
    'Long Commutes - More than one hour (%)'
]
yes_no_feature = ['Yes Commute (more than 5 minutes) (%)']

# Function to run analysis
def run_analysis(X, y, feature_name):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    return {
        'R-squared': r2,
        'MSE': mse,
        'Coefficients': dict(zip(feature_name, model.coef_)),
        'Intercept': model.intercept_
    }

# Analysis for three buckets
X_three_buckets = merged_data[three_bucket_features]
y = merged_data['PercentChange']
three_buckets_results = run_analysis(X_three_buckets, y, three_bucket_features)

# Analysis for Yes/No commute
X_yes_no = merged_data[yes_no_feature]
yes_no_results = run_analysis(X_yes_no, y, yes_no_feature)

# Print results
print("Results for Three Commute Buckets:")
print(f"R-squared: {three_buckets_results['R-squared']:.4f}")
print(f"Mean Squared Error: {three_buckets_results['MSE']:.4f}")
print("Coefficients:")
for feature, coef in three_buckets_results['Coefficients'].items():
    print(f"  {feature}: {coef:.4f}")
print(f"Intercept: {three_buckets_results['Intercept']:.4f}")

print("\nResults for Yes/No Commute:")
print(f"R-squared: {yes_no_results['R-squared']:.4f}")
print(f"Mean Squared Error: {yes_no_results['MSE']:.4f}")
print("Coefficient:")
for feature, coef in yes_no_results['Coefficients'].items():
    print(f"  {feature}: {coef:.4f}")
print(f"Intercept: {yes_no_results['Intercept']:.4f}")

Results for Three Commute Buckets:
R-squared: -0.0672
Mean Squared Error: 1469.0717
Coefficients:
  Short Commutes - Less than half-hour (%): -19897.0778
  Medium Commutes - Half-hour to one hour (%): -19894.0470
  Long Commutes - More than one hour (%): -19898.3521
Intercept: 1989627.6724

Results for Yes/No Commute:
R-squared: -0.0041
Mean Squared Error: 1382.2544
Coefficient:
  Yes Commute (more than 5 minutes) (%): 20.4775
Intercept: -1998.3434


In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans

# Assuming merged_data is already prepared as before

# Prepare features and target
features = ['Short Commutes - Less than half-hour (%)', 
            'Medium Commutes - Half-hour to one hour (%)', 
            'Long Commutes - More than one hour (%)',
            'Yes Commute (more than 5 minutes) (%)']
X = merged_data[features]
y = merged_data['PercentChange']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to evaluate model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2

# 1. Polynomial Regression
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
X_poly_train, X_poly_test = train_test_split(X_poly, test_size=0.2, random_state=42)
poly_model = LinearRegression()
poly_mse, poly_r2 = evaluate_model(poly_model, X_poly_train, X_poly_test, y_train, y_test)

# 2. Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_mse, rf_r2 = evaluate_model(rf_model, X_train, X_test, y_train, y_test)

# 3. Lasso Regression
lasso_model = Lasso(alpha=1.0)
lasso_mse, lasso_r2 = evaluate_model(lasso_model, X_train, X_test, y_train, y_test)

# 4. Ridge Regression
ridge_model = Ridge(alpha=1.0)
ridge_mse, ridge_r2 = evaluate_model(ridge_model, X_train, X_test, y_train, y_test)

# 5. Clustering + Regression
kmeans = KMeans(n_clusters=3, random_state=42)
merged_data['Cluster'] = kmeans.fit_predict(X)
cluster_results = []
for cluster in range(3):
    cluster_data = merged_data[merged_data['Cluster'] == cluster]
    X_cluster = cluster_data[features]
    y_cluster = cluster_data['PercentChange']
    X_train, X_test, y_train, y_test = train_test_split(X_cluster, y_cluster, test_size=0.2, random_state=42)
    cluster_model = LinearRegression()
    cluster_mse, cluster_r2 = evaluate_model(cluster_model, X_train, X_test, y_train, y_test)
    cluster_results.append((cluster, cluster_mse, cluster_r2))

# Print results
print("Polynomial Regression - MSE: {:.4f}, R-squared: {:.4f}".format(poly_mse, poly_r2))
print("Random Forest - MSE: {:.4f}, R-squared: {:.4f}".format(rf_mse, rf_r2))
print("Lasso Regression - MSE: {:.4f}, R-squared: {:.4f}".format(lasso_mse, lasso_r2))
print("Ridge Regression - MSE: {:.4f}, R-squared: {:.4f}".format(ridge_mse, ridge_r2))
print("Clustering + Regression Results:")
for cluster, mse, r2 in cluster_results:
    print("Cluster {}: MSE: {:.4f}, R-squared: {:.4f}".format(cluster, mse, r2))

# Feature importance for Random Forest
feature_importance = pd.DataFrame({'feature': features, 'importance': rf_model.feature_importances_})
print("\nRandom Forest Feature Importance:")
print(feature_importance.sort_values('importance', ascending=False))



Polynomial Regression - MSE: 1422.4838, R-squared: -0.0333
Random Forest - MSE: 3472.7205, R-squared: -1.5227
Lasso Regression - MSE: 1435.4495, R-squared: -0.0428
Ridge Regression - MSE: 1432.2625, R-squared: -0.0404
Clustering + Regression Results:
Cluster 0: MSE: 8518.6208, R-squared: -0.0123
Cluster 1: MSE: 58.1880, R-squared: -0.4406
Cluster 2: MSE: 7.0388, R-squared: -0.5436

Random Forest Feature Importance:
                                       feature  importance
3        Yes Commute (more than 5 minutes) (%)    0.340030
2       Long Commutes - More than one hour (%)    0.278600
1  Medium Commutes - Half-hour to one hour (%)    0.232509
0     Short Commutes - Less than half-hour (%)    0.148861
