In [1]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

file_path = "Civil_Engineering_Regression_Dataset.csv"
df = pd.read_csv(file_path)

X = df[['Building_Height', 'Material_Quality_Index', 'Labor_Cost', 'Concrete_Strength', 'Foundation_Depth']]
y = df['Construction_Cost']

X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

regression_coefficients = model.params
highest_impact_variable = regression_coefficients[1:].abs().idxmax()

# Compute R-squared and Adjusted R-squared for multiple regression
r_squared_multiple = model.rsquared
adjusted_r_squared_multiple = model.rsquared_adj

# Fit a simple linear regression model using only 'Building_Height'
X_simple = sm.add_constant(df[['Building_Height']])
model_simple = sm.OLS(y, X_simple).fit()
r_squared_simple = model_simple.rsquared

# Calculate VIF for each independent variable
vif_data = pd.DataFrame()
vif_data["Variable"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Model Interpretation & Conclusion
summary = {
    "Key Takeaways": {
        "R-squared Comparison": (r_squared_simple, r_squared_multiple),
        "Adjusted R-squared": adjusted_r_squared_multiple,
        "Highest Impact Variable": highest_impact_variable,
        "Multicollinearity (VIF)": vif_data.to_dict()
    },
    "Usage in Construction": "Regression analysis helps estimate costs by identifying key cost drivers, improving budgeting accuracy, and optimizing resource allocation.",
    "Limitations": "Potential data quality issues, multicollinearity, and omitted variable bias could impact accuracy.",
    "Improvements": "Including variables like location, labor experience, weather conditions, and material supply chain factors could enhance the model.",
    "Conclusion": "Regression analysis is a powerful tool in civil engineering, allowing companies to optimize construction costs and improve project planning through data-driven decision-making."
}

print("Model Summary & Conclusion:")
print(summary)

Model Summary & Conclusion:
{'Key Takeaways': {'R-squared Comparison': (0.9154177373112963, 0.9997946519351985), 'Adjusted R-squared': 0.9997837291657942, 'Highest Impact Variable': 'Building_Height', 'Multicollinearity (VIF)': {'Variable': {0: 'const', 1: 'Building_Height', 2: 'Material_Quality_Index', 3: 'Labor_Cost', 4: 'Concrete_Strength', 5: 'Foundation_Depth'}, 'VIF': {0: 36.2172436051685, 1: 1.0471642393113527, 2: 1.0480671139395974, 3: 1.0540858954504875, 4: 1.019701324974831, 5: 1.0405938701183752}}}, 'Usage in Construction': 'Regression analysis helps estimate costs by identifying key cost drivers, improving budgeting accuracy, and optimizing resource allocation.', 'Limitations': 'Potential data quality issues, multicollinearity, and omitted variable bias could impact accuracy.', 'Improvements': 'Including variables like location, labor experience, weather conditions, and material supply chain factors could enhance the model.', 'Conclusion': 'Regression analysis is a powerful