# ML Application: Predicting Real Estate Sales in Connecticut

---

## Dependencies and Loading

In [1]:
# Import the modules
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [2]:
# Load the real estate sales Excel workbook from the Data folder into a Pandas DataFrame
file_path = Path("Data/real_estate_sales_2020.xlsx")
re_sales_df = pd.read_excel(io=file_path)

# Preview the first five rows of the DataFrame
re_sales_df.head(n=5)

Unnamed: 0,Serial Number,List Year,Date Recorded,Town,Address,Assessed Value,Sale Amount,Sales Ratio,Property Type,Residential Type,Non Use Code,Assessor Remarks,OPM remarks,Location
0,2020177,2020,2021-04-14,Ansonia,323 BEAVER ST,133000,248400.0,0.5354,Residential,Single Family,,,,POINT (-73.06822 41.35014)
1,2020225,2020,2021-05-26,Ansonia,152 JACKSON ST,110500,239900.0,0.4606,Residential,Three Family,,,,
2,2020348,2020,2021-09-13,Ansonia,230 WAKELEE AVE,150500,325000.0,0.463,Commercial,,,,,
3,2020090,2020,2020-12-14,Ansonia,57 PLATT ST,127400,202500.0,0.6291,Residential,Two Family,,,,
4,200500,2020,2021-09-07,Avon,245 NEW ROAD,217640,400000.0,0.5441,Residential,Single Family,,,,


## Preprocessing

In [None]:
# TODO: placeholder cell — no code has been written for these steps yet.
# Obtain zipcodes for each address using GeoPy

# Add the zipcodes to re_sales_df as a new column
# NOTE(review): the one-hot encoding and column-drop cells further down expect a
# zipcode column; keep the column name used there in sync with the one added here.


In [6]:
# Inspect the category frequencies (NaN included) for both type columns
for type_column in ('Property Type', 'Residential Type'):
    print(re_sales_df[type_column].value_counts(dropna=False))

Residential       60728
Vacant Land        3163
Commercial         1981
Apartments          486
Industrial          228
Public Utility        5
NaN                   1
Name: Property Type, dtype: int64
Single Family    43404
Condo            12360
NaN               5864
Two Family        3201
Three Family      1495
Four Family        268
Name: Residential Type, dtype: int64


In [None]:
# Drop columns that will not be used as model features.
# DataFrame.drop returns a new frame rather than modifying the original, so the
# result must be assigned back; otherwise the columns are still present downstream.
unused_columns = [
    'Serial Number', 'List Year', 'Date Recorded', 'Address', 'Assessed Value',
    'Sales Ratio', 'Non Use Code', 'Assessor Remarks', 'OPM remarks', 'Location',
]
# errors="ignore" lets the cell be re-run after the columns are already gone.
re_sales_df = re_sales_df.drop(columns=unused_columns, errors="ignore")

# Confirm which columns remain
re_sales_df.head()

In [None]:
# Remove rows that are missing a value in any column the model relies on.
# dropna returns a new frame, so the result is assigned back. The subset is spelled
# out so that unrelated columns (e.g. the remark columns, which are empty in the
# preview rows) cannot remove rows if they are still present in the frame.
required_columns = ['Town', 'Sale Amount', 'Property Type', 'Residential Type']
re_sales_df = re_sales_df.dropna(subset=required_columns)
# NOTE(review): in the counts printed earlier, Residential Type has 5,864 NaN values,
# which equals the number of rows whose Property Type is not "Residential" — so this
# step probably removes every non-residential sale. Confirm that is intended.

# Re-review Property Type and Residential Type values
print(re_sales_df['Property Type'].value_counts(dropna=False))
print(re_sales_df['Residential Type'].value_counts(dropna=False))

# Get new row count (shape is a tuple attribute, not a method, so it is not called)
print(re_sales_df.shape)

In [None]:
# One-hot encode property types, residential types, towns, and zipcodes
property_dummies = pd.get_dummies(re_sales_df["Property Type"])
residential_dummies = pd.get_dummies(re_sales_df["Residential Type"])
town_dummies = pd.get_dummies(re_sales_df["Town"])

# The zipcode column only exists once the GeoPy step has been implemented; indexing
# re_sales_df[""] raised a KeyError. Encode zipcodes only when the column is present,
# otherwise use an empty frame on the same index so the later cells still run.
# TODO: set zipcode_column to the column name produced by the GeoPy step.
zipcode_column = "Zipcode"
if zipcode_column in re_sales_df.columns:
    # The prefix keeps the generated column names as strings even for numeric zipcodes.
    zipcode_dummies = pd.get_dummies(re_sales_df[zipcode_column], prefix=zipcode_column)
else:
    zipcode_dummies = pd.DataFrame(index=re_sales_df.index)

# Display transformed data
print(property_dummies.head())
print(residential_dummies.head())
print(town_dummies.head())
print(zipcode_dummies.head())

In [None]:
# Concatenate the one-hot encoded columns onto re_sales_df in a single pass
re_sales_df = pd.concat(
    [re_sales_df, property_dummies, residential_dummies, town_dummies, zipcode_dummies],
    axis=1,
)

# Drop the original categorical columns now that they are encoded.
# The zipcode entry used to be the empty string "", which is not a column and made
# drop raise a KeyError. errors="ignore" skips the zipcode column while it does not
# exist yet.
# TODO: keep "Zipcode" in sync with the column name produced by the GeoPy step.
encoded_source_columns = ['Property Type', 'Residential Type', 'Town', 'Zipcode']
re_sales_df = re_sales_df.drop(columns=encoded_source_columns, errors="ignore")

# Display dataframe
re_sales_df.head()

## Building

In [8]:
# Split the frame into the target (labels) and the predictors (features)
# y: the labels — the sale amount of each record
y = re_sales_df.loc[:, 'Sale Amount']

# X: the features — every column except the sale amount
X = re_sales_df.drop('Sale Amount', axis=1)

In [9]:
# Count how often each target value occurs to judge whether the labels are unbalanced
y.value_counts(dropna=True)

250000.0     835
200000.0     701
300000.0     677
225000.0     577
350000.0     571
            ... 
460786.0       1
738675.0       1
362075.0       1
3102258.0      1
7986111.0      1
Name: Sale Amount, Length: 6960, dtype: int64

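Unbalanced status: `Sale Amount` takes 6,960 distinct values, with counts ranging from 835 down to 1, so the target is heavily unbalanced when treated as a set of classes.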

In [None]:
# Import train_test_split from scikit-learn
from sklearn.model_selection import train_test_split

# Split the features and labels into training and testing sets (seeded with random_state=1)
train_test_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = train_test_parts

### If unbalanced status is Y (yes):

---


In [None]:

# Import RandomOverSampler from imbalanced-learn, plus Counter for tallying labels
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Create the random oversampler, seeded with random_state=1
ros_model = RandomOverSampler(random_state=1)

# Resample the original training split
# NOTE(review): the value counts shown earlier list 6,960 distinct Sale Amount values,
# many of them occurring once; oversampling every value up to the most frequent one
# could make the training set very large — confirm this step is wanted for this target.
X_res, y_res = ros_model.fit_resample(X=X_train, y=y_train)

In [None]:
# Tally the resampled labels and report the count for each distinct value
label_counts = Counter(y_res)
print('Resampled dataset shape %s' % label_counts)

---

In [None]:
# Import LogisticRegression from scikit-learn
from sklearn.linear_model import LogisticRegression

# Create the Logistic Regression model, seeded with random_state=1
logistic_regression_model = LogisticRegression(random_state=1)

# Train on the training split. fit returns the estimator itself, so lr_model is
# simply a second name for the same fitted model.
# NOTE(review): LogisticRegression is a classifier, while the target here is
# Sale Amount (6,960 distinct values in the counts shown earlier). A regression
# model is probably what is intended — TODO confirm.
logistic_regression_model.fit(X_train, y_train)
lr_model = logistic_regression_model

In [None]:
# Predict labels for the held-out testing features
testing_predictions = logistic_regression_model.predict(X=X_test)


## Initial Evaluation

In [None]:
# Balanced accuracy of the test-set predictions (displayed as the cell's output)
balanced_accuracy_score(y_true=y_test, y_pred=testing_predictions)

## Optimizing

## Evaluating

In [None]:
# Balanced accuracy of the test-set predictions (displayed as the cell's output)
# NOTE(review): no visible cell regenerates testing_predictions after the initial
# evaluation, so this presumably shows the same score until the Optimizing section
# is filled in.
balanced_accuracy_score(y_true=y_test, y_pred=testing_predictions)

In [None]:
# Build the confusion matrix from the true and predicted test labels, then show it
test_matrix = confusion_matrix(y_true=y_test, y_pred=testing_predictions)
print(test_matrix)

In [None]:
# Build the text classification report for the test-set predictions
testing_report = classification_report(y_true=y_test, y_pred=testing_predictions)

# Show the report
print(testing_report)

## Visuals