In [27]:
import pandas as pd

# Read the New York housing listings into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
data_path = 'NY-House-Dataset.csv'
ny_houses = pd.read_csv(filepath_or_buffer=data_path)


In [29]:
# Report how many values are missing in each column
print(ny_houses.isna().sum())

# Numeric columns (int64 / float64): fill gaps with the per-column median
numeric_cols = ny_houses.select_dtypes(include=['int64', 'float64']).columns
numeric_medians = ny_houses[numeric_cols].median()
ny_houses[numeric_cols] = ny_houses[numeric_cols].fillna(numeric_medians)

# Text (object) columns: fill gaps with the most frequent value of each column.
# mode() returns a frame; its first row is used as the fill value per column.
categorical_cols = ny_houses.select_dtypes(include=['object']).columns
most_common_values = ny_houses[categorical_cols].mode().iloc[0]
ny_houses[categorical_cols] = ny_houses[categorical_cols].fillna(most_common_values)


BROKERTITLE                    0
TYPE                           0
PRICE                          0
BEDS                           0
BATH                           0
PROPERTYSQFT                   0
ADDRESS                        0
STATE                          0
MAIN_ADDRESS                   0
ADMINISTRATIVE_AREA_LEVEL_2    0
LOCALITY                       0
SUBLOCALITY                    0
STREET_NAME                    0
LONG_NAME                      0
FORMATTED_ADDRESS              0
LATITUDE                       0
LONGITUDE                      0
dtype: int64


In [30]:
from sklearn.model_selection import train_test_split

# Regression target is the listing price. Features are every other column except
# the three address fields; the remaining text columns (e.g. TYPE, STATE)
# are still present in X at this point and are encoded later.
y = ny_houses['PRICE']
X = ny_houses.drop(columns=['PRICE', 'ADDRESS', 'FORMATTED_ADDRESS', 'MAIN_ADDRESS'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [33]:
import numpy as np  # needed for np.inf / np.nan below; was previously used without being imported
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# One-hot encode every column that is not int64/float64.
# NOTE: this cell overwrites X_train / X_test with their encoded versions, so
# re-run the train/test split cell before re-running this one.
non_numeric_cols = X_train.select_dtypes(exclude=['int64', 'float64']).columns
X_train = pd.get_dummies(X_train, columns=non_numeric_cols)
X_test = pd.get_dummies(X_test, columns=non_numeric_cols)

# Give the test set exactly the training columns: categories seen only in the
# test set are dropped, categories missing from it are added as all-zero columns.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Treat +/-inf as missing, then impute with the *training* medians so that no
# statistic computed from the test set leaks into preprocessing.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(train_medians)

# Linear Regression Model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

# Decision Tree Regression Model
# Seeded like the other estimators in this notebook so results are repeatable.
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

# Evaluation of Regression Models on the held-out test set
print("Linear Regression MSE:", mean_squared_error(y_test, y_pred_linear))
print("Decision Tree MSE:", mean_squared_error(y_test, y_pred_tree))
print("Linear Regression R²:", r2_score(y_test, y_pred_linear))
print("Decision Tree R²:", r2_score(y_test, y_pred_tree))


Linear Regression MSE: 46055228021336.02
Decision Tree MSE: 11941737339126.959
Linear Regression R²: -0.8279631205571871
Decision Tree R²: 0.5260243757518247


In [49]:
# Conclusion:
# The Decision Tree model outperforms the Linear Regression model on both MSE and R² on the held-out test set.
# The Linear Regression R² is negative (about -0.83), which means it predicts worse than always guessing the mean price.
# Recommendations for improving the Linear Regression model include checking the assumptions of linear regression,
# applying data transformations, and considering more complex models or feature engineering to capture non-linear relationships.

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# NOTE(review): y_class_train / y_class_test are not defined in any cell shown
# in this notebook — presumably binary labels derived from PRICE; confirm and
# restore the defining cell so the notebook survives Restart & Run All.

# Random Forest: 100 trees, seeded for repeatability; fit() returns the fitted estimator
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_class_train
)
y_pred_forest = random_forest_model.predict(X_test)

print("Random Forest Classifier Report:")
print(classification_report(y_class_test, y_pred_forest))

# Support Vector Machine with a linear kernel (switch to kernel='rbf' for a non-linear boundary)
svm_model = SVC(kernel='linear').fit(X_train, y_class_train)
y_pred_svm = svm_model.predict(X_test)

print("Support Vector Machine Classifier Report:")
print(classification_report(y_class_test, y_pred_svm))


Random Forest Classifier Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.90       489
           1       0.89      0.90      0.90       472

    accuracy                           0.90       961
   macro avg       0.90      0.90      0.90       961
weighted avg       0.90      0.90      0.90       961

Support Vector Machine Classifier Report:
              precision    recall  f1-score   support

           0       0.87      0.84      0.86       489
           1       0.84      0.88      0.86       472

    accuracy                           0.86       961
   macro avg       0.86      0.86      0.86       961
weighted avg       0.86      0.86      0.86       961



In [50]:
# Based on the classification reports provided above, the Random Forest Classifier shows superior performance
# compared to the Support Vector Machine. With an accuracy and F1-score of 0.90 compared to 0.86 for the SVM,
# the Random Forest model not only predicts more accurately but also balances the precision and recall better.
# This makes it the preferred model for scenarios where both identifying the positive class accurately and
# ensuring the comprehensive identification of positive cases are crucial.


In [46]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Build the clustering inputs from X without any CLUSTER column left behind by
# a previous run of this cell; otherwise re-running would feed the old cluster
# labels back in as a feature. On a first run this is simply X.
cluster_features = X.drop(columns=['CLUSTER'], errors='ignore')

# Identify categorical columns and convert them to dummy variables,
# so that X_encoded contains only numeric/indicator data for clustering.
categorical_cols = cluster_features.select_dtypes(include=['object']).columns
X_encoded = pd.get_dummies(cluster_features, columns=categorical_cols)

# Clustering with k-means using the full encoded data
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(X_encoded)

# Adding cluster labels to the DataFrame
X['CLUSTER'] = clusters

# Displaying some of the data to verify clustering results
print(X.head())

# Reducing the dimensionality to 2 components.
# random_state only matters when PCA picks a randomized solver; it is set so
# the projection (and the clusters below) are repeatable either way.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_encoded)

# Applying k-means on the reduced data
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(X_pca)

# Overwrites the labels assigned above: after this cell, X['CLUSTER'] holds
# the PCA-based clustering.
X['CLUSTER'] = clusters


                                         BROKERTITLE                TYPE  \
0        Brokered by Douglas Elliman  -111 Fifth Ave      Condo for sale   
1                                Brokered by Serhant      Condo for sale   
2                             Brokered by Sowae Corp      House for sale   
3                                Brokered by COMPASS      Condo for sale   
4  Brokered by Sotheby's International Realty - E...  Townhouse for sale   

   BEDS       BATH  PROPERTYSQFT                    STATE  \
0     2   2.000000        1400.0       New York, NY 10022   
1     7  10.000000       17545.0       New York, NY 10019   
2     4   2.000000        2015.0  Staten Island, NY 10312   
3     3   1.000000         445.0      Manhattan, NY 10022   
4     7   2.373861       14175.0       New York, NY 10065   

  ADMINISTRATIVE_AREA_LEVEL_2  LOCALITY      SUBLOCALITY       STREET_NAME  \
0             New York County  New York        Manhattan  East 55th Street   
1               Unit