In [41]:
import pandas as pd
from pandas.plotting import parallel_coordinates
from math import radians, cos, sin, asin, sqrt, log  # Import required math functions
import os
import folium
from folium.plugins import HeatMap

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [29]:
# Load the dataset
raw_housing_data = pd.read_csv("data/PrimaryDataset-MLS-RentalProperties.csv")

# Display basic information about the dataset
print(raw_housing_data.shape)
raw_housing_data.info() # Get summary information about the columns and data types

# Display the first few rows of the dataset to inspect it
raw_housing_data.head()

raw_housing_data.columns

(1749, 77)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1749 entries, 0 to 1748
Data columns (total 77 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   List Number                           1749 non-null   object 
 1   Agency Phone                          1736 non-null   object 
 2   Listing Agent                         1749 non-null   object 
 3   Co-Listing Agent                      285 non-null    object 
 4   Property Type                         1749 non-null   object 
 5   Card Format                           1749 non-null   object 
 6   Book Section                          1749 non-null   object 
 7   Property Sub Type                     1749 non-null   object 
 8   Listing Contract Date                 1749 non-null   object 
 9   Back on Market Date                   78 non-null     object 
 10  Status                                1749 non-null   object 
 11  Status

Index(['List Number', 'Agency Phone', 'Listing Agent', 'Co-Listing Agent',
       'Property Type', 'Card Format', 'Book Section', 'Property Sub Type',
       'Listing Contract Date', 'Back on Market Date', 'Status',
       'Status Change Timestamp', 'Original List Price', 'List Price',
       'Living Area Source', 'MLS Area Major', 'Lock Box Number',
       'Street Direction Prefix', 'Street Direction Suffix',
       'Non-Representative Compensation Type', 'City', 'State', 'County',
       'Country', 'Postal Code', 'Stories Total', 'Stories', 'Latitude',
       'Longitude', 'GeoID', 'Living Area', 'Year Built', 'Direction Faces',
       'Lot Size Dimensions', 'Bedrooms Total', 'Bathrooms Total',
       'Bathrooms Full', 'Bathrooms Half', 'Parcel Number', 'Owner Name',
       'mod_timestamp', 'Association Fee Frequency', 'Occupant Type',
       'Listing Agreement', 'Association Fee 2 Frequency', 'Listing Service',
       'Lease Term', 'Association Fee', 'Building Area Total', 'Garage Sp

# Selecting Relevant and Numerical Columns
In this cell, we focus on selecting specific columns from the dataset for further analysis:

<!-- **Relevant Columns:** We define the relevant_columns list, which contains a broader set of columns, including both numerical and categorical data that are deemed important for our analysis. -->

**Numerical Columns**: From the relevant_columns, we further narrow down the dataset by selecting only the numerical columns for ease of analysis, such as price, number of bedrooms, bathrooms, area size, etc. These are stored in the numerical_columns list.

**Dropping Irrelevant Columns**: Using the numerical_columns, we create a new DataFrame, cleaned_housing_data, which contains only the relevant numerical columns. This step simplifies the dataset and prepares it for further processing, like model training or statistical analysis.

In [30]:
# Define the relevant columns to be used in the analysis
# relevant_columns = [ 'List Number'
#     'List Price', 'Bedrooms Total', 'Bathrooms Total', 'Living Area', 
#     'MLS Area Major', 'Year Built', 
#     'Lot Size Acres', 'Days on Market', 'Non-Representative Compensation',
#     'Waterfront YN', 'Garage YN', 'Stories Total', 'Stories', 'Bedrooms Total', 'Bathrooms Total',
#     'Bathrooms Full', 'Bathrooms Half', 'Garage YN', 'Garage Spaces', 'Original List Price', 'Latitude', 'Longitude', 'Rooms', 'Features'
# ]

# Select only the numerical columns for analysis
numerical_columns = [
    'List Number','List Price', 'Bedrooms Total', 'Bathrooms Total', 'Living Area', 'MLS Area Major', 'Year Built', 'Lot Size Acres', 'Days on Market', 'Non-Representative Compensation',
    'Stories Total', 'Stories', 'Bathrooms Full', 'Bathrooms Half', 'Garage Spaces', 'Original List Price', 'Latitude', 'Longitude'
]

# Create a new DataFrame with only the selected numerical columns
numerical_housing_data = raw_housing_data[numerical_columns]

# Cleaning the Dataset
In this cell, we perform data cleaning operations to ensure that the dataset is suitable for further analysis

In [31]:
# Drop rows with any missing values
numerical_housing_data = numerical_housing_data.dropna()

# Check for duplicate columns and remove them
numerical_housing_data = numerical_housing_data.loc[:, ~numerical_housing_data.columns.duplicated()]

# Output the dimensions of the cleaned dataset and preview its structure
print(numerical_housing_data.shape)
numerical_housing_data.info()
numerical_housing_data.head()

(340, 18)
<class 'pandas.core.frame.DataFrame'>
Index: 340 entries, 24 to 1745
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   List Number                      340 non-null    object 
 1   List Price                       340 non-null    int64  
 2   Bedrooms Total                   340 non-null    int64  
 3   Bathrooms Total                  340 non-null    int64  
 4   Living Area                      340 non-null    float64
 5   MLS Area Major                   340 non-null    int64  
 6   Year Built                       340 non-null    int64  
 7   Lot Size Acres                   340 non-null    float64
 8   Days on Market                   340 non-null    int64  
 9   Non-Representative Compensation  340 non-null    float64
 10  Stories Total                    340 non-null    float64
 11  Stories                          340 non-null    float64
 12  Bathrooms Full 

Unnamed: 0,List Number,List Price,Bedrooms Total,Bathrooms Total,Living Area,MLS Area Major,Year Built,Lot Size Acres,Days on Market,Non-Representative Compensation,Stories Total,Stories,Bathrooms Full,Bathrooms Half,Garage Spaces,Original List Price,Latitude,Longitude
24,gAAAAABm9wKy8iEF2iPj4yUG9W9aJzlue0ha1rOr4yH2hV...,1695,2,2,1230.0,26,2008,0.01,55,10.0,2.0,2.0,2,0.0,1.0,1725,30.284262,-81.454856
26,gAAAAABm9wKyZy5yVqVfOab8ncZ2KX9fPYOjfO9KuSTwOh...,2700,5,2,2498.0,14,1996,0.46,43,0.0,2.0,2.0,2,0.0,2.0,2900,30.156717,-81.629576
37,gAAAAABm9wKyO0FKSTfQtXpJt527HRb4FRUbo6r7zsshdH...,2500,5,2,1990.0,135,1980,0.27,53,100.0,1.0,1.0,2,0.0,2.0,2600,30.160414,-81.744717
39,gAAAAABm9wKyzMbRTpX1vRA4eWcmaYrxsjpy9YlV1jQyBf...,4425,3,3,2442.0,43,2012,0.1,23,1.0,2.0,2.0,2,1.0,2.0,4425,30.341041,-81.461648
40,gAAAAABm9wKyN72eWI3levdaCpsUo_0YQoDg888YszAwJH...,4500,3,2,1426.0,212,1979,0.14,21,1.0,2.0,2.0,2,0.0,1.0,4500,30.266153,-81.398813


# Saving the Cleaned Dataset
In this cell, we aim to save the cleaned data to a CSV file for future use, while ensuring that we don't overwrite an existing file unintentionally:

In [32]:
# File path to save the CSV
numerical_file_path = 'data/numerical_housing_data.csv'

# Check if the file already exists
if not os.path.exists(numerical_file_path):
    numerical_housing_data.to_csv(numerical_file_path, index=False)
    print(f"File saved as {numerical_file_path}")
else:
    print(f"File {numerical_file_path} already exists. No action taken.")


File saved as data/numerical_housing_data.csv


# Creating Engineered Features
In this cell, we create new price ratio features using the data from the numerical_housing_data DataFrame. Instead of relying solely on the list price, these ratios offer more accurate and meaningful measures of the property’s value by adjusting for specific characteristics like the number of bedrooms, bathrooms, and square footage.

In [33]:
# Add engineered price metrics to the main DataFrame
numerical_housing_data['Price per Bedroom'] = numerical_housing_data['List Price'] / numerical_housing_data['Bedrooms Total']
numerical_housing_data['Price per Full Bathroom'] = numerical_housing_data['List Price'] / numerical_housing_data['Bathrooms Full']
numerical_housing_data['Price per Total Bathroom'] = numerical_housing_data['List Price'] / numerical_housing_data['Bathrooms Total']
numerical_housing_data['Price per Story'] = numerical_housing_data['List Price'] / numerical_housing_data['Stories']
numerical_housing_data['Price per Garage Space'] = numerical_housing_data['List Price'] / numerical_housing_data['Garage Spaces']
numerical_housing_data['Price per Living Area'] = numerical_housing_data['List Price'] / numerical_housing_data['Living Area']
numerical_housing_data['Price per Lot Size Acre'] = numerical_housing_data['List Price'] / numerical_housing_data['Lot Size Acres']
numerical_housing_data['Price per Year Built'] = numerical_housing_data['List Price'] / numerical_housing_data['Year Built']

# Handle NaN and infinity values for consistency in the main DataFrame
columns_to_fix = ['Price per Bedroom', 'Price per Full Bathroom', 'Price per Total Bathroom', 
                  'Price per Story', 'Price per Garage Space', 'Price per Living Area', 
                  'Price per Lot Size Acre', 'Price per Year Built']

# Fix NaN and infinite values
for col in columns_to_fix:
    numerical_housing_data[col].replace([np.inf, -np.inf], np.nan, inplace=True)
    numerical_housing_data[col].fillna(numerical_housing_data[col].median(), inplace=True)

# Now, all price metrics are stored within numerical_housing_data, and we can proceed with other features or flags


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  numerical_housing_data[col].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  numerical_housing_data[col].fillna(numerical_housing_data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will nev

In the context of detecting fraudulent or nefarious listings, the `Under_30_Days_Flag` provides important insights into the listing behavior:
1. Rapid Turnover Can Indicate Fraud:
    Listings that are on the market for 30 days or less and then disappear could be suspicious. Scammers may list fake properties, get quick responses, and remove the listing once they've attracted victims. A high turnover rate may suggest that the listing is either underpriced to lure in victims or that the listing is fake and meant to disappear quickly.
2. Uncommon Market Behavior:
    In most real estate markets, properties tend to stay listed for more than 30 days unless they are highly desirable or misrepresented. Listings with a flag of 1 (30 days or less) can be used as a potential red flag for deeper analysis, especially when combined with other indicators like price anomalies or inconsistent agent information.

3. Behavioral Clustering:
    By using the Under_30_Days_Flag, we can cluster or categorize properties based on their time on the market. If a cluster of properties flagged as "Under 30 Days" shares other suspicious characteristics (e.g., odd pricing, proximity to multiple points of interest), it strengthens the likelihood that the listings are fraudulent.

In [34]:
# Add the Under_30_Days_Flag (1 = 30 days or less, 0 = more than 30 days)
numerical_housing_data['Under_30_Days_Flag'] = numerical_housing_data['Days on Market'].apply(lambda x: 1 if x <= 30 else 0)

# Verify the first few rows of the updated dataset
print(numerical_housing_data[['Days on Market', 'Under_30_Days_Flag']].head())

    Days on Market  Under_30_Days_Flag
24              55                   0
26              43                   0
37              53                   0
39              23                   1
40              21                   1


### Suspiciousness Score Based on Proximity to Important Locations
In this cell, we calculate a suspicion score for each listing based on its proximity to important locations such as universities and military bases. Listings close to high-traffic or important areas might be more likely to exhibit suspicious behavior, such as underpricing or quick turnover, which could indicate fraudulent activity.

1. Important Locations:
A list of important locations (e.g., universities, military bases) is used to compute suspiciousness scores. Each location is assigned a weight based on its importance, such as the size of the population it serves.
2. Suspiciousness Calculation:
We use the Haversine formula to calculate the distance between the listing and important locations. A suspiciousness score is calculated using a weighted logarithmic distance function, where listings closer to high-weight locations have higher suspiciousness scores.
3. Filtering and Visualization:
Listings without latitude or longitude are filtered out. We then visualize the suspiciousness scores on a heatmap using Folium, with the intensity of the color indicating higher levels of suspicion.
4. Heatmap:
The heatmap is centered around Duval County, and it is saved as an HTML file for further inspection.

In [42]:
# List of important locations with weights
important_coordinates = [
    {"name": "University of North Florida", "latitude": 30.2715, "longitude": -81.5094, "weight": 0.7727},  # Jacksonville, Duval County
    {"name": "Flagler College", "latitude": 29.8947, "longitude": -81.3145, "weight": 0.1182},  # St. Augustine, St. Johns County
    {"name": "St. Johns River State College", "latitude": 29.6486, "longitude": -81.6417, "weight": 0.2955},  # Palatka, Putnam County
    {"name": "Edward Waters University", "latitude": 30.3422, "longitude": -81.6794, "weight": 0.0455},  # Jacksonville, Duval County
    {"name": "Concorde Career Institute", "latitude": 30.3374, "longitude": -81.5546, "weight": 0.0227},  # Jacksonville, Duval County
    {"name": "First Coast Technical College", "latitude": 29.8922, "longitude": -81.3305, "weight": 0.0182},  # St. Augustine, St. Johns County
    {"name": "Jacksonville University", "latitude": 30.3532, "longitude": -81.6068, "weight": 0.2045},  # Jacksonville, Duval County
    {"name": "Jones Technical Institute", "latitude": 30.2449, "longitude": -81.5322, "weight": 0.0182},  # Jacksonville, Duval County
    {"name": "Tulsa Welding School", "latitude": 30.3385, "longitude": -81.5637, "weight": 0.0136},  # Jacksonville, Duval County
    {"name": "Chamberlain University-Florida", "latitude": 30.2598, "longitude": -81.5904, "weight": 0.0409},  # Jacksonville, Duval County
    {"name": "Fortis College-Orange Park", "latitude": 30.1785, "longitude": -81.7079, "weight": 0.0318},  # Orange Park, Clay County
    {"name": "Florida State College at Jacksonville", "latitude": 30.3322, "longitude": -81.6557, "weight": 1.0000},  # Jacksonville, Duval County
    {"name": "Trinity Baptist College", "latitude": 30.2395, "longitude": -81.7802, "weight": 0.0227},  # Jacksonville, Duval County
    {"name": "Keiser University", "latitude": 30.3326, "longitude": -81.6562, "weight": 0.0455},  # Jacksonville, Duval County
    {"name": "Heritage Institute", "latitude": 30.2033, "longitude": -81.5837, "weight": 0.0182},  # Jacksonville, Duval County
    {"name": "Embry-Riddle Aeronautical University", "latitude": 29.1880, "longitude": -81.0479, "weight": 0.4091},  # Daytona Beach, Volusia County
    {"name": "Naval Air Station Jacksonville", "latitude": 30.2358, "longitude": -81.6800, "weight": 0.9545},  # Jacksonville, Duval County
    {"name": "Naval Station Mayport", "latitude": 30.3915, "longitude": -81.4245, "weight": 0.5455},  # Jacksonville, Duval County
    {"name": "Camp Blanding Joint Training Center", "latitude": 29.9693, "longitude": -81.9840, "weight": 0.6818},  # Clay County
    {"name": "Marine Corps Blount Island Command", "latitude": 30.4111, "longitude": -81.5059, "weight": 0.1364},  # Jacksonville, Duval County
]

# Haversine formula to calculate the distance between two points
def haversine(lat1, lon1, lat2, lon2):
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * asin(sqrt(a))
    km = 6371 * c  # Radius of Earth in kilometers
    return km

# Function to calculate suspicion score based on proximity to important locations
def calculate_weighted_suspiciousness(row, important_locations, max_distance=50, scaling_factor=1):
    listing_lat = row['Latitude']
    listing_lon = row['Longitude']
    
    total_suspiciousness = 0
    baseline_suspiciousness = 0.05  # To account for listings far from important locations
    
    for location in important_locations:
        dist = haversine(listing_lat, listing_lon, location['latitude'], location['longitude'])
        
        # Only compute suspiciousness for distances within the max limit
        if dist <= max_distance:
            location_suspiciousness = location['weight'] * (1 / (log(dist + 1) + scaling_factor))
            total_suspiciousness += location_suspiciousness
    
    # Add baseline score if total suspiciousness is very low
    total_suspiciousness = max(baseline_suspiciousness, total_suspiciousness)
    
    return total_suspiciousness

# Apply the suspicion calculation to each listing in the dataset
numerical_housing_data['Distance Suspiciousness'] = numerical_housing_data.apply(lambda row: calculate_weighted_suspiciousness(row, important_coordinates), axis=1)

# Create a map centered around Duval County
m = folium.Map(location=[30.3322, -81.6557], zoom_start=10)

# Create a list of coordinates and weights (suspicion scores) for the heatmap
heatmap_data = [[row['Latitude'], row['Longitude'], row['Distance Suspiciousness']] for index, row in numerical_housing_data.iterrows()]

# Add the heatmap layer with suspicion scores
HeatMap(heatmap_data, max_value=1, radius=15, blur=10).add_to(m)

# Save the map to an HTML file
m.save('suspicion_score_heatmap.html')


In [25]:
plt.hist(numerical_housing_data['Distance Suspiciousness'], bins=30, edgecolor='black')
# Add titles and labels
plt.title('Distribution of Distance Suspiciousness')
plt.xlabel('Suspiciousness Score')
plt.ylabel('Frequency')

# Show the plot
plt.show()

KeyError: 'Distance Suspiciousness'

### Detecting Suspicious Phone Numbers Based on Multiple Agent Associations
In this cell, we aim to detect suspicious phone numbers by identifying numbers that are associated with more than one unique listing agent. These suspicious phone numbers could indicate potential fraud, such as a single person or entity listing properties under multiple names to create the illusion of different agents.

In [51]:
print(numerical_housing_data.columns)
print(raw_housing_data.columns)
print(numerical_housing_data['List Number'].duplicated().sum())
print(raw_housing_data['List Number'].duplicated().sum())


Index(['List Number', 'List Price', 'Bedrooms Total', 'Bathrooms Total',
       'Living Area', 'MLS Area Major', 'Year Built', 'Lot Size Acres',
       'Days on Market', 'Non-Representative Compensation', 'Stories Total',
       'Stories', 'Bathrooms Full', 'Bathrooms Half', 'Garage Spaces',
       'Original List Price', 'Latitude', 'Longitude', 'Price per Bedroom',
       'Price per Full Bathroom', 'Price per Total Bathroom',
       'Price per Story', 'Price per Garage Space', 'Price per Living Area',
       'Price per Lot Size Acre', 'Price per Year Built', 'Under_30_Days_Flag',
       'Distance Suspiciousness', 'Listing Agent_x', 'Agency Phone_x',
       'is_phone_suspicious', 'Listing Agent_y', 'Agency Phone_y',
       'Listing Agent', 'Agency Phone'],
      dtype='object')
Index(['List Number', 'Agency Phone', 'Listing Agent', 'Co-Listing Agent',
       'Property Type', 'Card Format', 'Book Section', 'Property Sub Type',
       'Listing Contract Date', 'Back on Market Date', 'Stat

In [49]:
print(numerical_housing_data['List Number'].dtype)
print(raw_housing_data['List Number'].dtype)


object
object


In [57]:
# Step 1: Merge based on 'List Number' as the unique identifier
# Drop 'Listing Agent' and 'Agency Phone' in numerical_housing_data if they exist
numerical_housing_data = numerical_housing_data.drop(columns=['Listing Agent', 'Agency Phone'], errors='ignore')

# Step 1: Merge based on 'List Number' as the unique identifier
numerical_housing_data = numerical_housing_data.merge(
    raw_housing_data[['List Number', 'Listing Agent', 'Agency Phone']], 
    on='List Number', 
    how='left'
)


# Step 2: Group by 'Agency Phone' and aggregate the unique 'Listing Agent' values
duplicates = numerical_housing_data.groupby('Agency Phone')['Listing Agent'].nunique()

# Step 3: Filter for phone numbers that are associated with more than one unique agent name
suspicious_numbers = duplicates[duplicates > 1]

# Step 4: Flag listings that contain suspicious phone numbers
def flag_suspicious_phone(row):
    if row['Agency Phone'] in suspicious_numbers.index:
        return 1  # Flag as suspicious
    return 0  # Not suspicious

# Step 5: Apply the function to the raw_housing_data DataFrame
numerical_housing_data['is_phone_suspicious'] = numerical_housing_data.apply(flag_suspicious_phone, axis=1)

# Step 6: Display flagged listings
suspicious_phone_listings = numerical_housing_data[numerical_housing_data['is_phone_suspicious'] == 1]
print(suspicious_phone_listings)


                                           List Number  List Price  \
0    gAAAAABm9wKy8iEF2iPj4yUG9W9aJzlue0ha1rOr4yH2hV...        1695   
1    gAAAAABm9wKyZy5yVqVfOab8ncZ2KX9fPYOjfO9KuSTwOh...        2700   
2    gAAAAABm9wKyO0FKSTfQtXpJt527HRb4FRUbo6r7zsshdH...        2500   
3    gAAAAABm9wKyzMbRTpX1vRA4eWcmaYrxsjpy9YlV1jQyBf...        4425   
4    gAAAAABm9wKyN72eWI3levdaCpsUo_0YQoDg888YszAwJH...        4500   
..                                                 ...         ...   
327  gAAAAABm9wKy4l8o4r_XnMbd9SBNRZZoxejrP1PVND5ugM...        4050   
331  gAAAAABm9wKyZ7oyYD1xjxFZxxKXEh4OlVsFjgug-4QJqk...        3200   
334  gAAAAABm9wKy3EKc9CTpbCMDFCnW0hOmDro6Bu-gIDu_4X...        3400   
338  gAAAAABm9wKyKdxHsPqZYBHf9lHdjPWGImYwIRwLd3NEmP...        1620   
339  gAAAAABm9wKy_Kh_pxh7FOqfUuzhjUAQuSn4K3cfqHvILr...        3550   

     Bedrooms Total  Bathrooms Total  Living Area  MLS Area Major  Year Built  \
0                 2                2       1230.0              26        2008 


# Principal Component Analysis (PCA) and Anomaly Detection

In this section, we apply PCA for dimensionality reduction and anomaly detection on housing data.
We will retain 95% of variance and also explore anomaly detection using DBSCAN and Isolation Forest.

---

### Step 1: Fit PCA and Transform the Data
Here, we fit PCA to the scaled dataset and retain 95% of the variance.

In [None]:
# Step 1: Fit the PCA (retaining 95% of variance)
scaler = StandardScaler()
x_scaled = scaler.fit_transform(cleaned_housing_data)

print(x_scaled.shape)
pca_model = PCA(n_components=0.95)
x_pca = pca_model.fit_transform(x_scaled)  # 'x_scaled' is the scaled input data


### Step 2: Create a DataFrame for the Principal Components

We create a DataFrame that contains the principal components for each data point.
Each component captures the variance in the original features.


In [54]:

# Create column names for PCA components
pca_columns = [f'PC{i+1}' for i in range(x_pca.shape[1])]

# Create a DataFrame for the PCA-transformed data
pca_df = pd.DataFrame(x_pca, columns=pca_columns)
pca_df.head()

model1 = IsolationForest(n_estimators=100, max_samples='auto', contamination='auto', random_state=12)

model1.fit(x_pca)

predict = model1.predict(x_pca)
anomalies = x_scaled[predict == -1]

In [None]:
# Step 1: Fit the PCA as before
pca = PCA(n_components=0.95)  # Retaining 95% of variance
x_pca = pca.fit_transform(x_scaled)

# Step 2: Create a DataFrame for the principal components
pca_columns = [f'PC{i+1}' for i in range(x_pca.shape[1])] 
pca_df = pd.DataFrame(x_pca, columns=pca_columns)

# Step 3: Get the contributions of each original feature to each principal component
loadings = pca.components_.T  # Transpose to get original features as rows
print(loadings.shape)
print(len(numerical_columns))
contributions_df = pd.DataFrame(loadings, index=numerical_columns, columns=pca_columns)


# Step 4: Get the top contributing feature for each principal component (for labeling purposes)
top_features_per_pc = contributions_df.abs().idxmax()

# Step 5: Create a box plot and label each principal component by its strongest contributing original feature
plt.figure(figsize=(15, 10))
sns.boxplot(data=pca_df)

# Rotate the text diagonally and align it so it reads from top to bottom
plt.xticks(range(len(top_features_per_pc)), top_features_per_pc, rotation=45, ha="right", rotation_mode="anchor")

# Title
plt.title('Boxplot for Each Principal Component - Labeled by Top Contributing Feature')
plt.show()

In [None]:
# Step 1: Fit the PCA as before
pca = PCA(n_components=0.95)  # Retaining 95% of variance
x_pca = pca.fit_transform(x_scaled)

# Step 2: Create a DataFrame for the principal components
pca_columns = [f'PC{i+1}' for i in range(x_pca.shape[1])]
pca_df = pd.DataFrame(x_pca, columns=pca_columns)

# Step 3: Get the contributions of each original feature to each principal component
loadings = pca.components_.T  # Transpose to get original features as rows
contributions_df = pd.DataFrame(loadings, index=numerical_columns, columns=pca_columns)

# Step 4: Get the top contributing feature for each principal component (for labeling purposes)
top_features_per_pc = contributions_df.abs().idxmax()

# Step 5: Add anomaly labels to the PCA DataFrame
pca_df['anomaly'] = predict  # Anomalies (-1) and normal points (1)

# Step 6: Parallel Coordinates Plot with PCA data and anomaly labels
plt.figure(figsize=(12, 6))
parallel_coordinates(pca_df, 'anomaly', color=['blue', 'red'])

# Step 7: Adjust the x-axis labels (principal components) with top contributing original feature names
plt.xticks(range(len(top_features_per_pc)), top_features_per_pc, rotation=45, ha="right", rotation_mode="anchor")

# Title
plt.title('Parallel Coordinates Plot for PCA-Transformed Housing Data')
plt.show()

In [None]:
# Step 1: Fit the PCA as before
pca = PCA(n_components=0.95)  # Retaining 95% of variance
x_pca = pca.fit_transform(x_scaled)

# Step 2: Create a DataFrame for the principal components
pca_columns = [f'PC{i+1}' for i in range(x_pca.shape[1])]
pca_df = pd.DataFrame(x_pca, columns=pca_columns)

# Step 3: Get the contributions of each original feature to each principal component
loadings = pca.components_.T  # Transpose to get original features as rows
contributions_df = pd.DataFrame(loadings, index=numerical_columns, columns=pca_columns)

# Step 4: Get the top contributing feature for each principal component (for labeling purposes)
top_features_per_pc = contributions_df.abs().idxmax()

# Step 5: Filter the PCA data to show only anomalies (-1)
anomalous_data = pca_df[predict == -1]

# Step 6: Plot a heatmap of the anomalous entries (use PCA-transformed data)
plt.figure(figsize=(12, 8))
sns.heatmap(anomalous_data, cmap='coolwarm', annot=False, linewidths=0.5)

# Step 7: Adjust x-axis labels to reflect original features (diagonal or vertical)
plt.xticks(range(len(top_features_per_pc)), top_features_per_pc, rotation=45, ha="right", rotation_mode="anchor")

# Title
plt.title('Heatmap of Anomalous Entries Across Principal Components (PCA-Transformed Data)')
plt.show()

In [None]:
# Step 1: Fit DBSCAN on the PCA-transformed data
dbscan_model = DBSCAN(eps=0.5, min_samples=5)  # Adjust eps and min_samples based on your data
dbscan_labels = dbscan_model.fit_predict(x_pca)

# Step 2: Add DBSCAN cluster labels to the PCA DataFrame
pca_df['dbscan_cluster'] = dbscan_labels

# Step 3: Visualize the clusters
plt.figure(figsize=(10, 7))
sns.scatterplot(x=pca_df['PC1'], y=pca_df['PC2'], hue=pca_df['dbscan_cluster'], palette='viridis')
plt.title('DBSCAN Clustering on PCA-Transformed Data')
plt.show()

# Step 4: Cross-examine DBSCAN clusters with Isolation Forest results
pca_df['anomaly_iforest'] = predict  # Add Isolation Forest anomaly labels (-1 for anomalies, 1 for normal)

# Compare clusters and anomalies
cross_examined = pca_df[(pca_df['dbscan_cluster'] != -1) & (pca_df['anomaly_iforest'] == -1)]

print(f"Number of points flagged by both DBSCAN and Isolation Forest: {len(cross_examined)}")
cross_examined

In [None]:
suspicious_words = [
    'urgent', 'alert', 'wire transfer', 'guaranteed', 'free',
    'cash only', 'as seen on', 'limited time', 'don’t miss out',
    'risk-free', 'act now', 'exclusive', 'once in a lifetime',
   'contact now', 'no credit check', 'easy approval','Foreclosure',
     "no deposit required", "move-in specials", "free month rent", "lease takeover", "rent-to-own", 
    "pre-approval needed", "urgent rental", "hurry, limited time offer", "cash only, no checks", 
    "first month free", "no background check", "instant approval", "no credit history needed", 
    "temporary housing", "assume the lease", "short-term rental", "virtual tour only", 
    "sublease opportunity", "guaranteed approval", "utilities included", "all bills paid", 
   "no application fee", "get approved today", "house sitting", "unbelievably low rent", 
    "no lease required", "instant income", "non-refundable deposit", "limited properties available", 
    "you won't believe the price", "exclusive listings", "flexible terms", "unforeseen circumstances", 
    "background check waived", "contact immediately", "first come, first served", "urgent need to rent", 
    "newly renovated", "don’t get left out", "act fast before it’s gone", "scam-free guarantee", 
    "friendly landlord", "best value rental", "quick approval process", "no hassle, no fees", 
    "all-inclusive rental", "hidden gem", "affordable living", "ideal for students", 
    "rent today, move in tomorrow"
]

def flag_suspicious_listings(row):
    # Join relevant columns into one text
    text = f"{row['Features']}".lower()  # Combine and convert to lowercase
    # Check for suspicious words
    for word in suspicious_words:
        if word in text:
            return 1  # Flag as suspicious
    return 0  # Not suspicious

# Assuming Features is the column to check
raw_housing_data['is_suspicious'] = raw_housing_data.apply(flag_suspicious_listings, axis=1)

# Display flagged listings
suspicious_listings = raw_housing_data[raw_housing_data['is_suspicious'] == 1]
print(suspicious_listings)

In [None]:

# Step 1: Start with the base cleaned DataFrame
final_df = cleaned_housing_data.copy()

# Step 2: Add the new features you created
# Assuming you have DataFrames or Series for these additional features, like df_price_metrics, proximity_scores, etc.

# Example: Add price metrics
final_df = pd.concat([final_df, df_price_metrics], axis=1)

# Example: Add scammy words flag (assuming 'is_suspicious' is a column in the raw_housing_data)
final_df['suspicious_diction'] = raw_housing_data['is_suspicious']

final_df['phone_is_suspicious'] =  raw_housing_data['is_phone_suspicious']
# Step 3: Clean the final DataFrame
# Handle NaN values, infinity, and other inconsistencies
final_df.fillna(final_df.median(), inplace=True)  # Fill NaNs with the median of each column


# Step 4: Check the final DataFrame
final_df.info()  # To see if everything looks good
final_df.head()  # To preview the first few rows


file_path = 'data/final_training_set.csv'

# Check if the file already exists
if not os.path.exists(file_path):
    final_df.to_csv(file_path, index=False)
    print(f"File saved as {file_path}")

In [None]:
final_df.columns

columns_to_drop = ['List Price', 'Bedrooms Total', 'Bathrooms Total', 'Living Area',
                   'MLS Area Major', 'Year Built', 'Lot Size Acres', 'Days on Market',
                   'Non-Representative Compensation', 'Stories Total', 'Stories',
                   'Bathrooms Full', 'Bathrooms Half', 'Garage Spaces',
                   'Original List Price', 'Latitude', 'Longitude']

# Drop the columns
final_df = final_df.drop(columns=columns_to_drop)


'''Over_30_Days_Flag',
       'Distance Suspiciousness', 'Price per Bedroom',
       'Price per Full Bathroom', 'Price per Total Bathroom',
       'Price per Story', 'Price per Garage Space', 'Price per Living Area',
       'Price per Lot Size Acre', 'Price per Year Built', 'suspicious_diction',
       'phone_is_suspicious'''

print(final_df.columns)

In [None]:
# Step 1: Select relevant features for anomaly detection
features = final_df.columns

print(features)

# Standardize the features
scaler = StandardScaler()

# Step 1: Replace infinite values with NaN
final_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Step 2: Fill NaN values (e.g., with the median of each column)
final_df.fillna(final_df.median(), inplace=True)

X_scaled = scaler.fit_transform(final_df[features])

In [98]:
# Initialize and fit Isolation Forest model
isolation_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
final_df['isolation_forest_flag'] = isolation_forest.fit_predict(X_scaled)

# Anomalies are flagged as -1
anomalies_if = final_df[final_df['isolation_forest_flag'] == -1]


In [99]:
# Initialize and fit LOF model
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
final_df['lof_flag'] = lof.fit_predict(X_scaled)

# LOF also flags anomalies as -1
anomalies_lof = final_df[final_df['lof_flag'] == -1]


In [None]:
'''# Prepare a label where 0 is normal and 1 is anomaly
final_df['anomaly_label'] = (final_df['suspicious_diction'] == 1).astype(int)

# Prepare the DMatrix for XGBoost
X = final_df[features]
y = final_df['anomaly_label']

dtrain = xgb.DMatrix(X, label=y)

# Train the XGBoost model (we can treat it as a classification problem)
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 3,
    'learning_rate': 0.1,
    'scale_pos_weight': len(y) / y.sum()  # Adjust for class imbalance
}
xgb_model = xgb.train(params, dtrain, num_boost_round=100)

# Predict anomaly scores (using threshold 0.5 to flag anomalies)
final_df['xgboost_flag'] = (xgb_model.predict(dtrain) > 0.5).astype(int)

# Flag XGBoost anomalies
anomalies_xgb = final_df[final_df['xgboost_flag'] == 1]
'''

In [None]:
# Combine flags: Check if all three models flagged the same listing
final_df['flagged_by_all'] = (
    (final_df['isolation_forest_flag'] == -1) & 
    (final_df['lof_flag'] == -1)) #& 
    # (final_df['xgboost_flag'] == 1)
#)

# Get the listings flagged by all models
anomalies_all_models = final_df[final_df['flagged_by_all']]

# Display the results
anomalies_all_models.head()


In [None]:
# Isolation Forest Scatter Plot
plt.figure(figsize=(10, 6))
plt.scatter(final_df.index, final_df['Price per Bedroom'], 
            c=final_df['isolation_forest_flag'], cmap='coolwarm', label='Anomaly Score')
plt.title('Isolation Forest - Anomaly Detection based on Price per Bedroom')
plt.xlabel('Index')
plt.ylabel('Price per Bedroom')
plt.colorbar(label='Flag (-1: Anomaly, 1: Normal)')
plt.show()


In [None]:
# LOF Scatter Plot
plt.figure(figsize=(10, 6))
plt.scatter(final_df.index, final_df['Price per Bedroom'], 
            c=final_df['lof_flag'], cmap='coolwarm', label='Anomaly Score')
plt.title('LOF - Anomaly Detection based on Price per Bedroom')
plt.xlabel('Index')
plt.ylabel('Price per Bedroom')
plt.colorbar(label='Flag (-1: Anomaly, 1: Normal)')
plt.show()


In [None]:
'''# XGBoost Probability Scatter Plot
xgboost_probs = xgb_model.predict(dtrain)

plt.figure(figsize=(10, 6))
plt.scatter(final_training_set.index, xgboost_probs, c=(xgboost_probs > 0.5).astype(int), cmap='coolwarm')
plt.title('XGBoost - Predicted Anomaly Probabilities')
plt.xlabel('Index')
plt.ylabel('Anomaly Probability')
plt.colorbar(label='Flag (1: Anomaly, 0: Normal)')
plt.show()'''


In [None]:
from matplotlib_venn import venn2

# Count how many were flagged by each model
isolation_flags = set(final_df.index[final_df['isolation_forest_flag'] == -1])
lof_flags = set(final_df.index[final_df['lof_flag'] == -1])

# Create a Venn diagram comparing the results of Isolation Forest and LOF
plt.figure(figsize=(8, 8))
venn = venn2([isolation_flags, lof_flags], 
             set_labels=('Isolation Forest', 'LOF'))

plt.title('Venn Diagram of Anomalies Flagged by Isolation Forest and LOF')
plt.show()
plt.figure(figsize=(10, 6))
plt.scatter(final_df.index, final_df['Price per Bedroom'], 
            c=final_df['flagged_by_all'], cmap='coolwarm')
plt.title('Listings Flagged as Anomalies by All Models')
plt.xlabel('Index')
plt.ylabel('List Price')
plt.colorbar(label='Flag (1: Anomaly by all models, 0: Normal)')
plt.show()



In [None]:
# Get the indices of listings flagged as anomalies by either Isolation Forest or LOF
anomalies_by_either = isolation_flags.union(lof_flags)
anomalies_by_either_list = list(anomalies_by_either)

# Filter the DataFrame to get the listings marked as anomalies by either model
anomalous_listings = final_df.loc[anomalies_by_either_list]

# Print the listings marked as anomalies by either model
print("Listings flagged as anomalies by either Isolation Forest or LOF:")
print(anomalous_listings)

In [None]:

# Assuming final_df is your updated DataFrame
# Step 1: Select only the numerical features from final_df for PCA
numeric_features = final_df.select_dtypes(include=['float64', 'int64']).columns

# Step 2: Extract the numeric features from final_df
X = final_df[numeric_features].values

# Step 3: Standardize the numeric features before applying PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Check the shape of X_scaled to confirm preprocessing
print(X_scaled.shape)

# Step 4: Apply PCA, retaining 95% of the variance
pca = PCA(n_components=0.95)  # Retaining 95% variance

x_pca = pca.fit_transform(X_scaled)  # Perform PCA on scaled data

# Step 5: Create a DataFrame for the PCA-transformed data
pca_columns = [f'PC{i+1}' for i in range(x_pca.shape[1])]

# Get the top contributing features for each principal component
top_features_per_pc = []
for i in range(pca.components_.shape[0]):
    top_feature_index = np.argmax(np.abs(pca.components_[i]))
    top_feature = numeric_features[top_feature_index]
    top_features_per_pc.append(top_feature)

# Create more descriptive principal component labels
pca_columns_descriptive = [f'PC{i+1} ({top_features_per_pc[i]})' for i in range(len(pca_columns))]

pca_df = pd.DataFrame(x_pca, columns=pca_columns)

# Step 6: Visualize the explained variance for each principal component
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_, marker='o')
plt.title('Explained Variance by Principal Components')
plt.xlabel('Principal Components')
plt.ylabel('Variance Ratio')
plt.show()

# Step 4: Get the loadings (contributions) of each original feature to each principal component
loadings = pca.components_.T  # Transpose to get original features as rows
contributions_df = pd.DataFrame(loadings, index=numeric_features, columns=pca_columns)

# Step 5: Generate a heatmap of the contributions
plt.figure(figsize=(12, 8))
sns.heatmap(contributions_df, cmap='coolwarm', annot=True, fmt='.2f', linewidths=0.5)
plt.title('Heatmap of Feature Contributions to Principal Components')
plt.show()

# Limit to a reasonable number of principal components (e.g., 10 for visualization)
pca_df_limited = pca_df.iloc[:, :17]

# Generate the boxplot
plt.figure(figsize=(12, 8))
sns.boxplot(data=pca_df_limited)

# Fix the xticks and labels
plt.title('Boxplot of Principal Components')
plt.xticks(rotation=90, ticks=np.arange(len(pca_df_limited.columns)), labels=pca_columns_descriptive[:17])
plt.show()


# # Step 6: Boxplot for each principal component
# plt.figure(figsize=(12, 8))
# # pca_df.boxplot()
# sns.boxplot(data=pca_df)

# plt.title('Boxplot of Principal Components')
# plt.xticks(rotation=90, ticks=np.arange(1, len(pca_columns_descriptive) + 1), labels=pca_columns_descriptive)
# plt.show()

#Step 7: Perform DBSCAN on the PCA-transformed data
dbscan = DBSCAN(eps=0.5, min_samples=5)  # You can adjust eps and min_samples based on your data
dbscan_labels = dbscan.fit_predict(x_pca)

# Step 8: Add the DBSCAN labels to the PCA DataFrame
pca_df['DBSCAN_label'] = dbscan_labels

# Step 9: Visualize the DBSCAN results using a scatter plot of the first two principal components
plt.figure(figsize=(10, 6))
sns.scatterplot(x='PC1', y='PC2', hue='DBSCAN_label', data=pca_df, palette='coolwarm')
plt.title('DBSCAN Clustering on Principal Components')
plt.show()

In [None]:
# Get the top contributing features for each principal component
top_features_per_pc = []
for i in range(pca.components_.shape[0]):
    top_feature_index = np.argmax(np.abs(pca.components_[i]))
    top_feature = numeric_features[top_feature_index]
    top_features_per_pc.append(top_feature)

# Create more descriptive principal component labels
pca_columns_descriptive = [f'PC{i+1} ({top_features_per_pc[i]})' for i in range(len(pca_columns))]

# Update the heatmap with descriptive labels
plt.figure(figsize=(12, 8))
sns.heatmap(contributions_df, cmap='coolwarm', annot=True, fmt='.2f', linewidths=0.5)
plt.title('Feature Contribution to Principal Components', fontsize=16)
plt.xlabel('Principal Components (Top Feature in Parentheses)', fontsize=12)
plt.ylabel('Feature Names', fontsize=12)
plt.xticks(ticks=np.arange(len(pca_columns_descriptive)) + 0.5, labels=pca_columns_descriptive, rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()  # Adjust layout to prevent overlap
plt.show()

# Use descriptive labels for the principal components in the explained variance plot
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_, marker='o')
plt.title('Variance Explained by Principal Components', fontsize=16)
plt.xlabel('Principal Components (Top Feature)', fontsize=12)
plt.ylabel('Variance Ratio (Explained Variance %)', fontsize=12)
plt.xticks(ticks=np.arange(1, len(pca_columns_descriptive) + 1), labels=pca_columns_descriptive, rotation=45, ha='right')
plt.grid(True)
plt.tight_layout()  # Ensure layout doesn't overlap
plt.show()



In [105]:
def estimate_financial_damage(anomalies_df, precision=0.8, scam_success_rate=0.5):
    """
    Estimate financial damages caused by scams based on anomalies.

    Parameters:
    anomalies_df (DataFrame): The dataframe of detected anomalies.
    precision (float): The precision rate of actual scams within anomalies (default=0.8).
    scam_success_rate (float): The success rate of scams succeeding (default=0.5).

    Returns:
    float: Estimated financial damages.
    """
    # Calculate the estimated number of actual scams
    estimated_scams = len(anomalies_df) * precision * scam_success_rate
    
    # Calculate the mean price of the anomalies
    mean_price = anomalies_df['Price per Story'].mean()
    
    # Estimate total financial damage
    estimated_damage = estimated_scams * mean_price
    
    return estimated_damage

In [None]:
estimate_financial_damage(anomalous_listings)



In [None]:
len(anomalous_listings)

In [None]:
anomalous_listings.sort_values(by = "Distance Suspiciousness", ascending=False)

In [None]:
features_for_detection = final_df.columns
features_for_detection

print(final_df['Under_30_Days_Flag'])

In [None]:
'''
# Prepare the data for anomaly detection (select relevant columns)
# Assuming df_price_metrics is the DataFrame with price metrics
features_for_detection = final_df.columns

# Fill NaN values with the median of each column (or another appropriate strategy)
features_for_detection = features_for_detection.fillna(features_for_detection.median())

# Fit the Isolation Forest model
iso_forest = IsolationForest(contamination=0.05, random_state=42)  # Adjust contamination rate
iso_forest.fit(features_for_detection)

# Get anomaly scores (-1 means anomaly)
anomaly_labels = iso_forest.predict(features_for_detection)
df_price_metrics['Anomaly'] = anomaly_labels

# Filter out anomalies (scam candidates)
anomalous_listings = df_price_metrics[df_price_metrics['Anomaly'] == -1]

# Display the anomalies
print(f"Number of anomalies detected: {len(anomalous_listings)}")
anomalous_listings.head()
'''

# Select only numerical columns for the anomaly detection
numerical_features = final_df.select_dtypes(include=[np.number])

# Fill NaN values with the median of each column (or another appropriate strategy)
numerical_features = numerical_features.fillna(numerical_features.median())

# Initialize StandardScaler
scaler = StandardScaler()

# Fit the scaler on the numerical data and transform it
scaled_features = scaler.fit_transform(numerical_features)

# Convert the scaled features back to a DataFrame and assign column names
scaled_df = pd.DataFrame(scaled_features, columns=numerical_features.columns)

# Replace the original numerical features in final_df with the scaled ones
final_df[numerical_features.columns] = scaled_df

# Fit the Isolation Forest model
iso_forest = IsolationForest(contamination=0.05, random_state=42)  # Adjust contamination rate
iso_forest.fit(numerical_features)

# Get anomaly scores (-1 means anomaly)
anomaly_labels = iso_forest.predict(numerical_features)
final_df['Anomaly'] = anomaly_labels

# Filter out anomalies (scam candidates)

# Display the anomalies
print(f"Number of anomalies detected: {len(anomalous_listings)}")
anomalous_listings


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Re-apply Min-Max Scaling on the numerical columns (excluding irrelevant ones)
min_max_scaler = MinMaxScaler()

# Fit the scaler and transform the clean numerical data
scaled_data_min_max = min_max_scaler.fit_transform(numerical_features)

# Convert the scaled data back to a DataFrame for easier manipulation
scaled_df_min_max = pd.DataFrame(scaled_data_min_max, columns=numerical_columns)

# Fit the Isolation Forest model on the Min-Max scaled data
iso_forest_min_max = IsolationForest(contamination=0.05, random_state=42)
iso_forest_min_max.fit(scaled_df_min_max)

# Get anomaly labels (-1 means anomaly)
anomaly_labels_min_max = iso_forest_min_max.predict(scaled_df_min_max)

# Adding the anomaly labels to the original dataframe
final_df['Anomaly'] = anomaly_labels_min_max

# Filter out anomalies (scam candidates)
anomalous_listings_min_max = final_df[final_df['Anomaly'] == -1]

# Displaying the number of anomalies and first few anomalous rows
len(anomalous_listings_min_max), anomalous_listings_min_max.head()


In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Create a 3D plot
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')

# Plot normal listings
ax.scatter(final_df[final_df['Anomaly'] == 1]['Price per Story'],
           final_df[final_df['Anomaly'] == 1]['Price per Bedroom'],
           final_df[final_df['Anomaly'] == 1]['Distance Suspiciousness'],
           c='blue', label='Normal Listings', alpha=0.6)

# Plot anomalous listings
ax.scatter(final_df[final_df['Anomaly'] == -1]['Price per Story'],
           final_df[final_df['Anomaly'] == -1]['Price per Bedroom'],
           final_df[final_df['Anomaly'] == -1]['Distance Suspiciousness'],
           c='red', label='Anomalies (Scams)', alpha=0.8)

# Labeling
ax.set_title('Anomaly Detection - 3D Plot')
ax.set_xlabel('Price per Story')
ax.set_ylabel('Price per Bedroom')
ax.set_zlabel('Distance Suspiciousness')

plt.legend()
plt.show()


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import numpy as np
import pandas as pd

# Assuming 'final_df' is your DataFrame with all the features
# Select only numerical columns for scaling
# Select only numerical columns for the anomaly detection

# Initialize StandardScaler
scaler = StandardScaler()

# Fit the scaler on the numerical data and transform it
scaled_features = scaler.fit_transform(numerical_features)

# Convert the scaled features back to a DataFrame and assign column names
scaled_df = pd.DataFrame(scaled_features, columns=numerical_features.columns)

# Replace the original numerical features in final_df with the scaled ones
final_df[numerical_features.columns] = scaled_df

# Now, final_df is standardized and ready for training with Isolation Forest

# Initialize the Isolation Forest
iso_forest = IsolationForest(contamination=0.05, random_state=42)

# Fit the model on the standardized data
iso_forest.fit(final_df.select_dtypes(include=[np.number]))

# Predict anomalies (-1 means anomaly, 1 means normal)
anomaly_labels = iso_forest.predict(final_df.select_dtypes(include=[np.number]))

# Add the anomaly labels to the DataFrame
final_df['Anomaly'] = anomaly_labels

# Filter out the anomalies (scam candidates)
anomalous_listings = final_df[final_df['Anomaly'] == -1]

# Display the anomalies
print(f"Number of anomalies detected: {len(anomalous_listings)}")
print(anomalous_listings.head())
