In [2]:
import pandas as pd
import numpy as np

# Read the cleaned dataset from disk (adjust the path for your machine).
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the project root so the notebook runs elsewhere.
df = pd.read_csv(
    filepath_or_buffer='/Users/khali/Desktop/Coding Workspace/DataMining/data/cleaned/cleaned_data.csv'
)

# Function to identify outliers using IQR
def identify_outliers_iqr(df, column, multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` (lower) and
    ``Q3 + multiplier * IQR`` (upper), where Q1 and Q3 are the 25th and 75th
    percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the column to screen. It is not modified.
    column : str
        Label of a numeric column in ``df``.
    multiplier : float, default 1.5
        Number of IQRs beyond the quartiles a value must lie to be flagged.
        The default reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (all columns, original index labels) whose value in
        ``column`` is strictly below the lower fence or strictly above the
        upper fence. Rows with a missing value in ``column`` are never
        flagged, because comparisons against NaN evaluate to False.

    Raises
    ------
    ValueError
        If ``multiplier`` is negative (the fences would be inverted).
    KeyError
        If ``column`` is not a column of ``df``.
    """
    if multiplier < 0:
        raise ValueError(f"multiplier must be non-negative, got {multiplier!r}")

    # Look the column up once instead of on every use.
    values = df[column]

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR

    # Explicit comparisons (rather than ~between) so NaN rows stay excluded.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return df[is_outlier]

# Numeric columns that are screened, one at a time, for IQR outliers
columns_to_check = ['sellingprice', 'odometer', 'condition', 'year']

# Map each screened column to the rows flagged as outliers for that column
anomalies = {
    column: identify_outliers_iqr(df, column)
    for column in columns_to_check
}

# Columns shown alongside every flagged row to identify the vehicle
context_columns = ['year', 'make', 'model']

# Display potential anomalies
for column, anomaly_df in anomalies.items():
    # Add the screened column only when it is not already a context column;
    # otherwise 'year' would be selected twice and printed as a duplicate column.
    if column in context_columns:
        display_columns = context_columns
    else:
        display_columns = context_columns + [column]

    print(f"Potential anomalies in '{column}':")
    print(anomaly_df[display_columns])  # Display relevant columns
    print("\n")


Potential anomalies in 'sellingprice':
        year           make                model  sellingprice
4       2014            BMW  6 Series Gran Coupe       67000.0
6       2014            BMW                   M5       65000.0
10      2014           Audi                   A6       49750.0
15      2014           Audi                   Q5       40000.0
17      2014            BMW             6 Series       67200.0
...      ...            ...                  ...           ...
472275  2014  Mercedes-Benz              E-Class       40500.0
472276  2012       Maserati         Quattroporte       40250.0
472302  2014  Mercedes-Benz              E-Class       69500.0
472308  2013  Mercedes-Benz              G-Class      105000.0
472315  2013           Audi                   S5       42000.0

[14118 rows x 4 columns]


Potential anomalies in 'odometer':
        year       make            model  odometer
258     2013    Hyundai    Elantra Coupe  999999.0
1528    2012     Nissan         Frontier

In [4]:
# Extra credit: a second, multivariate anomaly detector (Local Outlier Factor)

from sklearn.neighbors import LocalOutlierFactor

# Reload the cleaned data so this cell starts from an unmodified frame
df = pd.read_csv(
    filepath_or_buffer='/Users/khali/Desktop/Coding Workspace/DataMining/data/cleaned/cleaned_data.csv'
)

# Features that LOF scores jointly.
# NOTE(review): these columns appear to be on very different scales; LOF is
# distance-based, so consider standardizing them first — confirm before changing,
# as it would alter which rows are flagged.
X = df[["sellingprice", "odometer", "condition"]]

# Configure the detector, then label every row in one pass
lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination=0.01,  # expected fraction of outliers; tune as needed
)
outlier_labels = lof.fit_predict(X)

# fit_predict marks outliers with -1; store that as a 1/0 flag column
df["lof_outlier"] = (outlier_labels == -1).astype(int)  # 1 = outlier, 0 = inlier

# Report how many rows landed in each class
print("LOF Outlier counts:", df["lof_outlier"].value_counts(), sep="\n")

# Show the first few flagged rows for manual inspection
flagged_rows = df.loc[df["lof_outlier"].eq(1)]
print("\nSample of detected LOF outliers:", flagged_rows.head(10), sep="\n")



LOF Outlier counts:
lof_outlier
0    467601
1      4724
Name: count, dtype: int64

Sample of detected LOF outliers:
      year       make                model               trim           body  \
37    2014        BMW                   X6          xDrive35i            SUV   
53    2015     Nissan                Versa             1.6 SL          Sedan   
67    2014        BMW  6 Series Gran Coupe               650i          Sedan   
204   2013   Infiniti              G Sedan        G37 Journey        G Sedan   
258   2013    Hyundai        Elantra Coupe                 GS  Elantra Coupe   
319   2013    Hyundai             Santa Fe         Sport 2.0T            SUV   
428   2013        Kia             Sportage                 LX            SUV   
930   2012  Chevrolet               Malibu                 LT          Sedan   
982   2012        BMW                   X5  xDrive35i Premium            SUV   
1005  2012  Chevrolet               Malibu           LS Fleet          Sedan   

   

### Extra Credit

1) The scikit-learn library provides the Local Outlier Factor (LOF) algorithm, which can help us identify potential anomalies and outliers. We selected three attributes, in our case sellingprice, odometer, and condition, which the algorithm analyzes together. The algorithm works by fitting a model with n neighbors, 20 in this case, and estimating the local density of each point. If a point's density is substantially lower than that of its neighbors, it is a potential outlier. The fit_predict() function then labels outliers and inliers as -1 and 1, respectively.

2) Please refer to the code for the selected attributes.

3) We did in fact find some anomalies. Some of the sample anomalies were also flagged by our MAD algorithm, for example the Hyundai Elantra with a million miles.

4) One anomaly was the 2014 BMW X6 with less than a thousand miles and a condition of 4. This could mean that the vehicle was involved in an accident soon after it was bought and was never resold. Another anomaly was the 2013 G37 sedan with a condition of 44 and a mileage of 14,486. This car was probably kept in very good condition and was possibly only driven in the summer or on weekends. Finally, of course, we have the example of the million-mile Elantra, which can be regarded as a clear outlier. A couple of these examples, such as the Elantra and the X6, can be disregarded altogether due to their abnormal combination of attributes.