In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import probplot
from pandas.plotting import scatter_matrix
from scipy import stats

In [None]:
# Load the raw vehicle sales CSV from the Kaggle input directory.
# NOTE(review): sep=';' assumes the file is semicolon-delimited — confirm against the actual CSV.
car_prices_df = pd.read_csv('/kaggle/input/vehicle-sales-data/car_prices.csv', sep=';')
# Show the first rows as the cell output.
car_prices_df.head()

#### Description of the dataset columns

1. **Year:** The manufacturing year of the vehicle (e.g., 2015) <br>
2. **Make:** The brand or manufacturer of the vehicle (e.g., Kia, BMW, Volvo) <br>
3. **Model:** The specific model of the vehicle (e.g., Sorento, 3 Series, S60, 6 Series Gran Coupe) <br>
4. **Trim:** Additional designation for a particular version or option package of the model (e.g., LX, 328i SULEV, T5, 650i) <br>
5. **Body:** The type of vehicle body (e.g., SUV, Sedan) <br>
6. **Transmission:** The type of transmission in the vehicle (e.g., automatic) <br>
7. **VIN:** The Vehicle Identification Number, a unique code used to identify individual motor vehicles <br>
8. **State:** The state in which the vehicle is located or registered (e.g., CA for California) <br>
9. **Condition:** A numerical representation of the condition of the vehicle (e.g., 5.0) <br>
10. **Odometer:** The mileage or distance traveled by the vehicle <br>
11. **Color:** The exterior color of the vehicle <br>
12. **Interior:** The interior color of the vehicle <br>
13. **Seller:** The entity or company selling the vehicle (e.g., Kia Motors America Inc, Financial Services Remarketing) <br>
14. **MMR:** Manheim Market Report, a pricing tool used in the automotive industry <br>
15. **Selling Price:** The price at which the vehicle was sold <br>
16. **Sale Date:** The date and time when the vehicle was sold <br>

In [None]:
# Summarise the raw frame (columns, dtypes, non-null counts).
car_prices_df.info()

In [None]:
# Work on a copy so the freshly loaded car_prices_df is left untouched by the cleaning steps.
df = car_prices_df.copy()
# Count missing values per column.
df.isnull().sum()

The dataset comprises 558,837 entries with 16 columns detailing vehicle attributes. Numerical features include manufacturing year, vehicle condition, odometer reading, Manheim Market Report, and selling price. Categorical features encompass make, model, trim, body type, transmission type, VIN number, color, interior color, seller information, and sale date. Some columns contain missing values, requiring data type adjustments for analysis.

# Exploratory Data Analysis

#### Handling Missing Values In Categorical Columns

When dealing with a categorical column like 'make' with a significant number of null values, filling them requires careful consideration. Since 'make' represents the brand or manufacturer of the vehicle, blindly filling null values with the most common value may introduce bias.

1. Fill with a Placeholder Category <br>
2. Fill with the mode (the most frequent category); median and mean only apply to numerical columns <br>
3. Remove Null values <br>

We are going to apply all of the above imputation techniques to our columns.


In [None]:
# Fill with the 'Other' placeholder category.
# The result is assigned back to the column rather than calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary column
# selection (chained assignment), which raises a FutureWarning on pandas 2.x
# and leaves `df` unchanged when Copy-on-Write is enabled.
for col in ['make', 'model', 'trim', 'color']:
    df[col] = df[col].fillna('Other')

# Fill with the mode (most frequent value) of each column.
for col in ['body', 'transmission', 'interior']:
    df[col] = df[col].fillna(df[col].mode()[0])

# Remove rows that have no VIN or no sale date.
# `inplace=True` is safe here because it is applied to the frame itself.
df.dropna(subset=['vin', 'saledate'], inplace=True)


In [None]:
# Re-count missing values per column after the categorical imputation above.
df.isnull().sum()

#### Handling Missing Values in Numerical Columns

In [None]:
# Stacked histograms (one row each) for the numeric columns in `df` that still contain gaps.
plt.figure(figsize=(12, 8))

# (column name, display label, bar colour) for each panel, top to bottom.
histogram_specs = [
    ('condition', 'Condition', 'blue'),
    ('odometer', 'Odometer', 'green'),
    ('mmr', 'MMR', 'orange'),
]

for position, (column, label, colour) in enumerate(histogram_specs, start=1):
    plt.subplot(3, 1, position)
    # Missing values are dropped so they do not affect the binning.
    plt.hist(df[column].dropna(), bins=20, color=colour, alpha=0.7)
    plt.title(f'Distribution of {label}')
    plt.xlabel(label)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()


Visualize the distribution of values in the 'condition', 'odometer', and 'mmr' columns. The peaks and shapes of the histograms show where most of the values lie, which helps in choosing between the mean and the median for imputation.

In [None]:
# Impute the remaining numeric gaps: median for 'condition', mean for 'odometer' and 'mmr'.
# Assign the filled column back instead of using `fillna(..., inplace=True)` on a
# column selection; the chained in-place form warns on pandas 2.x and is a no-op
# under Copy-on-Write.
df['condition'] = df['condition'].fillna(df['condition'].median())
df['odometer'] = df['odometer'].fillna(df['odometer'].mean())
df['mmr'] = df['mmr'].fillna(df['mmr'].mean())

In [None]:
# Re-count missing values per column after the numeric imputation above.
df.isnull().sum()

<br>

#### Checking For Duplicate Values

In [None]:
# Rows that are exact repeats of an earlier row (the first occurrence is not flagged).
duplicate_rows = df[df.duplicated()]
# Show how many duplicated rows there are. The previous `duplicate_rows.sum()`
# added up the column values of those rows instead of counting them.
len(duplicate_rows)

# If duplicates exist they can be removed with: df = df.drop_duplicates()

In our dataset there are no duplicate rows.

#### Analyzing Dataset

In [None]:
# Summary statistics for the numeric columns of the imputed frame.
df.describe()

Provided statistics (count, mean, std, min, 25%, 50%, 75%, max) for the columns 'year', 'condition', 'odometer', 'mmr', and 'sellingprice'. These statistics provide a summary of the central tendency, dispersion, and range of values in each column.

#### Outliers Handling

**Identify Outliers**
    
You can use statistical methods or visualizations to identify outliers. Common methods include
    
1. Box Plots: Visualize the distribution of each numerical variable and identify points beyond the whiskers. <br>
2. Z-Score: Calculate the Z-score for each data point in a numerical column. Points with a high absolute Z-score (e.g., >3 or <-3) can be considered outliers.

In [None]:
# Names of the float64/int64 columns; reused by the following cells.
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns

# One box plot per numeric column, stacked vertically.
fig, axes = plt.subplots(nrows=len(numerical_columns), ncols=1, figsize=(10, 8))
fig.subplots_adjust(hspace=0.5)

for ax, column in zip(axes, numerical_columns):
    ax.boxplot(df[column].dropna())
    ax.set_title(f'Box Plot for {column}')
    ax.set_ylabel(column)

plt.show()

In [None]:
# Pairwise scatter plots of the numeric columns, with a histogram of each column on the diagonal.
scatter_matrix(df[numerical_columns], alpha=0.5, figsize=(12, 12), diagonal='hist')
# y=1.02 positions the figure title slightly above the top of the figure area.
plt.suptitle('Scatter Matrix for Numerical Columns', y=1.02)
plt.show()


In [None]:
# Remove outliers using Z-score.
z_scores = stats.zscore(df[numerical_columns])
# Keep a row only when every numeric value lies within 2 standard deviations of its
# column mean in EITHER direction. Without abs(), large negative z-scores
# (unusually low values) always passed the `< 2` test and were never removed.
# NOTE(review): the accompanying text mentions a threshold of 3; 2 is kept here — confirm intent.
# .copy() makes clean_df an independent frame so later cells can modify it
# without SettingWithCopyWarning.
clean_df = df[(abs(z_scores) < 2).all(axis=1)].copy()
clean_df.shape

In [None]:
# Same box-plot grid as before, drawn from the outlier-filtered frame.
numerical_columns = clean_df.select_dtypes(include=['float64', 'int64']).columns

fig, axes = plt.subplots(nrows=len(numerical_columns), ncols=1, figsize=(10, 8))
fig.subplots_adjust(hspace=0.5)

for panel, column in zip(axes, numerical_columns):
    panel.boxplot(clean_df[column].dropna())
    panel.set(title=f'Box Plot for {column}', ylabel=column)

plt.show()

In [None]:
# Peek at the first two rows of the outlier-filtered frame.
clean_df.head(2)

**Understand the Impact**<br>
    
Before removing outliers, it's crucial to understand the impact on your analysis or model. Outliers may contain valuable information or be indicative of specific patterns in your data. Always consider the context of your data and the goals of your analysis or modeling before deciding to remove outliers.

In [None]:
# Summary statistics after outlier removal, for comparison with df.describe().
clean_df.describe()

In [None]:
# Drop 'saledate': the column caused errors and is not needed here, since no
# time-series analysis is performed on this data.
# Reassigning (rather than dropping in place on a filtered slice) avoids
# SettingWithCopyWarning, and errors='ignore' keeps the cell re-runnable after
# the column has already been removed.
clean_df = clean_df.drop(columns=['saledate'], errors='ignore')

In [None]:
# Confirm the frame's columns after dropping 'saledate'.
clean_df.head(5)

In [None]:
# (rows, columns) of the cleaned frame.
clean_df.shape

In [None]:
# Histogram of selling prices in the cleaned data.
price_fig, price_ax = plt.subplots(figsize=(20, 6))
price_ax.hist(clean_df['sellingprice'], bins=30, edgecolor='black', alpha=0.7)
price_ax.set_title('Distribution of Prices')
price_ax.set_xlabel('Price')
price_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Odometer reading against selling price, one point per vehicle.
# NOTE(review): the labels say kilometres; the odometer unit is not established in this notebook — confirm.
scatter_fig, scatter_ax = plt.subplots(figsize=(20, 6))
scatter_ax.scatter(clean_df['odometer'], clean_df['sellingprice'], alpha=0.5)
scatter_ax.set_title('Scatter Plot: Kilometres vs. Price')
scatter_ax.set_xlabel('Kilometres')
scatter_ax.set_ylabel('Price')
plt.show()

In [None]:
# Number of vehicles per brand, most frequent first.
plt.figure(figsize=(20, 6))

brand_counts = clean_df['make'].value_counts()
brand_counts.plot.bar(color='orange')
plt.title('Distribution of Cars by Brand')
plt.xlabel('Brand')
plt.ylabel('Count')
plt.xticks(rotation=90, ha='right')
plt.show()

In [None]:
# Number of vehicles per body type, most frequent first.
plt.figure(figsize=(20, 6))
body_type_counts = clean_df['body'].value_counts()
body_type_counts.plot.bar(color='red')
plt.title('Distribution of Cars by Body Type')
plt.xlabel('Body Type')
plt.ylabel('Count')
plt.xticks(rotation=90, ha='right')
plt.show()

In [None]:
# Correlation matrix of the numeric columns.
# The numeric sub-frame is not stored under `numerical_columns`: earlier cells use
# that name for an Index of column names, and rebinding it to a DataFrame would
# break them if they were re-run afterwards.
# NOTE(review): this is computed on `df` (before outlier removal) while the plots
# above use `clean_df` — confirm which frame is intended.
df.select_dtypes(include=['float64', 'int64']).corr()

- The "year" and "odometer" columns have a strong negative correlation (-0.772497), which makes sense as older cars tend to have higher mileage.
- The "year" and "sellingprice" columns have a strong positive correlation (0.586484), indicating that newer cars tend to have higher selling prices.

- The "condition" column has positive correlations with other columns, but they are not as strong as some of the other correlations.

- The "mmr" and "sellingprice" columns have a very strong positive correlation (0.983635), suggesting a high degree of agreement between the MMR (Manheim Market Report) values and the selling prices.

# Feature Engineering

In [None]:
# Cast categorical columns to pandas categorical dtypes and return the frame.
def normalize(df, features_nom=None, ordered_levels=None):
    """Convert nominal and ordinal columns of ``df`` to pandas categorical dtypes.

    The original version read ``features_nom``, ``ordered_levels`` and
    ``CategoricalDtype`` from names that are never defined in this notebook, so
    any call raised ``NameError``. They are now optional arguments / a local import.

    Parameters:
    - df: DataFrame to convert. It is modified in place and also returned.
    - features_nom: iterable of column names to cast to unordered ``category``
      dtype. Defaults to no columns.
    - ordered_levels: mapping of column name -> list of levels (lowest first)
      to cast to an ordered categorical. Defaults to no columns.

    Returns:
    - df: the same DataFrame object with the requested columns converted.
    """
    # Imported here because the notebook does not import it at the top level.
    from pandas.api.types import CategoricalDtype

    features_nom = () if features_nom is None else features_nom
    ordered_levels = {} if ordered_levels is None else ordered_levels

    # Nominal categories
    for name in features_nom:
        df[name] = df[name].astype("category")
        # Add a "None" category so missing values can later be filled with it
        if "None" not in df[name].cat.categories:
            df[name] = df[name].cat.add_categories("None")
    # Ordinal categories
    for name, levels in ordered_levels.items():
        df[name] = df[name].astype(CategoricalDtype(levels,
                                                    ordered=True))
    return df

The numeric features are already encoded correctly (`float` for continuous, `int` for discrete), but the categoricals we'll need to do ourselves

In [None]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

def normalize_features(df, target_col='sellingprice'):
    """Min-Max scale the numeric features and label-encode the categorical ones.

    The target column is left untouched. Previously it fell into the
    "categorical" list and was label-encoded, which replaced the continuous
    price with an integer rank code. The follow-up "encode the target" step
    then re-applied that encoder and stored whichever encoder the loop had
    fitted last under the target's key.

    Parameters:
    - df: DataFrame containing the columns 'year', 'condition', 'odometer' and
      'mmr', plus any number of categorical columns. It is not modified.
    - target_col: name of the regression target to leave as-is
      (default 'sellingprice'). It is ignored if absent from ``df``.

    Returns:
    - normalized_df: copy of ``df`` with scaled numeric and encoded categorical columns.
    - scaler_dict: dict mapping each numeric column to its fitted MinMaxScaler.
    - label_encoders: dict mapping each categorical column to its fitted LabelEncoder.
      The target column has no entry because it is not encoded.
    """
    # Copy the original DataFrame to avoid modifying the input
    normalized_df = df.copy()

    # Numerical columns to be normalized
    numerical_cols = ['year', 'condition', 'odometer', 'mmr']

    # Categorical columns to be encoded: everything that is neither a numeric
    # feature nor the target.
    categorical_cols = [
        col for col in df.columns
        if col not in numerical_cols and col != target_col
    ]

    # Normalize numerical features using Min-Max Scaling (one scaler per column
    # so each can be inverted independently).
    # NOTE(review): scalers/encoders are fitted on the full frame before the
    # train/test split happens downstream — confirm this is acceptable.
    scaler_dict = {}
    for col in numerical_cols:
        scaler = MinMaxScaler()
        normalized_df[col] = scaler.fit_transform(df[[col]])
        scaler_dict[col] = scaler

    # Encode categorical features using Label Encoding
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        normalized_df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

    return normalized_df, scaler_dict, label_encoders


Normalize numerical features using Min-Max Scaling and encode categorical features.

    Parameters:
    - df: DataFrame, the input DataFrame containing both numerical and categorical features.

    Returns:
    - normalized_df: DataFrame, the normalized DataFrame.
    - scaler_dict: dict, dictionary containing MinMaxScaler objects for numerical columns.
    - label_encoders: dict, dictionary containing LabelEncoder objects for categorical columns.

In [None]:
# Scale/encode the cleaned frame and keep the fitted scalers and encoders for inspection below.
normalized_df, scaler_dict, label_encoders = normalize_features(clean_df)

In [None]:
# Preview the transformed frame.
normalized_df.head(5)

In [None]:
# Show the fitted scaler stored for each numeric column.
scaler_dict

In [None]:
# Show the fitted encoder stored for each encoded column.
label_encoders

To perform feature selection using all columns (both numerical and categorical) without explicitly specifying features, we can use techniques that handle both types of features. One such approach is to use tree-based models (like Random Forest), which work well with a mixture of numerical and encoded categorical features. The `feature_importances_` attribute of tree-based models can then be used for feature selection.

# Dataset Generation/Splitting

### Random Forest Model:

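- Random Forests are generally not sensitive to feature scales, so scaling the numerical features is not required for this model.
- The cell below nevertheless uses `normalized_df`, because the categorical columns must be numerically encoded before they can be passed to scikit-learn.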

In [None]:
from sklearn.model_selection import train_test_split

# Features: every column except the target and the columns excluded from modelling.
X_for_randomForest = normalized_df.drop(
    columns=['sellingprice', 'year', 'vin', 'seller', 'state']
)
# Target variable
y_for_randomForest = normalized_df['sellingprice']

# First split: hold out 20% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_for_randomForest, y_for_randomForest, test_size=0.2, random_state=42
)

# Second split: carve 20% of the remaining training rows off as a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

for caption, features, target in (
    ("Train set:", X_train, y_train),
    ("Validation set:", X_val, y_val),
    ("Test set:", X_test, y_test),
):
    print(caption, features.shape, target.shape)

In [None]:
# List the feature columns that go into the model.
X_for_randomForest.columns

## Random Forest Model Fitting

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


# Exhaustive grid search over Random Forest hyperparameters
# (2 * 3 * 2 * 2 = 24 combinations, each evaluated with 5-fold cross-validation).
rf_param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[30, 40, 50],
    min_samples_split=[8, 12],
    min_samples_leaf=[4, 5],
)

rf_model = RandomForestRegressor(random_state=42)
rf_grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,  # use all available cores
    verbose=1,
)
rf_grid_search.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best cross-validated score.
rf_grid_search.best_params_

In [None]:
# Get the best parameters and the corresponding fitted model.
best_rf_params = rf_grid_search.best_params_
# GridSearchCV (default refit=True) has already refitted the best configuration on
# all of X_train, using the random_state=42 of the base estimator. Reusing it avoids
# a second full training run and keeps the result reproducible; the previous
# `RandomForestRegressor(**best_rf_params)` dropped the seed.
best_rf_model = rf_grid_search.best_estimator_
best_rf_model

### Evaluating Random Forest Model

In [None]:
from sklearn.metrics import mean_squared_error

# Score the tuned model on the validation split.
y_val_pred_rf = best_rf_model.predict(X_val)
mse_rf = mean_squared_error(y_true=y_val, y_pred=y_val_pred_rf)
print(f'Random Forest Validation Mean Squared Error: {mse_rf}')

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Predict on the held-out test split.
y_pred_test = best_rf_model.predict(X_test)

# Evaluate the model's performance on the test data.
mse_test = mean_squared_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)

for caption, value in (
    ("Mean Squared Error (MSE) on Test Data:", mse_test),
    ("R-squared (R2) on Test Data:", r2_test),
    ("Mean Absolute Error (MAE) on Test Data:", mae_test),
):
    print(caption, value)


In [None]:
import joblib

# Persist the trained Random Forest to the working directory.
# NOTE(review): the "0.97" in the file name is hard-coded, not derived from the computed score.
model_filename = 'random_forest_model0.97.pkl'
joblib.dump(value=best_rf_model, filename=model_filename)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Actual vs predicted values on the test split.
pred_fig, pred_ax = plt.subplots(figsize=(10, 6))
pred_ax.scatter(y_test, y_pred_test, alpha=0.5)
pred_ax.set_title('Actual vs Predicted Values')
pred_ax.set_xlabel('Actual Values')
pred_ax.set_ylabel('Predicted Values')
pred_ax.grid(True)
plt.show()

# Residuals (actual minus predicted) against the actual values.
residuals = y_test - y_pred_test
resid_fig, resid_ax = plt.subplots(figsize=(10, 6))
resid_ax.scatter(y_test, residuals, alpha=0.5)
resid_ax.set_title('Residual Plot')
resid_ax.set_xlabel('Actual Values')
resid_ax.set_ylabel('Residuals (Actual - Predicted)')
# Dashed zero line marks a perfect prediction.
resid_ax.axhline(y=0, color='r', linestyle='--', linewidth=2)
resid_ax.grid(True)
plt.show()
