# EXERCISE 1

## LET'S IMPORT THE NECESSARY LIBRARIES

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the car sales dataset from its CSV file into a DataFrame.
csv_path = "C:\\Users/DELL/Desktop/summative_assignment/car_sales_data.csv"
dataset = pd.read_csv(csv_path)

In [None]:
dataset.head(n=6)  # preview the first 6 rows of the dataset

In [None]:
# Summarise the columns: names, non-null counts and dtypes.
dataset.info()

In [None]:
dataset.isnull().sum()  # number of missing cells in each column

In [None]:
dataset.nunique()  # number of distinct values in each column

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
data_set = dataset.drop_duplicates(keep="first")

In [None]:
data_set.describe()    # descriptive statistics for the de-duplicated dataset

In [None]:
len(data_set)  # number of rows left after removing duplicates

## GOAL (A)
Compare regression models that predict the price of a car based on a single 
numerical input feature. Based on your results, which numerical variable in the 
dataset is the best predictor for a car’s price, and why? For each numerical input 
feature, is the price better fit by a linear model or by a non-linear (e.g. polynomial) 
model?

In [None]:

# Numerical columns that will each be tried as the single predictor
numerical_variables = ["Engine size", "Year of manufacture", "Mileage"]

# Target variable (y) and candidate input features (X)
y = data_set["Price"]
X = data_set[numerical_variables]

# Test-set MSE per feature and model type, filled in by train_and_evaluate_model
mse_results = dict()

# Function to train and evaluate linear and polynomial regression models
def train_and_evaluate_model(feature):
    """Fit a linear and a degree-2 polynomial model on one feature and plot both.

    Reads the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``
    (they must be set by the caller before each call), stores the two test-set
    mean squared errors in the module-level ``mse_results[feature]`` under the
    keys ``"Linear"`` and ``"Polynomial"``, then saves and shows a figure of
    the test points together with both fitted curves.

    Parameters
    ----------
    feature : str
        Name of the single column used as the model input.
    """
    train_inputs = X_train[[feature]]
    test_inputs = X_test[[feature]]

    # Linear Regression
    model_linear = LinearRegression()
    model_linear.fit(train_inputs, y_train)
    y_pred_linear = model_linear.predict(test_inputs)

    # Polynomial Regression (degree=2)
    model_poly = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
    model_poly.fit(train_inputs, y_train)
    y_pred_poly = model_poly.predict(test_inputs)

    # Store the test-set Mean Squared Error of both models
    mse_results[feature] = {
        "Linear": mean_squared_error(y_test, y_pred_linear),
        "Polynomial": mean_squared_error(y_test, y_pred_poly),
    }

    # Draw the fitted curves in increasing feature order; plotting them in the
    # (shuffled) test-row order makes the lines zig-zag back and forth.
    x_values = test_inputs[feature].to_numpy()
    order = np.argsort(x_values, kind="stable")
    x_sorted = x_values[order]

    # Visualize the results
    plt.scatter(x_values, y_test, color='black', label='Actual', s=5)
    plt.plot(x_sorted, np.asarray(y_pred_linear)[order], color='blue', linewidth=1,
             label='Linear Regression')
    plt.plot(x_sorted, np.asarray(y_pred_poly)[order], color='red', linewidth=1,
             label='Polynomial Regression (degree=2)')
    plt.title(f"{feature} vs Price")
    plt.xlabel(feature)
    plt.ylabel("Price")
    plt.legend()
    # Save only once the figure is complete, and under a per-feature name so
    # that successive calls do not overwrite each other's image.
    plt.savefig(f"Featu_{feature}.jpeg")
    plt.show()



In [None]:
# Train and evaluate models for each numerical variable.
# train_and_evaluate_model reads the split from these module-level names,
# so they are reassigned before every call.
for feature in numerical_variables:
    X_train, X_test, y_train, y_test = train_test_split(X[[feature]], y, test_size=0.2, random_state=42)
    train_and_evaluate_model(feature)

# Identify the best predictor: the feature whose best model (linear OR
# polynomial) reaches the lowest test MSE. Ranking on the linear error alone
# would ignore a feature that is only predictive through a non-linear fit.
best_feature = min(mse_results, key=lambda name: min(mse_results[name].values()))
print(f"The best predictor for a car's price is '{best_feature}'.")

# Analyze the best fit model for each numerical feature
for feature, mse_dict in mse_results.items():
    best_model = min(mse_dict, key=mse_dict.get)
    print(f"For {feature}, the price is better fit by a {best_model} model.")


## GOAL B
Consider regression models that take multiple numerical variables as input features 
to predict the price of a car. Does the inclusion of multiple input features improve 
the accuracy of the model’s prediction compared to the single-input feature models 
that you explored in part (a)?


In [None]:
# Split the data into training and testing sets (20% held out, fixed seed).
# The train_and_evaluate_model defined below selects its columns from these
# module-level frames, so one split serves every feature subset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to train and evaluate linear and polynomial regression models
def train_and_evaluate_model(features):
    """Return the test MSE of a linear and a degree-2 polynomial model.

    Both models are fitted on the columns ``features`` of the module-level
    ``X_train`` / ``y_train`` and scored on ``X_test`` / ``y_test``.

    Parameters
    ----------
    features : list of str
        Column names used as model inputs.

    Returns
    -------
    tuple
        ``(mse_linear, mse_poly)``.
    """
    train_inputs, test_inputs = X_train[features], X_test[features]

    # Same two estimators as before: plain linear, then degree-2 polynomial
    estimators = (
        LinearRegression(),
        make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
    )

    # fit() returns the estimator itself, so fitting and predicting can be chained
    mse_linear, mse_poly = (
        mean_squared_error(y_test, estimator.fit(train_inputs, y_train).predict(test_inputs))
        for estimator in estimators
    )
    return mse_linear, mse_poly

# One model pair per numerical variable on its own
mse_results_single = {}
for feature in numerical_variables:
    mse_linear, mse_poly = train_and_evaluate_model([feature])
    mse_results_single[feature] = dict(Linear=mse_linear, Polynomial=mse_poly)

# One model pair using every numerical variable together
mse_linear, mse_poly = train_and_evaluate_model(numerical_variables)
mse_results_multiple = {
    "All Numerical Variables": {"Linear": mse_linear, "Polynomial": mse_poly},
}




### Compare MSE results for single input features

In [None]:

# Report the test MSE of both model types for every single-feature model
print("MSE Results for Single Input Features:")
for feature, mse_dict in mse_results_single.items():
    print(f"{feature}:")
    for model_type in ("Linear", "Polynomial"):
        print(f"  {model_type}: {mse_dict[model_type]}")
    print()


### Compare MSE results for multiple input features

In [None]:
# Report the test MSE of both model types for the multi-feature model
print("MSE Results for Multiple Input Features:")
for features, mse_dict in mse_results_multiple.items():
    # The trailing "\n" reproduces the blank separator line after each entry
    print(
        f"{features}:\n"
        f"  Linear: {mse_dict['Linear']}\n"
        f"  Polynomial: {mse_dict['Polynomial']}\n"
    )


## GOAL (C)
In parts (a) and (b) you only considered models that use the numerical variables from 
the dataset as inputs. However, there are also several categorical variables in the
dataset that are likely to affect the price of the car. Now train a regression model 
that uses all relevant input variables (both categorical and numerical) to predict the 
price (e.g. a Random Forest Regressor model). Does this improve the accuracy of 
your results? 

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Target variable and feature matrix (every column except the target)
y = data_set["Price"]
X = data_set.drop(columns="Price")

# Column groups, selected by dtype
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

# Numerical columns are standardised
numerical_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns are one-hot encoded; categories unseen at fit time are ignored
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

### Combine transformers using ColumnTransformer

In [None]:
# Route each column group through its own transformer
column_steps = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Full model: preprocessing followed by a random forest regressor
forest = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', forest)])

In [None]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows, then predict the held-out rows
# (fit() returns the pipeline itself, so the calls can be chained)
y_pred = model.fit(X_train, y_train).predict(X_test)

# Test-set mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


## GOAL (D)
Develop an Artificial Neural Network (ANN) model to predict the price of a car based 
on all the available information from the dataset. How does its performance 
compare to the other supervised learning models that you have considered? Discuss 
your choices for the architecture of the neural network that you used, and describe 
how you tuned the hyperparameters in your model to achieve the best performance.

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError

# Drop the non-numeric columns AND the target variable (Price).
# Leaving 'Price' among the inputs would leak the answer into the network and
# make its validation error meaninglessly small.
X = data_set.drop(columns=['Manufacturer', 'Model', 'Fuel type', 'Price'])
y = data_set['Price']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)





### standardise the model data and build neural network model

In [None]:
# Standardize the inputs: statistics come from the training rows only and
# are then applied unchanged to the validation rows
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# Feed-forward regression network: two ReLU hidden layers, one linear output
n_inputs = X_train.shape[1]
model = Sequential()
for hidden_layer in (
    Dense(64, activation='relu', input_shape=(n_inputs,)),
    Dense(32, activation='relu'),
):
    model.add(hidden_layer)
model.add(Dense(1, activation='linear'))  # linear activation for regression

### compile, train and evaluate the model

In [None]:
# Compile with Adam and a mean-squared-error loss
optimizer = Adam(learning_rate=0.001)
model.compile(loss=MeanSquaredError(), optimizer=optimizer)

# Train for 5 epochs, monitoring the validation set after each one
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_val, y_val),
)

# Final loss on the validation set
loss = model.evaluate(X_val, y_val)
print(f'Validation Loss: {loss}')


## comparing with other Supervised learning models

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 'model' is the neural network trained above; predict the validation rows
y_val_pred = model.predict(X_val)

# Mean squared error, mean absolute error and R-squared, in that order
mse, mae, r2 = (
    metric(y_val, y_val_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f'MSE: {mse}, MAE: {mae}, R-squared: {r2}')


## GOAL E
Based on the results of your analysis, what is the best model for predicting the price 
of a car and why? You should use suitable figures and evaluation metrics to support 
your conclusions. 

In [None]:

# Actual (y_val) against predicted (y_val_pred) prices, both flattened to 1-D
actual_prices = y_val.values.flatten()
predicted_prices = y_val_pred.flatten()

plt.figure(figsize=(10, 6))
sns.scatterplot(x=actual_prices, y=predicted_prices, s=5)
plt.title('Actual vs. Predicted Prices')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.savefig("Actual vs. Predicted Prices.jpeg")
plt.show()


## Evaluation metric

In [None]:
from sklearn.neural_network import MLPRegressor  # using scikit-learn for the neural network
# X and y are the features and target variable defined earlier.
# The target must never be among the inputs, otherwise every model just reads
# the answer back; drop it defensively (a no-op when 'Price' is already absent).
X_model = X.drop(columns=["Price"], errors="ignore")
X_train, X_val, y_train, y_val = train_test_split(X_model, y, test_size=0.2, random_state=42)

# Step 1: Define and Train Neural Network.
# MLPRegressor is sensitive to feature scale, so the inputs are standardised
# inside the pipeline (statistics are learned from the training rows only).
model_nn = make_pipeline(
    StandardScaler(),
    MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42),
)
model_nn.fit(X_train, y_train)

# Step 2: Define and Train Linear Regression
model_linear = LinearRegression()
model_linear.fit(X_train, y_train)

# Step 3: Define and Train Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


### Step 4: Make Predictions

In [None]:
# Validation-set predictions of the three fitted models, in the order
# neural network, linear regression, random forest
y_val_pred_nn, y_val_pred_linear, y_val_pred_rf = (
    fitted_model.predict(X_val)
    for fitted_model in (model_nn, model_linear, rf_model)
)

### Step 5: Evaluate Models

In [None]:
def _validation_metrics(predictions):
    """Return (MSE, MAE, R-squared) of ``predictions`` against ``y_val``."""
    return (
        mean_squared_error(y_val, predictions),
        mean_absolute_error(y_val, predictions),
        r2_score(y_val, predictions),
    )


# Metrics for the neural network
mse_nn, mae_nn, r2_nn = _validation_metrics(y_val_pred_nn)

# Metrics for linear regression
mse_linear, mae_linear, r2_linear = _validation_metrics(y_val_pred_linear)

# Metrics for the random forest
mse_rf, mae_rf, r2_rf = _validation_metrics(y_val_pred_rf)



### Step 6: Create the Comparison Table

In [None]:
# One row per model, one column per metric (columns keep insertion order)
data = {'Model': ['Neural Network', 'Linear Regression', 'Random Forest']}
data['MSE'] = [mse_nn, mse_linear, mse_rf]
data['MAE'] = [mae_nn, mae_linear, mae_rf]
data['R-squared'] = [r2_nn, r2_linear, r2_rf]

comparison_table = pd.DataFrame.from_dict(data)

print(comparison_table)


## GOAL F
Use the k-Means clustering algorithm to identify clusters in the car sales data. 
Consider different combinations of the numerical variables in the dataset to use as 
input features for the clustering algorithm. In each case, what is the optimal number 
of clusters (k) to use and why? Which combination of variables produces the best 
clustering results? Use appropriate evaluation metrics to support your conclusions. 

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from itertools import combinations

# Numerical columns considered as clustering inputs
numerical_features = ['Engine size', 'Year of manufacture', 'Mileage', 'Price']

# Candidate numbers of clusters
k_range = range(2, 5)

# Best result seen so far across every feature subset and every k
best_silhouette_score = -1
best_feature_combination = None
best_k = None

# Try every non-empty subset of the numerical columns
for num_features in range(1, len(numerical_features) + 1):
    for features in combinations(numerical_features, num_features):
        # Standardize the selected columns
        scaler = StandardScaler()
        X_scaled_subset = scaler.fit_transform(data_set[list(features)])

        # Silhouette score obtained for each candidate k
        score_by_k = {}
        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            labels = kmeans.fit_predict(X_scaled_subset)
            silhouette_avg = silhouette_score(X_scaled_subset, labels)
            score_by_k[k] = silhouette_avg
            print(f"Silhouette Score for k={k} with features {features}: {silhouette_avg}")

        # Best k for this subset (ties go to the smallest k, as max() keeps
        # the first maximal key in insertion order)
        candidate_k = max(score_by_k, key=score_by_k.get)

        # Keep this subset only if it strictly beats the best so far
        if score_by_k[candidate_k] > best_silhouette_score:
            best_silhouette_score = score_by_k[candidate_k]
            best_feature_combination = features
            best_k = candidate_k

In [None]:
# Report the winning feature subset and its number of clusters
print(
    f"Best combination of variables: {best_feature_combination} with Silhouette Score: {best_silhouette_score}",
    f"Optimal number of clusters (k): {best_k}",
    sep="\n",
)


## GOAL G

Compare the results of the k-Means clustering model from part (f) to at least one 
other clustering algorithm. Which algorithm produces the best clustering? Use 
suitable evaluation metrics to justify your answer.

In [None]:

import pandas as pd
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Numerical columns of data_set used for the clustering comparison
selected_features = ["Engine size", "Year of manufacture", "Mileage", "Price"]
X_numerical = data_set[selected_features]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numerical)

# Cluster a fixed-seed random sample of rows rather than the full dataset
sample_size = 500  # Adjust as needed
scaled_frame = pd.DataFrame(X_scaled)
X_scaled_sample = scaled_frame.sample(n=sample_size, random_state=42)



### Apply k-Means clustering

In [None]:
chosen_k_kmeans = 3  # Adjust the number of clusters as needed
kmeans = KMeans(n_clusters=chosen_k_kmeans, n_init="auto", random_state=42)
# Fit once and read the training labels back from the fitted estimator
kmeans_labels = kmeans.fit(X_scaled_sample).labels_
silhouette_kmeans = silhouette_score(X_scaled_sample, kmeans_labels)

### Apply MiniBatchKMeans clustering

In [None]:
chosen_k_minibatch = 3  # Adjust the number of clusters as needed
minibatch_kmeans = MiniBatchKMeans(
    n_clusters=chosen_k_minibatch,
    n_init="auto",
    random_state=42,
)
# Fit once and read the training labels back from the fitted estimator
minibatch_kmeans_labels = minibatch_kmeans.fit(X_scaled_sample).labels_
silhouette_minibatch_kmeans = silhouette_score(X_scaled_sample, minibatch_kmeans_labels)

### Visualize the results using PCA (for two principal components)

In [None]:
# Project the sampled, standardised features onto their first two principal
# components so the clusters can be drawn in 2-D.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled_sample)

### Scatter plot for k-Means clustering

In [None]:
# Left-hand panel of a 1x2 figure: k-Means labels in the PCA plane
marker_style = dict(cmap='viridis', edgecolors='k', s=50)
first_component, second_component = X_pca[:, 0], X_pca[:, 1]

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.scatter(first_component, second_component, c=kmeans_labels, **marker_style)
plt.title('k-Means Clustering')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
# NOTE(review): saved before the right-hand panel is drawn - confirm intended
plt.savefig("k-means Clustering.jpeg")

### Scatter plot for MiniBatchKMeans clustering

In [None]:
# Right-hand panel of the 1x2 layout: MiniBatchKMeans labels in the PCA plane
# NOTE(review): presumably relies on the figure opened for the k-Means panel
# still being the current figure - confirm when run as separate notebook cells
marker_style = dict(cmap='viridis', edgecolors='k', s=50)
first_component, second_component = X_pca[:, 0], X_pca[:, 1]

plt.subplot(1, 2, 2)
plt.scatter(first_component, second_component, c=minibatch_kmeans_labels, **marker_style)
plt.title('MiniBatchKMeans Clustering')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')

plt.tight_layout()
plt.savefig("minibatch k-means clustering")
plt.show()

### Compare the silhouette scores

In [None]:
# Print the silhouette score of each algorithm, k-Means first
for algorithm, score in (
    ("k-Means", silhouette_kmeans),
    ("MiniBatchKMeans", silhouette_minibatch_kmeans),
):
    print(f"Silhouette Score for {algorithm} Clustering: {score}")