# COMP309 - a4
### Elliott Rose
### 300540768

## Part 1:

In [5]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.neural_network import MLPRegressor

In [6]:
# LOADING DATA
df = pd.read_csv('Part 1 - regression/diamonds.csv')

print(df.describe())
print(df.head())

# removing the pointless column
df = df.drop(columns = ['Unnamed: 0'])

# splitting the variables from the target variable
prices = df["price"]
data_x = df.drop(columns=["price"])

numerical_columns = data_x.select_dtypes(include=['number']).columns.to_list()
categorical_columns = data_x.select_dtypes(include=['object']).columns.to_list()

# imputing 0 values for dimensions
columns_to_impute = ['x', 'y', 'z']
for column in columns_to_impute:
    mean = df[column].mean()
    df[column] = df[column].replace(0, mean) 

# encoding categorical data using ordinal encoder for colour
color_encoder = OrdinalEncoder(categories=[['J', 'I', 'H', 'G', 'F', 'E', 'D']])
cut_encoder = OrdinalEncoder(categories=[['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']])
clarity_encoder = OrdinalEncoder(categories=[['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']])

encoded_colors = color_encoder.fit_transform(df[['color']])
encoded_cut = cut_encoder.fit_transform(df[['cut']])
encoded_clarity = clarity_encoder.fit_transform(df[['clarity']])

df['encoded_color'] = encoded_colors
df['encoded_cut'] = encoded_cut
df['encoded_clarity'] = encoded_clarity

# splitting
X_train, X_test, y_train, y_test = train_test_split(data_x, prices, test_size=0.3, random_state=309)

# normalizing
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numerical_columns])
X_test_scaled = scaler.transform(X_test[numerical_columns])

gbr = GradientBoostingRegressor()
gbr.fit(X_train_scaled, y_train)
pred = gbr.predict(X_test_scaled)
acc = mean_squared_error(pred, y_test)

         Unnamed: 0         carat         depth         table             x  \
count  53940.000000  53940.000000  53940.000000  53940.000000  53940.000000   
mean   26970.500000      0.797940     61.749405     57.457184      5.731157   
std    15571.281097      0.474011      1.432621      2.234491      1.121761   
min        1.000000      0.200000     43.000000     43.000000      0.000000   
25%    13485.750000      0.400000     61.000000     56.000000      4.710000   
50%    26970.500000      0.700000     61.800000     57.000000      5.700000   
75%    40455.250000      1.040000     62.500000     59.000000      6.540000   
max    53940.000000      5.010000     79.000000     95.000000     10.740000   

                  y             z         price  
count  53940.000000  53940.000000  53940.000000  
mean       5.734526      3.538734   3932.799722  
std        1.142135      0.705699   3989.439738  
min        0.000000      0.000000    326.000000  
25%        4.720000      2.910000    9

In [7]:
models = [
    LinearRegression(),
    KNeighborsRegressor(),
    Ridge(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    SGDRegressor(),
    SVR(),
    LinearSVR(),
    MLPRegressor()
]

model_scores = {
    'Model' : [],
    'MSE' : [],
    'RMSE' : [],
    'RSE' : [],
    'MAE' : [], 
    'RUN TIME' : []
}

for model in models:
    model.fit(X_train_scaled, y_train)
    startTime = time.time()
    pred = model.predict(X_test_scaled)
    mse = mean_squared_error(pred, y_test)
    rmse = np.sqrt(mse)
    rse = mse / np.sqrt(np.mean((y_test - np.mean(y_test))**2))
    mae = mean_absolute_error(pred, y_test)
    model_scores["Model"].append(str(model))
    model_scores["MSE"].append("{:.2f}".format(mse))
    model_scores["RMSE"].append("{:.2f}".format(rmse))
    model_scores["RSE"].append("{:.2f}".format(rse))
    model_scores["MAE"].append("{:.2f}".format(mae))
    model_scores["RUN TIME"].append(time.time() - startTime)

model_scores_df = pd.DataFrame(model_scores)
model_scores_df.to_csv('part1.csv', index=False)
model_scores_df



Unnamed: 0,Model,MSE,RMSE,RSE,MAE,RUN TIME
0,LinearRegression(),2335265.55,1528.16,579.83,895.14,0.004603
1,KNeighborsRegressor(),2207730.63,1485.84,548.17,831.48,0.185915
2,Ridge(),2335259.54,1528.16,579.83,895.33,0.0
3,DecisionTreeRegressor(),3613430.34,1900.9,897.19,1050.34,0.011345
4,RandomForestRegressor(),1990624.58,1410.89,494.26,792.2,0.517433
5,GradientBoostingRegressor(),1879375.87,1370.9,466.64,776.1,0.015626
6,SGDRegressor(),2347426.68,1532.13,582.85,883.99,0.015625
7,SVR(),7577653.59,2752.75,1881.48,1349.92,63.247901
8,LinearSVR(),3301263.8,1816.94,819.68,985.26,0.0
9,MLPRegressor(),2122725.74,1456.96,527.06,823.62,0.017392
