In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
data = pd.read_csv('dataset/diamonds_processed.csv')
data.head()

Unnamed: 0,clarity,price,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J,carat,depth,table,x,y,z
0,4,11053,0,0,1,0,0,1,0,0,0,0,0,0,0.85026,-0.524181,0.691311,0.94506,0.997975,0.878502
1,6,684,0,0,0,0,1,0,0,0,0,1,0,0,-1.049586,0.173808,0.691311,-1.309331,-1.286354,-1.25322
2,3,3921,0,0,0,1,0,0,0,0,0,1,0,0,0.238088,-1.501367,1.13856,0.463886,0.498278,0.285573
3,3,4043,0,0,1,0,0,0,0,1,0,0,0,0,0.005884,0.243607,-0.650436,0.178745,0.22166,0.229104
4,1,4751,0,0,1,0,0,0,1,0,0,0,0,0,0.449182,-1.780563,-0.650436,0.80249,0.765973,0.525568


In [3]:
from sklearn.linear_model import LinearRegression

def linear_model(x_train, y_train):
    print("Linear Regression")
    linear_reg = LinearRegression()
    linear_reg.fit(x_train, y_train)
    
    return linear_reg

In [4]:
from sklearn.linear_model import Lasso

def lasso_model(x_train, y_train):
    print('lasso regression')
    lasso_reg = Lasso(alpha= 0.8, max_iter= 10000)
    lasso_reg.fit(x_train, y_train)
    
    return lasso_reg

In [5]:
from sklearn.linear_model import Ridge

def ridge_model(x_train, y_train):
    print('ridge regression')
    ridge_reg = Ridge(alpha= 0.8)
    ridge_reg.fit(x_train, y_train)
    
    return ridge_reg

In [6]:
def build_train_model(data, target_name, reg_fn):
    X = data.drop(target_name, axis= 1)
    Y = data[target_name]
    
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size= 0.2, random_state = 0)
    
    model = reg_fn(x_train, y_train)
    
    score = model.score(x_train, y_train)
    print("training score is ", score)
    
    y_pred = model.predict(x_test)
    r_score = r2_score(y_test, y_pred)
    print("testing score is ", r_score)
    
    return {
        'model': model,
        'x-train': x_train, 'x_test': x_test,
        'y_train': y_train, 'y_test': y_test,
        'y_pred': y_pred
    }

In [7]:
linear_reg = build_train_model(data, 'price', linear_model)

Linear Regression
training score is  0.91209829166845
testing score is  0.9005423109537816


In [8]:
lasso_reg = build_train_model(data, 'price', lasso_model)

lasso regression
training score is  0.9120807661922641
testing score is  0.9006823122790389


In [9]:
ridge_reg = build_train_model(data, 'price', ridge_model)

ridge regression
training score is  0.9120982501674167
testing score is  0.900556306035974


In [10]:
linear_reg['model']

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
from sklearn.linear_model import SGDRegressor

def sgd_model(x_train, y_train):
    
    print("SGD Regression")
    sgd_reg = SGDRegressor(max_iter = 2000)
    sgd_reg.fit(x_train, y_train)
    
    return sgd_reg

In [12]:
sgd_reg = build_train_model(data, 'price', sgd_model)

SGD Regression
training score is  0.9119634780204797
testing score is  0.9003446887426528
