In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Global plot styling for the notebook.
sns.set_style('whitegrid')
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, so 'seaborn' alone fails there.
# Prefer the new name when it exists and fall back to the legacy one.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
%matplotlib inline

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the raw CSV from the working directory and show (rows, columns).
csv_path = 'cubic_zirconia.csv'
df = pd.read_csv(csv_path)
df.shape

In [None]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

### Processing

In [None]:
# Drop the leftover CSV index column. `errors='ignore'` keeps this cell
# re-runnable: the original in-place drop raised KeyError on a second run.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Map the grade labels to integers (higher = better grade). The replacement is
# applied frame-wide, so any cell holding one of these labels is converted.
# NOTE(review): presumably these labels only occur in the cut-grade column - confirm.
df = df.replace({'Ideal':4,'Premium':3,'Very Good':2,'Good':1,'Fair':0})

# Integer-encode the remaining categorical columns. LabelEncoder assigns codes
# in sorted label order; the encoder is re-fitted per column, so after this
# cell `e` only remembers the classes of 'clarity'.
e = LabelEncoder()
df['color'] = e.fit_transform(df['color'])
df['clarity'] = e.fit_transform(df['clarity'])

In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

In [None]:
# Fill missing depth values with the column median, store the result as
# 'depth_median', and remove the original column.
# The guard makes the cell idempotent: once 'depth' has been dropped, a re-run
# is a no-op instead of raising KeyError.
if 'depth' in df.columns:
    df['depth_median'] = df['depth'].fillna(df['depth'].median())
    df = df.drop(columns='depth')

In [None]:
# Inspect the first five rows after encoding and imputation.
df.head(n=5)

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
plt.figure(figsize=(12, 10))
plt.title('Correlation between variables')
sns.heatmap(data=corr_matrix, annot=True, cmap="bone_r")

### Prediction

In [None]:
# Features are every column except the target; the target is kept as a
# one-column DataFrame.
x = df.drop(columns=['price'])
y = df[['price']]

# 80/20 split with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Report the shape of each partition.
for caption, part in (
    ("The shape of x_train is      ", x_train),
    ("The shape of x_test is       ", x_test),
    ("The shape of y_train is      ", y_train),
    ("The shape of y_test is       ", y_test),
):
    print(caption, part.shape)

### Linear Regression

In [None]:
# Fit ordinary least squares on the training partition (fit returns the estimator).
regressor = LinearRegression().fit(x_train, y_train)

# Predict the held-out prices.
y_pred = regressor.predict(x_test)

# Report each metric in turn on the held-out partition.
for label, score_fn in (
    ('Mean Absolute Error:', metrics.mean_absolute_error),
    ('Mean Squared Error:', metrics.mean_squared_error),
    ('R2 score:', metrics.r2_score),
):
    print(label, score_fn(y_test, y_pred))

### Decision Tree

In [None]:
# Decision tree with default settings.
# random_state pins the estimator's internal randomness (features are permuted
# at each split), so the reported metrics are identical on every run. 42 matches
# the seed already used for the train/test split.
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(x_train, y_train)

# Predict the held-out prices.
y_pred = regressor.predict(x_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('R2 score:', metrics.r2_score(y_test, y_pred))

In [None]:
# Decision tree using the Poisson deviance split criterion.
# random_state makes the fit (and therefore the metrics below) reproducible;
# 42 matches the seed already used for the train/test split.
regressor = DecisionTreeRegressor(criterion = 'poisson', random_state=42)
regressor.fit(x_train, y_train)

# Predict the held-out prices.
y_pred = regressor.predict(x_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('R2 score:', metrics.r2_score(y_test, y_pred))

In [None]:
# Decision tree that picks split thresholds at random (splitter='random').
# Without a seed this variant gives different metrics on every run, so the
# randomness is pinned; 42 matches the seed used for the train/test split.
regressor = DecisionTreeRegressor(splitter = 'random', random_state=42)
regressor.fit(x_train, y_train)

# Predict the held-out prices.
y_pred = regressor.predict(x_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('R2 score:', metrics.r2_score(y_test, y_pred))

### Cap outliers and repredict

In [None]:
# Distribution of carat before capping. The Series is passed as `x=` because
# the meaning of the first positional argument differs between seaborn
# releases; the keyword fixes a horizontal box on every version.
sns.boxplot(x=df['carat'])

In [None]:
# IQR fences for carat: 1.5 * IQR below the first and above the third quartile.
q1, q3 = df['carat'].quantile(0.25), df['carat'].quantile(0.75)
IQR = q3 - q1
lower_bridge = q1 - (IQR * 1.5)
upper_bridge = q3 + (IQR * 1.5)
print(lower_bridge, upper_bridge)

In [None]:
# Cap carat at fixed limits: values at or beyond a limit are set to that limit.
# NOTE(review): the limits are hard-coded; presumably copied from the fences
# printed by the preceding cell - confirm they still match the data.
carat_floor, carat_cap = -0.57, 2.025
df.loc[df['carat'] >= carat_cap, 'carat'] = carat_cap
df.loc[df['carat'] <= carat_floor, 'carat'] = carat_floor

# Re-plot to check the capped distribution. `x=` keeps the box horizontal on
# every seaborn version instead of relying on the first positional argument.
sns.boxplot(x=df['carat'])

In [None]:
# Distribution of table before capping. The Series is passed as `x=` because
# the meaning of the first positional argument differs between seaborn
# releases; the keyword fixes a horizontal box on every version.
sns.boxplot(x=df['table'])

In [None]:
# IQR fences for table: 1.5 * IQR below the first and above the third quartile.
q1, q3 = df['table'].quantile(0.25), df['table'].quantile(0.75)
IQR = q3 - q1
lower_bridge = q1 - (IQR * 1.5)
upper_bridge = q3 + (IQR * 1.5)
print(lower_bridge, upper_bridge)

In [None]:
# Cap table at fixed limits: values at or beyond a limit are set to that limit.
# NOTE(review): the limits are hard-coded; presumably copied from the fences
# printed by the preceding cell - confirm they still match the data.
table_floor, table_cap = 51.5, 63.5
df.loc[df['table'] >= table_cap, 'table'] = table_cap
df.loc[df['table'] <= table_floor, 'table'] = table_floor

# Re-plot to check the capped distribution. `x=` keeps the box horizontal on
# every seaborn version instead of relying on the first positional argument.
sns.boxplot(x=df['table'])

In [None]:
# Distribution of the x column before capping. The Series is passed as `x=`
# because the meaning of the first positional argument differs between seaborn
# releases; the keyword fixes a horizontal box on every version.
sns.boxplot(x=df['x'])

In [None]:
# IQR fences for the x column: 1.5 * IQR below the first and above the third quartile.
q1, q3 = df['x'].quantile(0.25), df['x'].quantile(0.75)
IQR = q3 - q1
lower_bridge = q1 - (IQR * 1.5)
upper_bridge = q3 + (IQR * 1.5)
print(lower_bridge, upper_bridge)

In [None]:
# Cap the x column at fixed limits: values at or beyond a limit are set to it.
# NOTE(review): the limits are hard-coded; presumably copied (possibly rounded)
# from the fences printed by the preceding cell - confirm they still match.
x_floor, x_cap = 1.95, 9.30
df.loc[df['x'] >= x_cap, 'x'] = x_cap
df.loc[df['x'] <= x_floor, 'x'] = x_floor

# Re-plot to check the capped distribution. `x=` keeps the box horizontal on
# every seaborn version instead of relying on the first positional argument.
sns.boxplot(x=df['x'])

In [None]:
# Distribution of the y column before capping. The Series is passed as `x=`
# (the plotting axis, not the column name) because the meaning of the first
# positional argument differs between seaborn releases.
sns.boxplot(x=df['y'])

In [None]:
# IQR fences for the y column: 1.5 * IQR below the first and above the third quartile.
q1, q3 = df['y'].quantile(0.25), df['y'].quantile(0.75)
IQR = q3 - q1
lower_bridge = q1 - (IQR * 1.5)
upper_bridge = q3 + (IQR * 1.5)
print(lower_bridge, upper_bridge)

In [None]:
# Cap the y column at fixed limits: values at or beyond a limit are set to it.
# NOTE(review): the limits are hard-coded; presumably copied from the fences
# printed by the preceding cell - confirm they still match the data.
y_floor, y_cap = 1.964, 9.285
df.loc[df['y'] >= y_cap, 'y'] = y_cap
df.loc[df['y'] <= y_floor, 'y'] = y_floor

# Re-plot to check the capped distribution. `x=` is the plotting axis here; it
# keeps the box horizontal on every seaborn version.
sns.boxplot(x=df['y'])

In [None]:
# Distribution of the z column before capping. The Series is passed as `x=`
# (the plotting axis, not the column name) because the meaning of the first
# positional argument differs between seaborn releases.
sns.boxplot(x=df['z'])

In [None]:
# IQR fences for the z column: 1.5 * IQR below the first and above the third quartile.
q1, q3 = df['z'].quantile(0.25), df['z'].quantile(0.75)
IQR = q3 - q1
lower_bridge = q1 - (IQR * 1.5)
upper_bridge = q3 + (IQR * 1.5)
print(lower_bridge, upper_bridge)

In [None]:
# Cap the z column at fixed limits: values at or beyond a limit are set to it.
# NOTE(review): the limits are hard-coded; presumably copied (possibly rounded)
# from the fences printed by the preceding cell - confirm they still match.
z_floor, z_cap = 1.18, 5.75
df.loc[df['z'] >= z_cap, 'z'] = z_cap
df.loc[df['z'] <= z_floor, 'z'] = z_floor

# Re-plot to check the capped distribution. `x=` is the plotting axis here; it
# keeps the box horizontal on every seaborn version.
sns.boxplot(x=df['z'])

In [None]:
# Distribution of depth_median before capping. The Series is passed as `x=`
# because the meaning of the first positional argument differs between seaborn
# releases; the keyword fixes a horizontal box on every version.
sns.boxplot(x=df['depth_median'])

In [None]:
# IQR fences for depth_median: 1.5 * IQR below the first and above the third quartile.
q1, q3 = df['depth_median'].quantile(0.25), df['depth_median'].quantile(0.75)
IQR = q3 - q1
lower_bridge = q1 - (IQR * 1.5)
upper_bridge = q3 + (IQR * 1.5)
print(lower_bridge, upper_bridge)

In [None]:
# Cap depth_median at fixed limits: values at or beyond a limit are set to it.
# NOTE(review): the limits are hard-coded; presumably copied from the fences
# printed by the preceding cell - confirm they still match the data.
depth_floor, depth_cap = 59.0, 64.6
df.loc[df['depth_median'] >= depth_cap, 'depth_median'] = depth_cap
df.loc[df['depth_median'] <= depth_floor, 'depth_median'] = depth_floor

# Re-plot to check the capped distribution. `x=` keeps the box horizontal on
# every seaborn version instead of relying on the first positional argument.
sns.boxplot(x=df['depth_median'])

In [None]:
# Rebuild features/target from the capped frame and redo the same seeded
# 80/20 split used earlier.
x = df.drop(columns=['price'])
y = df[['price']]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Report the shape of each partition.
partitions = (
    ("The shape of x_train is      ", x_train),
    ("The shape of x_test is       ", x_test),
    ("The shape of y_train is      ", y_train),
    ("The shape of y_test is       ", y_test),
)
for caption, part in partitions:
    print(caption, part.shape)

### Linear regression reprediction

In [None]:
# Re-fit ordinary least squares on the capped training data.
regressor = LinearRegression()
regressor.fit(X=x_train, y=y_train)

# Predict the held-out prices.
y_pred = regressor.predict(x_test)

# Same three metrics as the earlier runs, evaluated on the held-out partition.
mae = metrics.mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)
r2 = metrics.r2_score(y_test, y_pred)
print('R2 score:', r2)