# **Data description**

**price**    price in US dollars (\$326--\$18,823)

**carat**    weight of the diamond (0.2--5.01)

**cut**      quality of the cut (Fair, Good, Very Good, Premium, Ideal)

**color**    diamond color, from J (worst) to D (best)

**clarity**   a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

**x**        length in mm (0--10.74)

**y**        width in mm (0--58.9)

**z**        depth in mm (0--31.8)

**depth**    total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)

**table**    width of the top of diamond relative to widest point (43--95)

# **Importing required packages**

In [1]:
# Suppress warning messages
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import the packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.compose import ColumnTransformer

In [3]:
#Reading the data into python
data_train = pd.read_csv('../input/shai-club/train.csv')
data_test = pd.read_csv('../input/shai-club/test.csv')

In [4]:
# Keep an independent copy of the raw test table (it still holds 'Id') for the
# submission file, so later edits to data_test can never leak into it.
submit_data = data_test.copy()

In [5]:
print(len(data_train) , len(data_test))


In [6]:
data_train.head()

In [7]:
data_test.head()

#  **Exploratory data analysis**

In [8]:
#get information about the data
data_train.info()

In [9]:
#checking the missing value for the training data
data_train.isna().sum()

In [10]:
#checking the missing value for the testing data 
data_test.isna().sum()

In [11]:
# Looking at the descriptive statistics of the data
data_train.describe(include='all')

# **Data Visualization** 

In [12]:

ax = sns.pairplot(data_train, hue= "cut")

In [13]:
# Remove the 'Id' column from both tables before plotting and modelling.
data_train = data_train.drop(columns=['Id'])
data_test = data_test.drop(columns=['Id'])

In [14]:
data_train.hist(figsize = (20,15),bins = 50)
plt.show()

In [15]:
# Split the column names by dtype: numeric columns (kept as a pandas Index)
# and all remaining columns (converted to a plain list).
# NOTE(review): the target 'price' is presumably numeric and therefore part of
# linear_vars — confirm before using linear_vars as a pure feature list.
linear_vars = data_train.select_dtypes(include=[np.number]).columns
cat_attribs = list(data_train.select_dtypes(exclude=[np.number]).columns)

In [16]:
print(linear_vars)
print(cat_attribs)

In [17]:

# One count plot per categorical column, laid out side by side.
# (Only the subplots figure is created; an extra plt.figure() would just
# render as an empty figure.)
fig, ax = plt.subplots(1, 3, figsize=(20, 8))
for variable, subplot in zip(cat_attribs, ax.flatten()):
    # Pass the column as x= explicitly: seaborn >= 0.12 treats the first
    # positional argument as `data`, not as the x variable.
    sns.countplot(x=data_train[variable], ax=subplot)
    # Rotate the category labels so long names do not overlap.
    subplot.tick_params(axis="x", labelrotation=90)

In [18]:
def plot_boxplot(df, ft):
    """Draw a horizontal box plot of column ``ft`` of ``df``.

    Args:
        df: DataFrame that holds the column.
        ft: Name of the column to plot.
    """
    # x= is explicit because seaborn >= 0.12 treats the first positional
    # argument as ``data`` rather than as the x variable.
    sns.boxplot(x=df[ft])
    

In [19]:
# Scatter plot of every numeric column against price.
# (Only the subplots figure is created; an extra plt.figure() would just
# render as an empty figure.)
fig, ax = plt.subplots(4, 2, figsize=(20, 20))
for variable, subplot in zip(linear_vars, ax.flatten()):
    sns.scatterplot(x=data_train[variable], y=data_train['price'], ax=subplot)
    for label in subplot.get_xticklabels():
        label.set_rotation(90)

In [20]:
# Box plot of every numeric column, used to eyeball outliers.
# (Only the subplots figure is created; an extra plt.figure() would just
# render as an empty figure.)
fig, ax = plt.subplots(4, 2, figsize=(20, 20))
for variable, subplot in zip(linear_vars, ax.flatten()):
    sns.boxplot(x=data_train[variable], ax=subplot)
    for label in subplot.get_xticklabels():
        label.set_rotation(90)

# **Remove outliers**

In [21]:
#define function to remove outliers
def outliers(df, ft, factor=1.5):
    """Return the index labels of rows whose ``ft`` value is an IQR outlier.

    A value is flagged when it lies strictly below ``Q1 - factor * IQR`` or
    strictly above ``Q3 + factor * IQR``. Values exactly on a fence and
    missing values are not flagged.

    Args:
        df: DataFrame to inspect.
        ft: Name of the numeric column to test.
        factor: Multiplier applied to the IQR to place the fences
            (1.5 is the usual Tukey rule; larger values flag fewer rows).

    Returns:
        A pandas Index with the labels of the flagged rows (empty if none).
    """
    column = df[ft]
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1

    lower = q1 - factor * iqr
    upper = q3 + factor * iqr

    return df.index[(column < lower) | (column > upper)]
    


In [22]:
# Gather the index labels of every outlier row, one feature at a time.
# A row flagged by several features appears several times in the list.
index_list = [
    row_label
    for feature in linear_vars
    for row_label in outliers(data_train, feature)
]

In [23]:
len(index_list)

In [24]:
def remove(df, ls):
    """Return a copy of ``df`` without the rows labelled in ``ls``.

    Duplicate labels in ``ls`` are collapsed (and sorted) before dropping;
    the DataFrame passed in is left untouched.
    """
    unique_labels = sorted(set(ls))
    return df.drop(index=unique_labels)

In [25]:
data_train= remove(data_train , index_list)

In [26]:
data_train.shape

In [27]:
# Scatter plot of every numeric column against price, after outlier removal.
# (Only the subplots figure is created; an extra plt.figure() would just
# render as an empty figure.)
fig, ax = plt.subplots(4, 2, figsize=(20, 20))
for variable, subplot in zip(linear_vars, ax.flatten()):
    sns.scatterplot(x=data_train[variable], y=data_train['price'], ax=subplot)
    for label in subplot.get_xticklabels():
        label.set_rotation(90)

In [28]:
# Box plot of every numeric column, after outlier removal.
# (Only the subplots figure is created; an extra plt.figure() would just
# render as an empty figure.)
fig, ax = plt.subplots(4, 2, figsize=(20, 20))
for variable, subplot in zip(linear_vars, ax.flatten()):
    sns.boxplot(x=data_train[variable], ax=subplot)
    for label in subplot.get_xticklabels():
        label.set_rotation(90)

In [29]:
data_train.hist(figsize = (20,15),bins = 50)
plt.show()

In [30]:
# Encode each categorical column as integer codes.
# The encoder is fitted on the training column only and then reused for the
# test column, so a given category always maps to the same code in both
# tables. (Fitting a second time on the test column can assign different
# codes whenever the two category sets differ.)
# transform() raises ValueError if the test set contains a category that was
# never seen in training, which is preferable to a silent mismatch.
for col in cat_attribs:
    label_encoder = LabelEncoder()
    data_train[col] = label_encoder.fit_transform(data_train[col])
    data_test[col] = label_encoder.transform(data_test[col])

In [31]:
data_train.head(20)

In [32]:
data_test.head()

# **Feature Selection & Data Split**

In [33]:
data_train.corr()['price'].sort_values(ascending = False)

In [34]:
corrmat= data_train.corr()
f, ax = plt.subplots(figsize=(12,12))
sns.heatmap(corrmat,cmap="Pastel2",annot=True)

In [35]:
# Collapse the three dimension columns into a single x*y*z product feature,
# then drop the originals from the training table.
data_train['volum']=data_train['y']* data_train['x']*data_train['z']
data_train = data_train.drop(['x','y','z'], axis=1)

In [36]:
# Same x*y*z product feature for the test table, so its columns match the
# training table.
data_test['volum']=data_test['y']* data_test['x']*data_test['z']
data_test = data_test.drop(['x','y','z'], axis=1)

In [37]:
corrmat= data_train.corr()
f, ax = plt.subplots(figsize=(12,12))
sns.heatmap(corrmat,cmap="Pastel2",annot=True)

In [38]:
data_train.head()

In [39]:
# Separate the target column from the predictors.
y = data_train["price"]
X = data_train.drop(columns=["price"])

In [40]:
# Per-dtype preprocessing steps that are plugged into the ColumnTransformer:
# numeric columns are standardised, categorical columns are one-hot encoded
# (categories unseen at fit time are ignored instead of raising).
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [41]:
# NOTE(review): self-assignment — this statement has no effect.
linear_vars=linear_vars

In [42]:
# Column lists fed to the ColumnTransformer: every int64/float64 column except
# the target, and every object-dtype column.
# NOTE(review): the categorical columns were already replaced by integer codes
# in the label-encoding cell, so categorical_features is presumably empty here
# and the one-hot branch never runs — confirm this is intended.
numeric_features = data_train.select_dtypes(include=['int64', 'float64']).drop(['price'], axis=1).columns
categorical_features = data_train.select_dtypes(include=['object']).columns


In [43]:
numeric_features

In [44]:
# Route each column group through its own transformer. Columns that appear in
# neither list are dropped (ColumnTransformer's default remainder behaviour).
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [45]:
#split the data for training and testing 
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.25, random_state=7)

# **Modeling**

In [46]:

# Build one pipeline (shared preprocessor + model) per candidate regressor.
# Each final step has its own name so that "<step>__<param>" keys used for
# hyper-parameter search address the intended estimator.

# Pipeline for the linear regression model
pipeline_lr = Pipeline([("preprocessor", preprocessor),
                        ("lr_reg", LinearRegression())])

# Pipeline for the decision tree regressor
pipeline_dt = Pipeline([("preprocessor", preprocessor),
                        ("dt_reg", DecisionTreeRegressor())])

# Pipeline for the random forest regressor
pipeline_rf = Pipeline([("preprocessor", preprocessor),
                        ("rf_reg", RandomForestRegressor())])

# Pipeline for the k-nearest-neighbours regressor
# (step renamed from the copy-pasted "rf_reg" to "kn_reg")
pipeline_kn = Pipeline([("preprocessor", preprocessor),
                        ("kn_reg", KNeighborsRegressor())])

# Pipeline for the support vector regressor
pipeline_svm = Pipeline([("preprocessor", preprocessor),
                         ("svm_reg", SVR())])




In [47]:
# List of all the pipelines
pipelines = [pipeline_lr, pipeline_dt, pipeline_rf, pipeline_kn, pipeline_svm]

# Dictionary of pipelines and model types for ease of reference
pipe_dict = {0: "LinearRegression", 1: "DecisionTree", 2: "RandomForest",3: "KNeighbors", 4: "Support Vector"}


In [48]:
# Fit  all  pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [49]:

# 10-fold cross-validation of every pipeline on the training split.
# The scorer is "neg_root_mean_squared_error", so the printed means are
# negated RMSE values: closer to zero is better.
cv_results_rms = []
for i, model in enumerate(pipelines):
    cv_score = cross_val_score(model, X_train,y_train,scoring="neg_root_mean_squared_error", cv=10)
    cv_results_rms.append(cv_score)
    # pipe_dict maps the pipeline's position in `pipelines` to a readable name.
    print("%s : %f " % (pipe_dict[i], cv_score.mean()))

In [50]:
# Hold-out evaluation of the fitted random-forest pipeline.
pred = pipeline_rf.predict(X_test)

r2 = metrics.r2_score(y_test, pred)
mse = metrics.mean_squared_error(y_test, pred)
n_samples = len(y_test)
n_features = X_test.shape[1]

print("R^2:", r2)
# Adjusted R^2 penalises R^2 for the number of predictors.
print("Adjusted R^2:", 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1))
print("MAE:", metrics.mean_absolute_error(y_test, pred))
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))

# **Fine-Tune Your Model**

In [51]:
from sklearn.model_selection import GridSearchCV


In [52]:

# Hyper-parameter grid for the "rf_reg" step of pipeline_rf.
param_grid = {
    'rf_reg__n_estimators': [200, 500],
    # 1.0 (use every feature) is what 'auto' meant for regressors; the 'auto'
    # string was removed in scikit-learn 1.3 and now fails parameter validation.
    'rf_reg__max_features': [1.0, 'sqrt', 'log2'],
    'rf_reg__max_depth': [4, 5, 6, 7, 8],
}

# Exhaustive search with 10-fold CV; scores are negated MSE (higher is better).
grid_search = GridSearchCV(
    pipeline_rf,
    param_grid,
    cv=10,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

In [53]:
data_train.head()

In [None]:
# Nested cross-validation: the GridSearchCV object itself is the estimator, so
# each of the 10 outer folds re-runs the complete grid search on its training
# part. NOTE(review): this multiplies the grid-search cost by ten — expect a
# long runtime.
search_score = cross_val_score(grid_search,
                               X_train,
                               y_train,
                               scoring="neg_mean_squared_error",
                               cv=10)
# Scores are negated MSE; flip the sign and take the root to report RMSE.
search_rmse_score=np.sqrt(-search_score)
print("Scores: ", search_rmse_score)
print("Mean: ", search_rmse_score.mean())
print("Standard Deviation: ", search_rmse_score.std())

# Analyze the Best Models and Their Errors

In [None]:
grid_search.best_params_

In [None]:
cvres = grid_search.cv_results_

for mean_score, params in zip(cvres["mean_test_score"],cvres["params"]):
  print(np.sqrt(-mean_score), params)

# Evaluate Your Model

In [None]:
data_test.head()

In [None]:
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(X_test)
final_mse = mean_squared_error(y_test,final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

# compute a 95% confidence interval for the generalization error 

In [None]:
from scipy import stats


In [None]:
# 95% t-interval for the mean squared error on the hold-out set, reported on
# the RMSE scale by taking the square root of both bounds.
confidance = 0.95

squared_errors = (final_predictions - y_test) ** 2
mse_interval = stats.t.interval(
    confidance,
    len(squared_errors) - 1,
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),
)
np.sqrt(mse_interval)

In [None]:
data_test.head()

In [None]:
data_train.head()

In [None]:
prediction=final_model.predict(data_test)

In [None]:
#predict the test data 
prediction

# **Prepare Submission File**

In [None]:
my_submission = pd.DataFrame({'Id': submit_data.Id, 'price': prediction})
# you could use any filename. We choose submission here
my_submission.to_csv('submission.csv', index=False)